# SKLearn Predict Analysis

This notebook loads our logistic regression (LR) model, trained on roughly 500k samples, so that we can feed individual review texts into it and compare its predictions with those of the LSTM.




In [111]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


import pandas as pd
import numpy as np

from nltk import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import classification_report, confusion_matrix
import util.text_util as tu


# Path of the serialized LR model (joblib format).
model_filename = '../models/review_body-tfidf-df_none-ngram13-497835-4000-nolda-sampling_none-LRB-star_rating.jbl'
# NOTE(review): not referenced in the visible cells; the name is also misspelled
# ("prerocessed") - confirm nothing else depends on it before renaming.
data_file_prerocessed = '../dataset/feature_files/review_body-tfidf-df_none-ngram13-497835-4000-nolda.csv'
# CSV of reviews that is read into the `data` DataFrame.
data_file = '../dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-2m-preprocessed.csv'


# Column holding the target label and column holding the review text.
LABEL_COLUMN = 'star_rating'
FEATURE_COLUMN = 'review_body'
# NOTE(review): DROP_COLUMNS and RSTATE are not referenced in the visible cells.
DROP_COLUMNS = ['helpful_votes', 'total_votes', 'helpful_product']
RSTATE = 1
# Upper bound on the vocabulary size passed to the TF-IDF vectorizer.
MAX_FEATURES = 4000


# Words passed to `tu.remove_stop_words_from_list` below.
# NOTE(review): 'does' is listed twice; presumably one entry was meant to be a
# different word - confirm against the list used when the model was trained.
STOP_WORDS_TO_REMOVE=[
    'no',
    'not',
    'do',
    'don',
    "don't",
    'does',
    'did',
    'does',
    'doesn',
    "doesn't",
    'should',
    'very',
    'will'
    ]


# remove these from NLTK stop words
tu.remove_stop_words_from_list(STOP_WORDS_TO_REMOVE)


In [35]:
# Read the review CSV at `data_file` into a DataFrame.
data = pd.read_csv(data_file)

Looks like we have some reviews with 0 length

In [37]:
# Word count per review. A body that is not a string (e.g. the float NaN that
# pandas produces for an empty cell) counts as 0 words instead of raising.
data["reviews_wc"] = data[FEATURE_COLUMN].apply(
    lambda body: len(body.split()) if isinstance(body, str) else 0
)
# Drop rows whose review text is missing. Uses FEATURE_COLUMN (rather than a
# hard-coded column name) so this stays in sync with the line above.
data = data.dropna(subset=[FEATURE_COLUMN], axis=0)

data.describe()

Unnamed: 0.1,Unnamed: 0,star_rating,helpful_votes,total_votes,reviews_wc
count,2000000.0,2000000.0,2000000.0,2000000.0,2000000.0
mean,4479400.0,3.891887,0.8867985,1.153611,25.91794
std,2588526.0,1.463923,11.2533,12.03111,44.27699
min,9.0,1.0,0.0,0.0,1.0
25%,2235681.0,3.0,0.0,0.0,8.0
50%,4478210.0,5.0,0.0,0.0,15.0
75%,6723105.0,5.0,0.0,1.0,28.0
max,8960795.0,5.0,3914.0,4022.0,4686.0


In [39]:
# Split the frame into the target column and the review-text column.
labels = data[LABEL_COLUMN]
reviews = data[FEATURE_COLUMN]

# Re-create Vectorizer

In [41]:
# Re-create the TF-IDF vectorizer (1- to 3-grams, at most MAX_FEATURES terms).
# NOTE(review): the vectorizer is fit on every row loaded from `data_file`, while
# the model file name suggests the model was trained on 497835 samples. If the
# fitted vocabulary / IDF weights differ from the ones used at training time, the
# columns will not line up with `model.coef_` - confirm, or load the vectorizer
# that was persisted with the model.
tv = TfidfVectorizer(min_df=1,
                     max_df=1.,
                     ngram_range=(1,3),
                     max_features=MAX_FEATURES,
                     use_idf=True
                     )
tv = tv.fit(reviews.array)
# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older releases. Kept as a list either way.
if hasattr(tv, "get_feature_names_out"):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()



# Load the Model

In [44]:
# joblib opens the path itself; the previous `open(...)` handle was never used.
# NOTE: joblib files are pickle-based - only load model files from a trusted source.
model = joblib.load(model_filename)



In [141]:
# Positional indices 0-4, one per result slot.
(RESULT1_INDEX,
 RESULT2_INDEX,
 RESULT3_INDEX,
 RESULT4_INDEX,
 RESULT5_INDEX) = range(5)

# Maps a zero-based position (0-4) to the value one greater (1-5); used to
# relabel columns when the coefficient tables are printed.
column_mapper = {position: position + 1 for position in range(5)}

def normalize_text(text):
    """
    Apply the project's text clean-up to a single review.

    Stop words are removed first (``tu.remove_stop_words``), and the result is
    then lemmatized (``tu.lemmatize_text``).

    :param text: review text to normalize
    :return: the value returned by ``tu.lemmatize_text``
    """
    without_stop_words = tu.remove_stop_words(text)
    return tu.lemmatize_text(without_stop_words)

def predict(model, reviews: list):
    """
    Predict a label for each review and break the decision down per feature.

    Each review is normalized with ``normalize_text``, vectorized with the
    module-level ``tv`` vectorizer, and scored with ``model``. For every review a
    table is built with one row per row of ``model.coef_`` and one column per
    non-zero feature, holding ``coefficient * feature value``, plus the intercept
    and their row-wise sum (``final_value``). The breakdown is also printed.

    :param model: fitted estimator exposing ``coef_``, ``intercept_`` and ``predict``
    :param reviews: list of text reviews
    :return: tuple ``(reviews_df, predictions, coef_pd_list)`` - the DataFrame of
             original and normalized texts, the output of ``model.predict``, and
             the list of per-review contribution DataFrames
    """
    # put reviews in pd then remove stopwords and lemmatize so it's identical to LR features
    reviews_df = pd.DataFrame(reviews).rename({0: "review_body_orig"}, axis=1)
    reviews_df["review_body"] = reviews_df["review_body_orig"].apply(normalize_text)

    # coef_df has one row per row of model.coef_ and one column per vocabulary term
    coef_df = pd.DataFrame(model.coef_, columns=vocab)
    # one intercept per row of coef_df
    intercept_df = pd.DataFrame(model.intercept_).rename({0: "intercept"}, axis=1)

    tv_matrix = tv.transform(reviews_df["review_body"].array)
    # NOTE(review): feature values are rounded to 2 decimals before they are fed
    # to the model - confirm the training features were rounded the same way.
    features = pd.DataFrame(np.round(tv_matrix.toarray(), 2), columns=vocab)

    coef_pd_list = []
    for _, row in features.iterrows():
        # multiply every coefficient by this review's value for the same feature;
        # `row` is indexed by vocabulary term, so axis=1 aligns it with the columns
        # (replaces a per-column loop over Series.iteritems, removed in pandas 2.0)
        row_coef = coef_df.mul(row, axis=1)
        # keep only features that contribute to this review
        row_coef = row_coef.loc[:, (row_coef != 0).any(axis=0)]
        row_coef = row_coef.join(intercept_df)
        row_coef["final_value"] = row_coef.sum(axis=1)

        coef_pd_list.append(row_coef)

    predictions = model.predict(features)

    for i in range(len(predictions)):
        print(f'\n\n\nReview (Orig): {reviews_df.loc[i, "review_body_orig"]}')
        print(f'\nReview (Normalized): {reviews_df.loc[i, "review_body"]}')
        print(f'Coefficients:\n{coef_pd_list[i].T.rename(column_mapper, axis=1)}')
        print(f'\nPrediction: {predictions[i]}\n')

    return reviews_df, predictions, coef_pd_list

# test our function

# test_reviews = ['test review']
# test_reviews_pd = pd.DataFrame(test_reviews).rename({0: "reviews"}, axis=1)
# reviews_df, predictions, coef_pd_list = predict(model, test_reviews_pd["reviews"].array)


# LSTM 4-Star Review Misclassified as 5 (1)

Index: 851267

Our Prediction is: 2 (we are way off)

In [142]:
# Review to explain. The commented-out entries are fragments of the same review
# that can be enabled to see how the LR model scores each piece on its own.
review_temp = ['have to say this thing is what i thought it would be not the best internet but on my it gets the job done',
#                'have to say this thing is what i thought it would be',
#                'gets the job done',
#                'not the best',
#                'not the best internet',
#                'not the best but gets the job done',
#                'not the best internet but on my gets the job done',
#                'on my'
               ]

# test_reviews_pd = pd.DataFrame(test_reviews).rename({0: "reviews"}, axis=1)
reviews_df, predictions, test_feature_coefs = predict(model, review_temp)






Review (Orig): have to say this thing is what i thought it would be not the best internet but on my it gets the job done

Review (Normalized): say thing thought would not best internet get job done
Coefficients:
                      1         2         3         4         5
best          -0.893184 -0.379014 -0.202679  0.446039  1.028839
done          -0.034628  0.057986 -0.024925  0.004232 -0.002665
get            0.023715  0.051308 -0.092286  0.033845 -0.016583
get job       -0.196320 -0.173084 -0.042801  0.159253  0.252953
get job done  -0.023283  0.196947 -0.149545 -0.085151  0.061033
internet      -0.072925 -0.057645 -0.171028  0.011826  0.289772
job           -0.197718  0.048576 -0.044752  0.066619  0.127275
job done       1.701417  0.419250 -0.677010 -0.736610 -0.707047
not           -0.174793  0.196185  0.244725  0.043348 -0.309465
not best       0.607569  0.373076  0.189211 -0.356886 -0.812970
say           -0.031799  0.001088 -0.060553  0.032697  0.058567
thing          0.

# LSTM 4-Star Review Misclassified as 5 (2)

Index: 338372

**Our Prediction is: 4 (it is correct but 5 is pretty close)**

Interesting: "annoy" is not in our LR feature list.

In [143]:
# Review to explain. The commented-out entries are fragments of the same review
# that can be enabled to see how the LR model scores each piece on its own.
review_temp = ['i like this screen protector as the installation was easy enough there is only one bubble on it which annoys me but it looks good nonetheless',
#                'i like this screen protector as the installation was easy enough',
#                'there is only one bubble on it which annoys me',
#                'it looks good',
#                'but it looks good',
#                'i like this screen protector as the installation was easy enough there is only one bubble on it which annoys me',
#                'there is only one bubble on it which annoys me but it looks good',
#                'but',
#                'like',
#                'easy'
]
reviews_df, predictions, test_feature_coefs = predict(model, review_temp)





Review (Orig): i like this screen protector as the installation was easy enough there is only one bubble on it which annoys me but it looks good nonetheless

Review (Normalized): like screen protector installation easy enough one bubble annoys look good nonetheless
Coefficients:
                         1         2         3         4         5
bubble           -0.139927 -0.095662  0.073783  0.090287  0.071519
easy             -0.279118 -0.268486 -0.058702  0.295980  0.310325
enough            0.063446 -0.157488 -0.004105  0.014745  0.083402
good             -0.160044 -0.293983  0.007104  0.316910  0.130013
installation     -0.037506 -0.078353 -0.144626  0.123617  0.136867
like             -0.095778  0.060019 -0.013265  0.002954  0.046070
like screen       0.090610 -0.013407 -0.111146  0.024240  0.009703
look             -0.091722 -0.047828  0.031304  0.078853  0.029393
look good        -0.195883  0.123522 -0.014323  0.036646  0.050038
one              -0.014068 -0.073610  0.003853 

# 5-Star Misclassified as 4-Star

Index: 329502

**LR Prediction: 5 (correct)**

In [144]:
# Review to explain. The commented-out entries are fragments of the same review
# that can be enabled to see how the LR model scores each piece on its own.
review_temp = ['must have for techies all the basic stuff for phone tablet and laptop disassembly the drivers could be better made but no top quality manufacturer is going to challenge apples proprietary pentalobe it is a good quality kit for a great price hence my 5 star rating',
#                'must have for techies all the basic stuff for phone tablet and laptop disassembly',
#                'the drivers could be better made',
#                'but no top quality manufacturer is going to challenge apples proprietary pentalobe',
#                'no top quality manufacturer is going to challenge apples proprietary pentalobe',
#                'it is a good quality kit for a great price hence my 5 star rating',
#                'no',
#                'but',
#                'but no',
#                'could be better'
]
reviews_df, predictions, test_feature_coefs = predict(model, review_temp)





Review (Orig): must have for techies all the basic stuff for phone tablet and laptop disassembly the drivers could be better made but no top quality manufacturer is going to challenge apples proprietary pentalobe it is a good quality kit for a great price hence my 5 star rating

Review (Normalized): must techie basic stuff phone tablet laptop disassembly driver could better made no top quality manufacturer going challenge apple proprietary pentalobe good quality kit great price hence 5 star rating
Coefficients:
                     1         2         3         4         5
apple        -0.014297  0.081611  0.065976 -0.003165 -0.130124
basic        -0.197469 -0.129756  0.294806  0.182655 -0.150236
better       -0.232552  0.079765  0.084512  0.033223  0.035051
could         0.072086  0.031066  0.070350 -0.008191 -0.165311
could better -0.195091 -0.178883 -0.224488  0.008541  0.589920
driver       -0.055561 -0.073981  0.005183  0.018979  0.105380
going         0.014359 -0.082250 -0.077

# 2-Star Misclassified as 1-Star

LR Prediction: 4 (incorrect)


In [145]:
# Review to explain. The commented-out entries are fragments of the same review
# that can be enabled to see how the LR model scores each piece on its own.
review_temp = ['nice but the only problem i have is that the belt hoops are too small for most of my belts',
#                'nice',
#                'the only problem i have is that the belt hoops are too small for most of my belts',
#                'the belt hoops are too small',
#                'but the only problem i have is that the belt hoops are too small for most of my belts',
#                'but',
#                'problem',
#                'the only problem',
#                'too small'
              ]
reviews_df, predictions, test_feature_coefs = predict(model, review_temp)





Review (Orig): nice but the only problem i have is that the belt hoops are too small for most of my belts

Review (Normalized): nice problem belt hoop small belt
Coefficients:
                    1         2         3         4         5
belt        -0.303695  0.136244 -0.041256  0.333370 -0.124663
nice        -0.074445 -0.017849 -0.137767  0.063525  0.166537
problem      0.066083  0.203854 -0.026924 -0.052574 -0.190439
small       -0.279121 -0.257232  0.000310  0.153702  0.382342
intercept    0.123724  0.080674  0.197804 -0.189372 -0.212831
final_value -0.467454  0.145692 -0.007833  0.308650  0.020945

Prediction: 4

