In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, accuracy_score

from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import xgboost as xgb


Join both review and metadata datasets from zip file

In [2]:
reviewContent = pd.read_csv("ReviewContent", names = ['user_id', 'prod_id', 'date', 'review'], sep = '\t', parse_dates = ['date'])
metadata = pd.read_csv("metadata",names=["user_id", "prod_id", "rating", "label", "date"], sep = '\t', parse_dates = ['date'])



reviews = pd.merge(reviewContent, metadata, how = 'left', 
                    left_on = ['user_id', 'prod_id', 'date'], right_on = ['user_id', 'prod_id', 'date'])

In [3]:
reviews.head()

Unnamed: 0,user_id,prod_id,date,review,rating,label
0,5044,0,2014-11-16,"Drinks were bad, the hot chocolate was watered...",1.0,-1
1,5045,0,2014-09-08,This was the worst experience I've ever had a ...,1.0,-1
2,5046,0,2013-10-06,This is located on the site of the old Spruce ...,3.0,-1
3,5047,0,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...,5.0,-1
4,5048,0,2014-08-28,I love Toast! The food choices are fantastic -...,5.0,-1


In [4]:
reviews[reviews['user_id'] == 923]

Unnamed: 0,user_id,prod_id,date,review,rating,label


In [5]:
# Convert classification of fake reviews from '-1' to '0'

reviews['label'] = reviews['label'].apply(lambda x: 0 if x == -1 else x)

In [6]:
reviews['label'].value_counts()

1    528019
0     80439
Name: label, dtype: int64

## Model 1: Experiment with using only review text to predict rating (w/o feature engineering)

In [7]:
model1 = reviews[['review', 'label']]

In [8]:
model1.head()

Unnamed: 0,review,label
0,"Drinks were bad, the hot chocolate was watered...",0
1,This was the worst experience I've ever had a ...,0
2,This is located on the site of the old Spruce ...,0
3,I enjoyed coffee and breakfast twice at Toast ...,0
4,I love Toast! The food choices are fantastic -...,0


In [9]:
# Calculate baseline

baseline = model1['label'].mean()
baseline

0.8677985990816128

If any review is drawn by chance, there is a 86.8% probability that it is a real review. 

try no class imbalance first

**Model 1.1: Using SVM**

In [11]:
X = reviews['review']
y = reviews['label']

In [12]:
X.head()

0    Drinks were bad, the hot chocolate was watered...
1    This was the worst experience I've ever had a ...
2    This is located on the site of the old Spruce ...
3    I enjoyed coffee and breakfast twice at Toast ...
4    I love Toast! The food choices are fantastic -...
Name: review, dtype: object

In [None]:
# preprocessing of text data (ngram = 2)
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (2,2), min_df = 3, stop_words = 'english')
Xs = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs.shape)

In [47]:
# Test using svc_classifier

from sklearn.svm import SVC, LinearSVC

svc_classifier = SVC()
score = cross_val_score(svc_classifier, Xs, y, cv = 2, n_jobs = -1)

print(datetime.now() - t1)
print(score)

0:11:45.842323
[0.80437498 0.80851532]


In [12]:
# preprocessing of text data (ngram = 1)



t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 1), min_df = 3, stop_words = 'english')
Xs1 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs1.shape)

0:02:00.203980
(608458, 66410)


In [None]:
# Test using svc_classifier

from sklearn.svm import SVC, LinearSVC

svc_classifier = SVC()
score = cross_val_score(svc_classifier, Xs1, y, cv = 2, n_jobs = -1)

print(datetime.now() - t1)
print(score)

In [50]:
# preprocessing of text data (ngram = 1, 2)

t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 2), min_df = 3, stop_words = 'english')
Xs1 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs.shape)

0:02:56.721422
(608458, 1446832)


In [51]:
# Test using svc_classifier

svc_classifier = LinearSVC()
score = cross_val_score(svc_classifier, Xs1, y, cv = 2, n_jobs = -1)

print(datetime.now() - t1)
print(score)

0:06:14.809903
[0.80677777 0.81179576]


In [52]:
# preprocessing of text data (ngram = 1, 2, remove stop_words)

t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 2), min_df = 3)
Xs1 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs1.shape)

0:04:00.054039
(608458, 1446832)


In [54]:
# Test using svc_classifier

svc_classifier = LinearSVC()
score = cross_val_score(svc_classifier, Xs1, y, cv = 2, n_jobs = -1)

print(Xs1.shape)
print(datetime.now() - t1)
print(score)

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

In [42]:
cvec.get_feature_names()

['00 00',
 '00 10',
 '00 11',
 '00 12',
 '00 15',
 '00 20',
 '00 24',
 '00 30',
 '00 30pm',
 '00 50',
 '00 99',
 '00 about',
 '00 add',
 '00 added',
 '00 after',
 '00 again',
 '00 all',
 '00 almost',
 '00 also',
 '00 although',
 '00 am',
 '00 an',
 '00 and',
 '00 another',
 '00 anyway',
 '00 appetizer',
 '00 appetizers',
 '00 are',
 '00 as',
 '00 asked',
 '00 at',
 '00 awesome',
 '00 back',
 '00 be',
 '00 because',
 '00 beer',
 '00 beers',
 '00 before',
 '00 believe',
 '00 best',
 '00 between',
 '00 bill',
 '00 both',
 '00 bottle',
 '00 bottles',
 '00 bowl',
 '00 brunch',
 '00 bucks',
 '00 burger',
 '00 but',
 '00 by',
 '00 calamari',
 '00 came',
 '00 can',
 '00 card',
 '00 change',
 '00 charge',
 '00 cheap',
 '00 cheaper',
 '00 check',
 '00 chicken',
 '00 chose',
 '00 clock',
 '00 cocktails',
 '00 comes',
 '00 corkage',
 '00 could',
 '00 couple',
 '00 coupon',
 '00 crazy',
 '00 credit',
 '00 cup',
 '00 deal',
 '00 decided',
 '00 definitely',
 '00 delivery',
 '00 depending',
 '00 did',

**Model 1.2: Using Logistic Regression**

In [55]:
# preprocessing of text data

t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 1), min_df = 3, stop_words = 'english')
Xs1 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs1.shape)

0:01:24.494791
(608458, 66410)


In [57]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(n_jobs = -1)
score = cross_val_score(logreg, Xs1, y, cv = 2, n_jobs = -1)

print(datetime.now() - t1)
print(score)

0:06:49.827909
[0.85830457 0.85970391]


**Model 1.3: Using RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 200, random_state = 35)
score = cross_val_score(rfc, Xs1, y, cv = 2, n_jobs = -1)

print(datetime.now() - t1)
print(score)

**Model 1.4: Using xgboost**

In [59]:

xgb_classifier = xgb.XGBClassifier()
score = cross_val_score(xgb_classifier, Xs1, y, cv = 2, n_jobs = -1)
print('Cross Validation Score: {:0.3} ± {:0.3}'.format(score.mean().round(3), score.std().round(3)))

print(datetime.now() - t1)
print(score)

0:14:07.851004
[0.86780068 0.86781296]


In [None]:
# split data into training and testing datasets

Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, y, test_size = 0.3)

In [None]:
xgb_classifier.fit(Xs_train, ys_train)
ys_pred = xgb_classifier.predict(Xs_test)

print('xgb acc_test score: ' accuracy_score(ys_test, ys_pred))

In [None]:
# Plot confusion matrix

confusion_mat = pd.crosstab(ys_test, ys_pred,
                            rownames=['Actual'],
                            colnames=['Predicted'],margins=True)
confusion_mat

In [None]:
# Print classification report

print(classification_report(ys_test, ys_pred))

In [None]:
# Plotting ROC curve for high salary (label 1)

fpr, tpr, thresholds = roc_curve(y_test22, y_pred22)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=[8,8])
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('ROC for high salary', fontsize=18)
plt.legend(loc="lower right")
plt.show()

**Model 1.5: Using CNN**

## Try balancing dataset

Write a function to run pipeline of using a variety of sampling techniques

In [18]:
# preprocessing of text data

t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 1), min_df = 3, stop_words = 'english')
Xs1 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs1.shape)

0:02:35.543079
(608458, 66410)


In [None]:
t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 1), min_df = 3)
Xs2 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs2.shape)

In [None]:
t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 2), min_df = 3)
Xs3 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs3.shape)

In [None]:
def run_pipeline(X, y):

    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE
    from sklearn.ensemble import RandomForestClassifier
    
    def RandomForest(X,y):
        rfc = RandomForestClassifier(n_estimators = 200, random_state = 35)
        print(cross_val_score(rfc, X, y, cv = 2, n_jobs = -1))
        
    def xgb(X,y):
        xgb_classifier = xgb.XGBClassifier()
        print(cross_val_score(xgb_classifier, X, y, cv = 2, n_jobs = -1))
    
    def LogisticRegression(X,y):
        logreg = LogisticRegression(n_jobs = -1)
        print(cross_val_score(logreg, X, y, cv = 2, n_jobs = -1))
    
    # Undersampling and oversampling to solve for class imbalance
    model_undersample = RandomUnderSampler(random_state=42)
    X_undersample, y_undersample = model_undersample.fit_resample(Xs1, y)
    print('Undersampling:', X_undersample.shape, y_undersample.shape)
    RandomForest(X_undersample, y_undersample)
    xgb(X_undersample, y_undersample)
    LogisticRegression(X_undersample, y_undersample)
    print('\n')

    model_oversample = RandomOverSampler(random_state=42)
    X_oversample, y_oversample = model_oversample.fit_resample(Xs1, y)
    print('Oversampling:', X_oversample.shape, y_oversample.shape)
    RandomForest(X_oversample, y_oversample)
    xgb(X_oversample, y_oversample)
    LogisticRegression(X_oversample, y_oversample)
    print('\n')    

    model_ADASYN = ADASYN(random_state=42)
    X_ADASYN, y_ADASYN = model_ADASYN.fit_resample(Xs1,y)
    print('Oversampling ADASYN:', X_ADASYN.shape, y_ADASYN.shape)
    RandomForest(X_ADASYN, y_ADASYN)
    xgb(X_ADASYN, y_ADASYN)
    LogisticRegression(X_ADASYN, y_ADASYN)
    print('\n')    

    model_SMOTE = SMOTE(random_state=42)
    X_smote, y_smote = model_SMOTE.fit_resample(Xs1,y)
    print('Oversampling SMOTE:', X_smote.shape, y_smote.shape)
    RandomForest(X_smote, y_smote)
    xgb(X_smote, y_smote)
    LogisticRegression(X_smote, y_smote)
    print('\n')

In [None]:
run_pipeline(Xs2, y)

In [None]:
xgb_classifier = xgb.XGBClassifier()
score = cross_val_score(xgb_classifier, X_undersample, y_undersample, cv = 2, n_jobs = -1)


print(datetime.now() - t1)
print('Cross Validation Score: {:0.3} ± {:0.3}'.format(score.mean().round(3), score.std().round(3)))

## In case if want to use restaurant or user id mapping

In [7]:
# # Link reviews with restaurant name

# a = pd.merge(reviews, productIdMapping, how = 'left', left_on = ['prod_id'], right_on = ['prod_id'])

In [4]:
productIdMapping = pd.read_csv("productIdMapping", sep = '\t', names = ['Restaurant', 'prod_id'])

In [5]:
productIdMapping.shape

(5044, 2)

In [6]:
productIdMapping

Unnamed: 0,Restaurant,prod_id
0,Toast,0
1,Big Apple Lounge & Restaurant,1
2,La Carreta,2
3,Just Subs 6,2780
4,Formosa Asian Fusion Restaurant,4
5,Dona Mercedes,5
6,Subworks Pizza & Subs,6
7,Nai Tapas Bar,7
8,Selva Verde,8
9,Geno’s Steaks,9


In [31]:
userIdMapping = pd.read_csv("userIdMapping", sep = '\t', names = ['person_id', 'user_id'])

In [32]:
userIdMapping

Unnamed: 0,person_id,user_id
0,L4ANPtyHW1eQeXxq3Tkm1w,154347
1,FzbqZaNONgR-b5iHpoGp8Q,185064
2,F1n8h7MtVHPxgY4U1iQ-bQ,205085
3,sd00TiP5ENkm6rKYPn3p3A,38645
4,9-QHBM5i9--nUYrrj9DXXA,133524
5,FBjIChlJ437HqVe134bCVw,41313
6,2fVXWuQ7jtr8STZ6UWhPYQ,136755
7,m6i7NOocr7YlFg1CXhRV6Q,221370
8,aGWsrc0EUIz_DCGodUT-pg,100884
9,SumMCQ8oBNhZCp_X46EnWA,66978


In [34]:
reviewGraph = pd.read_csv("reviewGraph",names=["user_id","name","rating1"], sep = '\t')

In [35]:
reviewGraph

Unnamed: 0,user_id,name,rating1
0,5044,0,1.0
1,5045,0,1.0
2,5046,0,3.0
3,5047,0,5.0
4,5048,0,5.0
5,5049,0,5.0
6,5050,0,5.0
7,5051,0,1.0
8,5052,0,2.0
9,5053,0,4.0


In [37]:
reviewGraph.shape

(608598, 3)

In [38]:
reviewGraph['user_id'].value_counts()

8367      197
9501      185
7871      178
8225      159
10934     155
8351      142
7781      142
7777      136
8467      135
8011      133
8337      132
9105      132
19044     131
18780     127
13937     126
12874     123
14510     121
25871     118
14255     116
5095      112
5558      111
7688      110
9679      109
17116     109
11146     108
5620      107
8748      106
21344     106
19166     105
9755      105
         ... 
29999       1
21803       1
19752       1
55792       1
11556       1
220490      1
224588      1
136547      1
247129      1
138594      1
132449      1
134496      1
259423      1
261470      1
255325      1
257372      1
251227      1
253274      1
243031      1
222541      1
245078      1
238933      1
240980      1
234835      1
236882      1
230737      1
232784      1
226639      1
228686      1
6147        1
Name: user_id, Length: 260277, dtype: int64

In [39]:
reviewGraph['name'].value_counts()

3745    7378
4698    6632
3237    4716
3136    3938
1881    3143
3875    3122
2605    2999
3876    2959
828     2943
4223    2858
56      2696
127     2677
1941    2560
3318    2536
9       2497
4083    2292
1814    2183
2127    2165
1859    2158
496     2075
3215    2072
1401    1973
1341    1958
1597    1911
2223    1870
3618    1834
1298    1780
3582    1689
4034    1680
2265    1675
        ... 
2750       1
1414       1
1057       1
3243       1
1783       1
3428       1
2806       1
1707       1
392        1
1640       1
2595       1
3831       1
2536       1
1167       1
3726       1
683        1
3377       1
4641       1
2665       1
2295       1
3431       1
1168       1
3998       1
871        1
245        1
2292       1
4485       1
3942       1
2921       1
2943       1
Name: name, Length: 5044, dtype: int64