In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, accuracy_score

from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression


Join both review and metadata datasets from zip file

In [2]:
reviewContent = pd.read_csv("ReviewContent", names = ['user_id', 'prod_id', 'date', 'review'], sep = '\t', parse_dates = ['date'])
metadata = pd.read_csv("metadata",names=["user_id", "prod_id", "rating", "label", "date"], sep = '\t', parse_dates = ['date'])



reviews = pd.merge(reviewContent, metadata, how = 'left', 
                    left_on = ['user_id', 'prod_id', 'date'], right_on = ['user_id', 'prod_id', 'date'])

In [3]:
reviews.head()

Unnamed: 0,user_id,prod_id,date,review,rating,label
0,5044,0,2014-11-16,"Drinks were bad, the hot chocolate was watered...",1.0,-1
1,5045,0,2014-09-08,This was the worst experience I've ever had a ...,1.0,-1
2,5046,0,2013-10-06,This is located on the site of the old Spruce ...,3.0,-1
3,5047,0,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...,5.0,-1
4,5048,0,2014-08-28,I love Toast! The food choices are fantastic -...,5.0,-1


In [4]:
# Convert classification of fake reviews from '-1' to '0'

reviews['label'] = reviews['label'].apply(lambda x: 0 if x == -1 else x)

In [5]:
reviews['label'].value_counts()

1    528019
0     80439
Name: label, dtype: int64

In [6]:
1- np.mean(reviews['label'])

0.13220140091838717

Given the significant class imbalance where 13.2% of all reviews are fake (based on Yelp's proprietary algorithm), we will look to address this class imbalance to improve our predictions. 

## Part 1: EDA to better understand what might constitute fake reviews

In [8]:
fake_reviews = reviews[reviews['label'] == 0]
fake_reviews.shape

(80439, 6)

## Part 2: Experiment with using only review text to predict rating (w/o feature engineering)

In [12]:
model1 = reviews[['review', 'label']]

In [13]:
model1.head()

Unnamed: 0,review,label
0,"Drinks were bad, the hot chocolate was watered...",0
1,This was the worst experience I've ever had a ...,0
2,This is located on the site of the old Spruce ...,0
3,I enjoyed coffee and breakfast twice at Toast ...,0
4,I love Toast! The food choices are fantastic -...,0


In [14]:
X = reviews['review']
y = reviews['label']

We will first use cross_validation to do a high level evaluation of the effectiveness of various model types and sampling methods using the function below. 

***Modelling techniques***
- 1) RandomForestClassifier
- 2) Xgboost
- 3) LogisticRegression

***Sampling techniques***
- 1) Oversampling
- 2) ADASYN
- 3) SMOTE

*Baseline would be 50% now since we are resampling the dataset

In [10]:
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE

def run_pipeline(X, y):
    t1 = datetime.now()

    
    def RandomForest(X,y):
        rfc = RandomForestClassifier(n_estimators = 10, random_state = 35)
        print(cross_val_score(rfc, X, y, cv = 2, n_jobs = -1))
        
    def xgbooster(X,y):
        xgb_classifier = xgb.XGBClassifier()
        print(cross_val_score(xgb_classifier, X, y, cv = 2, n_jobs = -1))
    
    def LogReg(X,y):
        logreg = LogisticRegression()
        print(cross_val_score(logreg, X, y, cv = 2, n_jobs = -1))
    
    # Various oversampling techniques to solve for class imbalance

    model_oversample = RandomOverSampler(random_state=42)
    X_oversample, y_oversample = model_oversample.fit_resample(X, y)
    print('Oversampling:', X_oversample.shape, y_oversample.shape)
    RandomForest(X_oversample, y_oversample)
    print(datetime.now() - t1)

    xgbooster(X_oversample, y_oversample)
    print(datetime.now() - t1)

    LogReg(X_oversample, y_oversample)
    print(datetime.now() - t1)

    print('\n')    

    model_ADASYN = ADASYN(random_state=42)
    X_ADASYN, y_ADASYN = model_ADASYN.fit_resample(X,y)
    print('Oversampling ADASYN:', X_ADASYN.shape, y_ADASYN.shape)
    RandomForest(X_ADASYN, y_ADASYN)
    print(datetime.now() - t1)

    xgbooster(X_ADASYN, y_ADASYN)
    print(datetime.now() - t1)

    LogReg(X_ADASYN, y_ADASYN)
    print(datetime.now() - t1)

    print('\n')    

    model_SMOTE = SMOTE(random_state=42)
    X_smote, y_smote = model_SMOTE.fit_resample(X,y)
    print('Oversampling SMOTE:', X_smote.shape, y_smote.shape)
    RandomForest(X_smote, y_smote)
    print(datetime.now() - t1)

    xgbooster(X_smote, y_smote)
    print(datetime.now() - t1)

    LogReg(X_smote, y_smote)
    print(datetime.now() - t1)

    print('\n')

We will run the above pipeline across various NLP preprocessing techniques (TfidVectorizer and CountVectorizer) and adjusting various parameters to see which gives the best score. 

**Config 1.1 - Preprocessing of text data by INCLUDING stopwords**

In [15]:
t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 1), min_df = 3, stop_words = 'english')
Xs1 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs1.shape)

0:01:06.143387
(608458, 66410)


In [None]:
run_pipeline(Xs1, y)

Oversampling: (1056038, 66410) (1056038,)


In [None]:
# adjusted model since SMOTE and ADASYN are performing much better than oversampling

**Config 1.2 - Preprocessing of text data by EXCLUDING stopwords**

In [14]:
# preprocessing of text data by EXCLUDING stopwords

t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 1), min_df = 3)
Xs2 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs2.shape)

0:02:18.144109
(608458, 66718)


In [None]:
run_pipeline(Xs2, y)

**Config 1.3 - Preprocessing of text data by EXCLUDING stopwords and with a wider range of ngrams**

In [None]:
t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (2, 3), min_df = 3)
Xs3 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs3.shape)

In [None]:
run_pipeline(Xs3, y)

**Config 1.4 - Preprocessing of text data using TfidVectorizer by EXCLUDING stopwords and with a wider range of ngrams**

In [None]:
t1 = datetime.now()
tvec = TfidVectorizer(ngram_range = (2, 3), min_df = 3)
Xs4 = tvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs4.shape)

In [None]:
run_pipeline(Xs4, y)

**Config 1.5 - Preprocessing of text data by EXCLUDING stopwords with ngrams = 1 and lemmatizer**

In [None]:
t1 = datetime.now()
cvec = CountVectorizer(ngram_range = (1, 1), min_df = 3)
Xs5 = cvec.fit_transform(X)

print(datetime.now() - t1)
print(Xs5.shape)

In [None]:
run_pipeline(Xs5, y)

## Part 3: Experiment with an alternative model incorporating some feature engineering from part 1 (but not using the text itself) to see if the model performance can be improved

## Part 4: Optimising model and parameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 200, random_state = 35)
score = cross_val_score(rfc, Xs1, y, cv = 2, n_jobs = -1)

print(datetime.now() - t1)
print(score)

**Model 1.4: Using xgboost**

In [16]:

xgb_classifier = xgb.XGBClassifier()
score = cross_val_score(xgb_classifier, Xs1, y, cv = 2, n_jobs = -1)
print('Cross Validation Score: {:0.3} ± {:0.3}'.format(score.mean().round(3), score.std().round(3)))

print(datetime.now() - t1)
print(score)

Cross Validation Score: 0.868 ± 0.0
7:53:50.398131
[0.86780068 0.86781296]


In [None]:
# split data into training and testing datasets

Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, y, test_size = 0.3)

In [None]:
xgb_classifier.fit(Xs_train, ys_train)
ys_pred = xgb_classifier.predict(Xs_test)

print('xgb acc_test score: ' accuracy_score(ys_test, ys_pred))

In [None]:
# Plot confusion matrix

confusion_mat = pd.crosstab(ys_test, ys_pred,
                            rownames=['Actual'],
                            colnames=['Predicted'],margins=True)
confusion_mat

In [None]:
# Print classification report

print(classification_report(ys_test, ys_pred))

In [None]:
# Plotting ROC curve for high salary (label 1)

fpr, tpr, thresholds = roc_curve(y_test22, y_pred22)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=[8,8])
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('ROC for high salary', fontsize=18)
plt.legend(loc="lower right")
plt.show()

## Try balancing dataset

Write a function to run pipeline of using a variety of sampling techniques

In [13]:
run_pipeline(Xs1, y)

Oversampling ADASYN: (1062620, 66410) (1062620,)
1:33:02.560207
[0.77781563 0.82939683]
1:40:21.711382
[0.82721796 0.81186466]
1:46:56.507636


Oversampling SMOTE: (1056038, 66410) (1056038,)
2:08:56.176967
[0.77878679 0.83081448]
2:16:06.131034
[0.82955002 0.80796867]
2:23:54.631182




In [None]:
run_pipeline(Xs2, y)

Undersampling: (160878, 66718) (160878,)
[0.59576082 0.59602427]
0:02:48.640526
[0.64521382 0.64305428]
0:05:05.102977
[0.62255097 0.6194709 ]
0:07:25.260128


Oversampling: (1056038, 66718) (1056038,)
[0.90533503 0.91234011]
0:42:48.615760
[0.64311579 0.64750255]
0:57:07.186804
[0.66410742 0.66176532]
1:53:08.313904




In [None]:
xgb_classifier = xgb.XGBClassifier()
score = cross_val_score(xgb_classifier, X_undersample, y_undersample, cv = 2, n_jobs = -1)


print(datetime.now() - t1)
print('Cross Validation Score: {:0.3} ± {:0.3}'.format(score.mean().round(3), score.std().round(3)))

## In case if want to use restaurant or user id mapping

In [7]:
# # Link reviews with restaurant name

# a = pd.merge(reviews, productIdMapping, how = 'left', left_on = ['prod_id'], right_on = ['prod_id'])

In [4]:
productIdMapping = pd.read_csv("productIdMapping", sep = '\t', names = ['Restaurant', 'prod_id'])

In [5]:
productIdMapping.shape

(5044, 2)

In [6]:
productIdMapping

Unnamed: 0,Restaurant,prod_id
0,Toast,0
1,Big Apple Lounge & Restaurant,1
2,La Carreta,2
3,Just Subs 6,2780
4,Formosa Asian Fusion Restaurant,4
5,Dona Mercedes,5
6,Subworks Pizza & Subs,6
7,Nai Tapas Bar,7
8,Selva Verde,8
9,Geno’s Steaks,9


In [31]:
userIdMapping = pd.read_csv("userIdMapping", sep = '\t', names = ['person_id', 'user_id'])

In [32]:
userIdMapping

Unnamed: 0,person_id,user_id
0,L4ANPtyHW1eQeXxq3Tkm1w,154347
1,FzbqZaNONgR-b5iHpoGp8Q,185064
2,F1n8h7MtVHPxgY4U1iQ-bQ,205085
3,sd00TiP5ENkm6rKYPn3p3A,38645
4,9-QHBM5i9--nUYrrj9DXXA,133524
5,FBjIChlJ437HqVe134bCVw,41313
6,2fVXWuQ7jtr8STZ6UWhPYQ,136755
7,m6i7NOocr7YlFg1CXhRV6Q,221370
8,aGWsrc0EUIz_DCGodUT-pg,100884
9,SumMCQ8oBNhZCp_X46EnWA,66978


In [34]:
reviewGraph = pd.read_csv("reviewGraph",names=["user_id","name","rating1"], sep = '\t')

In [35]:
reviewGraph

Unnamed: 0,user_id,name,rating1
0,5044,0,1.0
1,5045,0,1.0
2,5046,0,3.0
3,5047,0,5.0
4,5048,0,5.0
5,5049,0,5.0
6,5050,0,5.0
7,5051,0,1.0
8,5052,0,2.0
9,5053,0,4.0


In [37]:
reviewGraph.shape

(608598, 3)

In [38]:
reviewGraph['user_id'].value_counts()

8367      197
9501      185
7871      178
8225      159
10934     155
8351      142
7781      142
7777      136
8467      135
8011      133
8337      132
9105      132
19044     131
18780     127
13937     126
12874     123
14510     121
25871     118
14255     116
5095      112
5558      111
7688      110
9679      109
17116     109
11146     108
5620      107
8748      106
21344     106
19166     105
9755      105
         ... 
29999       1
21803       1
19752       1
55792       1
11556       1
220490      1
224588      1
136547      1
247129      1
138594      1
132449      1
134496      1
259423      1
261470      1
255325      1
257372      1
251227      1
253274      1
243031      1
222541      1
245078      1
238933      1
240980      1
234835      1
236882      1
230737      1
232784      1
226639      1
228686      1
6147        1
Name: user_id, Length: 260277, dtype: int64

In [39]:
reviewGraph['name'].value_counts()

3745    7378
4698    6632
3237    4716
3136    3938
1881    3143
3875    3122
2605    2999
3876    2959
828     2943
4223    2858
56      2696
127     2677
1941    2560
3318    2536
9       2497
4083    2292
1814    2183
2127    2165
1859    2158
496     2075
3215    2072
1401    1973
1341    1958
1597    1911
2223    1870
3618    1834
1298    1780
3582    1689
4034    1680
2265    1675
        ... 
2750       1
1414       1
1057       1
3243       1
1783       1
3428       1
2806       1
1707       1
392        1
1640       1
2595       1
3831       1
2536       1
1167       1
3726       1
683        1
3377       1
4641       1
2665       1
2295       1
3431       1
1168       1
3998       1
871        1
245        1
2292       1
4485       1
3942       1
2921       1
2943       1
Name: name, Length: 5044, dtype: int64