In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, StackingClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import plot_roc_curve

In [52]:
import warnings
warnings.filterwarnings('ignore')

In [53]:
data_list = []
for i in range(0,7):
    data = pd.read_csv(f'data/cleaned_review_data_{i}.csv', index_col=0)
    data_list.append(data)

df = pd.concat(data_list, ignore_index=True)

In [54]:
df['stars'] = df['stars'].astype('int64')

In [55]:
# CLASSIFICATION
data_classes = df[(df['stars']==1) | (df['stars']==3) | (df['stars']==5)]
print(data_classes.shape)

(801271, 22)


In [56]:
X = data_classes.drop(columns=['stars', 'review_id', 'user_id', 'business_id', 'text', 'date', 'month', 'day', 'hour', 'min', 'cleaned_text'])
y = data_classes['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

In [57]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [58]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [59]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('DecisionTreeClassifier() test accuracy:', model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

DecisionTreeClassifier() test accuracy: 0.7600574085051949
[[25148  2154  5660]
 [ 5054  5340 18509]
 [ 3710  3365 91315]]


In [60]:
print(classification_report(y_test, y_pred, target_names = ['1', '3', '5']))

              precision    recall  f1-score   support

           1       0.74      0.76      0.75     32962
           3       0.49      0.18      0.27     28903
           5       0.79      0.93      0.85     98390

    accuracy                           0.76    160255
   macro avg       0.67      0.63      0.62    160255
weighted avg       0.73      0.76      0.73    160255



In [61]:
X_ = np.concatenate((X_train, X_test))
y_ = np.concatenate((y_train, y_test))

y_pred = model.predict(X_)

print('Test accuracy:', model.score(X_, y_))

print(confusion_matrix(y_, y_pred))

print(classification_report(y_, y_pred))

Test accuracy: 0.7644829277485395
[[127636  10828  27846]
 [ 24759  27896  91147]
 [ 17914  16219 457026]]
              precision    recall  f1-score   support

           0       0.75      0.77      0.76    166310
           1       0.51      0.19      0.28    143802
           2       0.79      0.93      0.86    491159

    accuracy                           0.76    801271
   macro avg       0.68      0.63      0.63    801271
weighted avg       0.73      0.76      0.73    801271



In [72]:
final_df = data_classes.copy()
# convert encoder to original scale
final_df['pred_stars'] = y_pred+1

final_df = final_df[abs(final_df['pred_stars']-final_df['stars'])<4]

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year,...,min,sec,length_of_reviews,num_of_words,num_of_sentences,capital_words_ratio,cleaned_text,sentiment_polarity,sentiment_subjectivity,pred_stars
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,2018,...,9,11,513,114,7,0.026316,decide eat aware going take hours beginning en...,0.209722,0.419444,3
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,2012,...,28,18,829,174,7,0.011494,ive taken lot spin classes years nothing compa...,0.395455,0.571212,3
2,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,2015,...,1,3,243,56,6,0.000000,wow yummy different delicious favorite lamb cu...,0.279545,0.713068,3
3,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,2015,...,10,31,341,79,10,0.063291,long term frequent customer establishment went...,0.211111,0.405556,3
4,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5,2,0,0,Amazingly amazing wings and homemade bleu chee...,2015-08-07 02:29:16,2015,...,29,16,192,34,4,0.029412,amazingly amazing wings homemade bleu cheese r...,0.505556,0.788889,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211136,YAAmRuqBqaFV6kW6ozjBpQ,aFLEltFU8TVgYS1C6tzuUA,8XjxHeV66F4eoIy06rW0pA,5,4,0,1,"Porta, Porta, Porta, one of my new FAVORITE sp...",2018-08-04 17:47:21,2018,...,47,21,1658,364,23,0.041209,porta porta porta one new favorite spots week ...,0.271468,0.576610,3
1211141,X5R98ygOtbhryDiKA-J2qQ,LHWtjTG7e1NzNPYUbUo-9w,rgeuy1qbw6Z8B6CSVANHIA,5,1,1,1,I've been to the other Federal Donuts location...,2012-10-13 14:39:37,2012,...,39,37,752,161,10,0.062112,ive federal donuts location multiple times lov...,0.191912,0.414706,3
1211142,MVg4YUQeEhCA7Z7RsBJSVg,7-7A0Avj47slLGV7yBFc8w,ytynqOUb3hjKeJfRj5Tshw,3,1,0,0,"I was so excited about all the food I saw, but...",2013-07-25 21:00:15,2013,...,0,15,262,69,4,0.115942,excited food saw unfortunately place closes ea...,-0.006250,0.637500,3
1211143,nLjbVsETpqO17RbFcqskkA,am7-gkH_PDz598oTdYSD6A,3gVSrS4kffGGZT8oXHsIcw,3,2,0,2,"*Later Yelp* I've only been here once, but I l...",2014-11-03 14:45:46,2014,...,45,46,361,81,6,0.061728,later yelp ive love place mainly atmosphere co...,0.316667,0.597619,3


In [73]:
final_df.to_csv('rmfake/reviews_without_fake.csv', index=False)