In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, StackingClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import plot_roc_curve

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
data_list = []
for i in range(0,7):
    data = pd.read_csv(f'data/cleaned_review_data_{i}.csv', index_col=0)
    data_list.append(data)

df = pd.concat(data_list, ignore_index=True)

In [4]:
df['stars'] = df['stars'].astype('int64')

In [5]:
# CLASSIFICATION
data_classes = df[(df['stars']==1) | (df['stars']==3) | (df['stars']==5)]
data_classes.head()
print(data_classes.shape)

(801271, 22)


In [6]:
X = data_classes.drop(columns=['stars', 'review_id', 'user_id', 'business_id', 'text', 'date', 'month', 'day', 'hour', 'min', 'cleaned_text'])
y = data_classes['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

In [7]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [9]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('DecisionTreeClassifier() test accuracy:', model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

DecisionTreeClassifier() test accuracy: 0.6510186889644629
[[21033  5904  6333]
 [ 5614  8298 14859]
 [ 6840 16376 74998]]


In [15]:
print(classification_report(y_test, y_pred, target_names = ['1', '3', '5']))

              precision    recall  f1-score   support

           1       0.63      0.63      0.63     33270
           3       0.27      0.29      0.28     28771
           5       0.78      0.76      0.77     98214

    accuracy                           0.65    160255
   macro avg       0.56      0.56      0.56    160255
weighted avg       0.66      0.65      0.65    160255



todo: try draw ROC curve for each type

In [26]:
X_ = np.concatenate((X_train, X_test))
y_ = np.concatenate((y_train, y_test))

y_pred = model.predict(X_)

print('Test accuracy:', model.score(X_, y_))

print(confusion_matrix(y_, y_pred))

print(classification_report(y_, y_pred))

Test accuracy: 0.9302008933307209
[[154073   5904   6333]
 [  5615 123328  14859]
 [  6841  16376 467942]]
              precision    recall  f1-score   support

           0       0.93      0.93      0.93    166310
           1       0.85      0.86      0.85    143802
           2       0.96      0.95      0.95    491159

    accuracy                           0.93    801271
   macro avg       0.91      0.91      0.91    801271
weighted avg       0.93      0.93      0.93    801271



In [27]:
final_df = pd.concat((X, y), axis = 1)
# convert encoder to original scale
final_df['pred_stars'] = y_pred+1

final_df = final_df[abs(final_df['pred_stars']-final_df['stars'])<4]
final_df

Unnamed: 0,useful,funny,cool,year,sec,length_of_reviews,num_of_words,num_of_sentences,capital_words_ratio,sentiment_polarity,sentiment_subjectivity,stars,pred_stars
0,0,0,0,2018,11,513,114,7,0.026316,0.209722,0.419444,3,1
1,1,0,1,2012,18,829,174,7,0.011494,0.395455,0.571212,5,3
2,1,0,1,2015,3,243,56,6,0.000000,0.279545,0.713068,5,3
3,1,2,1,2015,31,341,79,10,0.063291,0.211111,0.405556,1,2
4,2,0,0,2015,16,192,34,4,0.029412,0.505556,0.788889,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211136,4,0,1,2018,21,1658,364,23,0.041209,0.271468,0.576610,5,3
1211141,1,1,1,2012,37,752,161,10,0.062112,0.191912,0.414706,5,3
1211142,1,0,0,2013,15,262,69,4,0.115942,-0.006250,0.637500,3,3
1211143,2,0,2,2014,46,361,81,6,0.061728,0.316667,0.597619,3,3
