# Bhurtpore Inn - Trip Advisor Modeling

In [300]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [289]:
df = pd.read_csv('df.csv').drop(['Unnamed: 0'], axis=1)

# Modelling

In [566]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

### Baseline Calculation

In [698]:
df.rating.value_counts()

5    192
4     80
3     19
2      9
1      3
Name: rating, dtype: int64

In [705]:
df.rating.value_counts(normalize=True)[5]

0.6336633663366337

### Model Setup

In [334]:
model_df = df.copy()

y = model_df.pop('rating')
X = model_df

In [335]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)

In [336]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, stratify=y_over)

In [337]:
df.columns

Index(['username', 'rating', 'review_year', 'review_month', 'platform',
       'headline', 'post_text', 'visit_year', 'visit_month'],
      dtype='object')

In [338]:
ohe_r_year = OneHotEncoder()
ohe_r_month = OneHotEncoder()
ohe_v_year = OneHotEncoder()
ohe_v_month = OneHotEncoder()
ohe_platform = OneHotEncoder()
nlp_post = TfidfVectorizer(lowercase=True, stop_words='english')
nlp_head = TfidfVectorizer(lowercase=True, stop_words='english')

In [339]:
col_trans = ColumnTransformer([
    ('review_year', ohe_r_year, ['review_year']),
    ('review_month', ohe_r_month, ['review_month']),
    ('platform', ohe_platform, ['platform']),
    ('headline', nlp_head, 'headline'),
    ('post_text', nlp_post, 'post_text'),
    ('visit_year', ohe_v_year, ['visit_year']),
    ('visit_month', ohe_v_month, ['visit_month'])
])

In [564]:
log = LogisticRegression(penalty='l2', solver='saga', max_iter=10000, n_jobs=-2)
rfc = RandomForestClassifier(n_jobs=-2)
nnc = MLPClassifier(hidden_layer_sizes=(100, 40, 25))
knc = KNeighborsClassifier()

In [583]:
# pipe.get_params()

In [579]:
params = {
    'logisticregression__penalty' : ['l1', 'none', 'l2'],
    'logisticregression__solver' : ['lbfgs', 'saga'],
    'logisticregression__C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    'logisticregression__tol' : [0.0001, 0.001, 0.01, 0.05, 0.1]
}

In [582]:
gs = GridSearchCV(pipe, params, cv=5, n_jobs=-2, verbose=1)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


 0.38932165 0.38928919 0.38932165 0.39185329 0.74050633 0.73797468
 0.73544304 0.73037975 0.73037975 0.74047387 0.73291139 0.73287894
 0.72525154 0.73550795 0.74797144 0.74797144 0.74797144 0.74797144
 0.74797144 0.74797144 0.73774748 0.73521584 0.73521584 0.75060045
        nan        nan        nan        nan        nan 0.39185329
 0.38932165 0.39185329 0.38675755 0.39185329 0.74050633 0.73797468
 0.73544304 0.73037975 0.73037975 0.74044142 0.73031483 0.72275235
 0.74050633 0.74563453 0.7633236  0.7633236  0.7633236  0.7633236
 0.7633236  0.7633236  0.75822785 0.7556962  0.7633236  0.75822785
        nan        nan        nan        nan        nan 0.38932165
 0.38932165 0.39185329 0.38932165 0.39185329 0.74050633 0.73797468
 0.73544304 0.73037975 0.73037975 0.73790977 0.74303797 0.73541058
 0.73544304 0.73047712 0.76838689 0.76838689 0.76838689 0.76838689
 0.76838689 0.76838689 0.76838689 0.7633236  0.76585524 0.76838689
        nan        nan        nan        nan        nan 0.76332

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('review_year',
                                                                         OneHotEncoder(),
                                                                         ['review_year']),
                                                                        ('review_month',
                                                                         OneHotEncoder(),
                                                                         ['review_month']),
                                                                        ('platform',
                                                                         OneHotEncoder(),
                                                                         ['platform']),
                                                                        ('headline',
                                

In [590]:
gs.best_score_

0.7761116520610191

In [593]:
gs.best_params_

{'logisticregression__C': 0.1,
 'logisticregression__penalty': 'l2',
 'logisticregression__solver': 'lbfgs',
 'logisticregression__tol': 0.0001}

In [592]:
model = gs.best_estimator_

In [594]:
model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('review_year',
                                                  OneHotEncoder(),
                                                  ['review_year']),
                                                 ('review_month',
                                                  OneHotEncoder(),
                                                  ['review_month']),
                                                 ('platform', OneHotEncoder(),
                                                  ['platform']),
                                                 ('headline',
                                                  TfidfVectorizer(stop_words='english'),
                                                  'headline'),
                                                 ('post_text',
                                                  TfidfVectorizer(stop_words='english'),
                                                  

Could get marginally better scores with neural network and random forest classifiers, although for the purpose of interpretability on the feature importance end, I decided to use the logistic regression classifier based on its closeness to the best scores of the preceding two combined with ability to satisfy the overall goal of the project:-

`to advise the Bhurtpore Inn pub on factors that primarily influence Trip Advisor ratings`

In [424]:
# pipe = make_pipeline(col_trans, nnc)
# pipe.fit(X_train, y_train)

# print(pipe.score(X_train, y_train))
# print(pipe.score(X_test, y_test))

# train score = 1.0
# test score = 0.8080808080808081

1.0
0.8080808080808081


In [394]:
# pipe = make_pipeline(col_trans, ofc)
# pipe.fit(X_train, y_train)

# print(pipe.score(X_train, y_train))
# print(pipe.score(X_test, y_test))

# train score = 1.0
# test score = 0.8080808080808081

1.0
0.8080808080808081


In [595]:
predictions = pd.DataFrame(model.predict(X_test), columns=['predictions'])
predictions['true'] = y_test.reset_index(drop=True)

In [596]:
predictions

Unnamed: 0,predictions,true
0,5,5
1,5,5
2,1,1
3,1,1
4,5,4
...,...,...
94,1,1
95,5,4
96,5,4
97,5,5


In [597]:
predictions['match'] = predictions.predictions==predictions.true

In [598]:
misses = predictions[predictions['match']==False]

In [603]:
misses

Unnamed: 0,predictions,true,match,off_by
4,5,4,False,1
14,5,2,False,3
22,5,4,False,1
26,1,4,False,-3
28,5,3,False,2
35,5,4,False,1
38,5,2,False,3
40,5,3,False,2
42,5,4,False,1
44,5,4,False,1


In [604]:
misses['off_by'] = misses.predictions - misses.true

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misses['off_by'] = misses.predictions - misses.true


In [605]:
misses.drop(['match'], axis=1)

Unnamed: 0,predictions,true,off_by
4,5,4,1
14,5,2,3
22,5,4,1
26,1,4,-3
28,5,3,2
35,5,4,1
38,5,2,3
40,5,3,2
42,5,4,1
44,5,4,1


# Finished Model

In [606]:
import joblib

In [607]:
# joblib.dump(pipe, 'model.pkl')

['model.pkl']