# Reading in data and imports

In [1]:
# Need these
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from scipy.sparse import hstack


#Natural language toolkit. Download if not installed already
import nltk
from nltk import WordNetLemmatizer
#nltk.download('stopwords')
#nltk.download('wordnet')

# For splitting by punctuation and using regex
import re
import string

# Useful
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer

# Potentially used Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import LinearSVC, SVC

# Evaluation and feature selection tools
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import chi2, mutual_info_classif

In [2]:
# Test not useful right now
test_meta = pd.read_csv("review_meta_test.csv", sep=",", names=["label", "body_text"])
test_text = pd.read_csv("review_meta_train.csv", sep=',', names=["label", "body_text"])

# Also not useful right now
train_meta = pd.read_csv("review_meta_train.csv", sep=",")
train_text = pd.read_csv("review_text_train.csv", sep=",")

# Ids are not predictive. Consider adding in the date?
train_meta = train_meta.drop(["date", "review_id", "reviewer_id", "business_id"], axis = 1)

# Combine the meta features and text. Move class to end
train_data = pd.concat([train_meta, train_text], axis=1)
class_col = train_data.pop("rating")
train_data["rating"] = class_col
train_data.head()

Unnamed: 0,vote_funny,vote_cool,vote_useful,review,rating
0,0,1,3,dear longman & eagle.......you've left me no c...,1
1,0,0,0,Delish. The hubby and I wanted to do brunch on...,5
2,1,0,1,"yep, I've giving Yolk 5 stars. It's just reall...",5
3,17,3,3,"Meat, meat, meat. It's meat-tastic. So much me...",3
4,0,0,0,I caught up with the law school girls on a Sat...,3


# Preprocessing the review text. Extension after building models on vectors provided
We want to remove punctuation and numbers, tokenise every review's words, lemmatise the words, then remove all the words in the stopwords list.


In [3]:
# This code will take extremely long to execute. Might be useful later if we're generating our own features.
# Make sure to run once and export to csv
# Just use the vectors provided for now

punct = string.punctuation
stopwords = nltk.corpus.stopwords.words("english")
wordnet = WordNetLemmatizer()

# Preprocesses each review in the dataframe
def clean_review(review):
    # Reassign the string after every change
    
    # Remove puncutation and numbers
    no_punct = ''.join(char for char in review if char not in punct and not char.isdigit())
    
    # Tokenize into words
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    words = tokenizer.tokenize(no_punct)
    
    # Remove stopwords and lemmatize, move to lower case
    no_stopwords = [wordnet.lemmatize(word).lower() for word in words if word not in stopwords]
    
    return no_stopwords

In [4]:
# Clean each review in the dataframe? Takes ~10 mins for all reviews
train_data.loc[:1,"review"] = train_data.review.apply(lambda x : clean_review(x))
train_data.head()

Unnamed: 0,vote_funny,vote_cool,vote_useful,review,rating
0,0,1,3,"[dear, longman, eagleyouve, left, choice, the,...",1
1,0,0,0,"[delish, the, hubby, i, wanted, brunch, sunday...",5
2,1,0,1,"yep, I've giving Yolk 5 stars. It's just reall...",5
3,17,3,3,"Meat, meat, meat. It's meat-tastic. So much me...",3
4,0,0,0,I caught up with the law school girls on a Sat...,3


# Using already preprocessed vectors for the reviews

In [5]:
# OR we can just use the preprocessed text data

# countvec
import pickle
vocab = pickle.load(open("review_text_features_countvec/train_countvectorizer.pkl", "rb"))
vocab_dict = vocab.vocabulary_
sparse_matrix_train = scipy.sparse.load_npz("review_text_features_countvec/review_text_train_vec.npz")
sparse_matrix_test = scipy.sparse.load_npz("review_text_features_countvec/review_text_test_vec.npz")

# doc2vec 50, 100, 200 features vector for training
d2v_50_train = pd.read_csv(r"review_text_features_doc2vec50/review_text_train_doc2vec50.csv", index_col = False, delimiter = ",", header=None)
d2v_100_train = pd.read_csv(r"review_text_features_doc2vec100/review_text_train_doc2vec100.csv", index_col = False, delimiter = ",", header=None)
d2v_200_train = pd.read_csv(r"review_text_features_doc2vec200/review_text_train_doc2vec200.csv", index_col = False, delimiter = ",", header=None)

# doc2vec 50, 100, 200 features vector for testing
d2v_50_test = pd.read_csv(r"review_text_features_doc2vec50/review_text_test_doc2vec50.csv", index_col = False, delimiter = ",", header=None)
d2v_100_test = pd.read_csv(r"review_text_features_doc2vec100/review_text_test_doc2vec100.csv", index_col = False, delimiter = ",", header=None)
d2v_200_test = pd.read_csv(r"review_text_features_doc2vec200/review_text_test_doc2vec200.csv", index_col = False, delimiter = ",", header=None)

FileNotFoundError: [Errno 2] No such file or directory: 'review_text_features_countvec/train_countvectorizer.pkl'

# Split data and form train and validation sets


In [121]:
# Here, choose which predefined and given vector to use to represent the text data.
# Text here is already processed

# Function to combine meta features (optional?)  
def preprocess(type, randomstate = 8579):
    # For each of the doc2vec vectors, concat with meta features
    
    if type == "50":
        train = d2v_50_train
        #train = pd.concat([features.reset_index(), train_meta.reset_index()], axis = 1)
        
    elif type == "100" :
        train = d2v_100_train
        #train = pd.concat([features.reset_index(), train_meta.reset_index()], axis = 1)
        
    elif type == "200" :
        train = d2v_200_train
        #train = pd.concat([features.reset_index(), train_meta.reset_index()], axis = 1)
    
    # If we chose a doc2vec vector, split and return the training and validation sets
    if type != "count" and type != "vectoriser":
        # Above concat method causes duplicating index column
        train = train.loc[:,~train.columns.duplicated()]
        
        X_train, X_vali, Y_train, Y_vali = train_test_split(train[train.columns[:-1]],
                                                        train_meta["rating"], test_size=0.20, random_state=randomstate)
        return X_train, X_vali, Y_train, Y_vali
    
    # Otherwise do something with the sparse matrix
    elif type == "count":
        X_train, X_vali, Y_train, Y_vali = train_test_split(sparse_matrix_train, 
                                                            train_data["rating"], test_size=0.20, random_state=randomstate)
        return X_train, X_vali, Y_train, Y_vali

    # Manually make a count vector, rather than use the sparse matrix
    elif type == "vectoriser":
        # Split the reviews and ratings
        X_train, X_vali, Y_train, y_vali = train_test_split(train_data["review"], train_data["rating"], test_size=0.20, random_state=randomstate)
        X_train_txt, y_train = np.array(X_train), np.array(X_train)
        X_test_txt, y_test = np.array(X_vali), np.array(y_vali)

        # vectorise the reviews
        vectoriser = CountVectorizer(ngram_range=(1, 1), stop_words='english')
        vectoriser.fit(X_train_txt)
        X_train = vectoriser.transform(X_train_txt)
        X_test = vectoriser.transform(X_test_txt)

        return X_train, X_test, Y_train, y_test

# Fit and train models and Crudely evaluate

In [122]:
# Crude accuracy score and confusion matrix of predictions with % correct. Matrix is "Predicted label x for an instance
# with Class label y"

def evaluate(truthlist, predictions):
    # First calculate a crude accuracy score
    correct = 0;
    wrong = 0;
    for i in range(0,len(truthlist)):
        if(truthlist[i] == predictions[i]):
            correct += 1
        else:
            wrong += 1;
    print("The accuracy of the predictions is: {:.5f}\n".format(correct/(correct + wrong)))
        
    # Now construct a confusion matrix of each attribute
    truthSeries = pd.Series(truthlist, name = "Truths")
    predictionSeries = pd.Series(predictions, name = "Predictions")
    
    # Now normalise the confusion matrix so its a percentage of classification performance
    confusionDf = pd.crosstab(truthSeries, predictionSeries, rownames=["Truths"], colnames=["Predicted"], margins=False)
    confusionDfNormalised = confusionDf / confusionDf.sum(axis=0)
    print("Confusion Matrix of Correctly Labeled Classes %'s\n")
    print(confusionDfNormalised)
    print("\n\n")

    
    return

import csv

# Also need a function to export the predictions to a CSV
def export(instanceid, predictions):
    f = open("output.csv", "w", newline='')
    writer = csv.writer(f)
    writer.writerow(["Instance_id", "rating"])
    
    for i, j in zip(instanceid, predictions):
        f.write(str(i) + "," + str(j))
        f.write("\n")
    f.close()

In [86]:
category = ["1", "3", "5"]
datasets = ["50", "100", "200", "count", "vectoriser"] # don't include count for now

# Binomial Naive Bayes
Binomial NB would only work for discrete data. Possibly discretise doc2vec vectors?

In [32]:
print('\033[1m' + f"Trained with count vectoriser" '\033[0m')
X_train, X_vali, Y_train, Y_vali = preprocess("count")
clf = BernoulliNB().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali.values)

[1mTrained with count vectoriser[0m
The accuracy of the predictions is: 0.73477

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.334764  0.037296  0.015022
3          0.263948  0.418026  0.096348
5          0.401288  0.544678  0.888630





# Gaussian Naive bayes
Probably use the doc2vec continuous vectors for this

In [46]:
for i in datasets:
    if(i != "count"):
        print('\033[1m' + f"Trained with doc2vec_{i} features" '\033[0m')
        X_train, X_vali, Y_train, Y_vali = preprocess(i)
        clf = GaussianNB().fit(X_train, Y_train)
        Y_pred = clf.predict(X_vali)
        evaluate(Y_pred, Y_vali.values)
    else:
        print("Haven't used count vector for GNB")

[1mTrained with doc2vec_50 features[0m
The accuracy of the predictions is: 0.72337

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.538627  0.111888  0.052059
3          0.190987  0.465423  0.116291
5          0.270386  0.422688  0.831650



[1mTrained with doc2vec_100 features[0m
The accuracy of the predictions is: 0.66726

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.553648  0.150738  0.084952
3          0.152361  0.430458  0.155141
5          0.293991  0.418803  0.759907



[1mTrained with doc2vec_200 features[0m
The accuracy of the predictions is: 0.61632

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.538627  0.195027  0.127687
3          0.113734  0.379176  0.167573
5          0.347639  

# Multinomial Naive bayes
Same with binomial Nb. Use the countvector, or discretise the continuous data in doc2vec

In [116]:
print('\033[1m' + f"Trained with count vectoriser" '\033[0m')
X_train, X_vali, Y_train, Y_vali = preprocess("vectoriser")
clf = MultinomialNB().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali)


[1mTrained with count vectoriser[0m
The accuracy of the predictions is: 0.83791

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.583691  0.043512  0.004403
3          0.309013  0.605284  0.049469
5          0.107296  0.351204  0.946128





# Logistic Regression
works and runs for all of the vectors provided

In [128]:
for i in datasets:
    print('\033[1m' + f"Trained with doc2vec_{i} features" '\033[0m')
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali.values)

[1mTrained with doc2vec_50 features[0m
The accuracy of the predictions is: 0.81332

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.480687  0.060606  0.005957
3          0.304721  0.563326  0.057239
5          0.214592  0.376068  0.936804



[1mTrained with doc2vec_100 features[0m
The accuracy of the predictions is: 0.82508

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.532189  0.055167  0.007252
3          0.251073  0.595183  0.055685
5          0.216738  0.349650  0.937063



[1mTrained with doc2vec_200 features[0m
The accuracy of the predictions is: 0.82829

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.553648  0.064491  0.008547
3          0.242489  0.599845  0.053872
5          0.203863  

AttributeError: 'numpy.ndarray' object has no attribute 'values'

# Linear SVC

In [None]:
# Each SVM Takes around 15-20 minutes to run...
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = LinearSVC(max_iter=5000).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali.values)

# SVC with Linear kernel

In [123]:
C = 1.0

for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = SVC(kernel='linear', C=C).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali.values)

Training using doc2vec_50 data set
The accuracy of the predictions is: 0.81279

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.480687  0.061383  0.007252
3          0.298283  0.555556  0.054131
5          0.221030  0.383061  0.938617



Training using doc2vec_100 data set
The accuracy of the predictions is: 0.82561

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.547210  0.054390  0.007511
3          0.246781  0.585859  0.053354
5          0.206009  0.359751  0.939135



Training using doc2vec_200 data set


KeyboardInterrupt: 

# SVC with RBF Kernel

In [None]:
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = SVC(kernel='rbf', gamma=0.7, C=C).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali.values)

# SVC with poly kernel

In [None]:
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = SVC(kernel='poly', degree=3, gamma='auto', C=1.0).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali.values)

# Random forest Classifier

In [47]:
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = RandomForestClassifier(n_estimators=100).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali.values)


Training using doc2vec_50 data set
The accuracy of the predictions is: 0.77431

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.272532  0.020979  0.001036
3          0.281116  0.352758  0.023569
5          0.446352  0.626263  0.975395



Training using doc2vec_100 data set
The accuracy of the predictions is: 0.75472

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.148069  0.006216  0.000259
3          0.257511  0.278943  0.013209
5          0.594421  0.714841  0.986532



Training using doc2vec_200 data set
The accuracy of the predictions is: 0.72586

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.038627  0.002331  0.000000
3          0.238197  0.188811  0.012173
5          0.723176  0.808858  0.987827

# Decision Tree Classifier

In [None]:
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = DecisionTreeClassifier(max_depth=None, criterion="entropy").fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali.values)

# Logistic regression using feature selection via chi^2, MI
Marginally increases performance. Possibly due to logisitc regression already weighting good features

In [18]:
from sklearn.feature_selection import SelectKBest, chi2

X_train, X_vali, Y_train, Y_vali = preprocess("count")

x2 = SelectKBest(chi2, k=2000)

X_train_x2 = x2.fit_transform(X_train, Y_train)
X_vali_x2 = x2.transform(X_vali)

print('\033[1m' + f"Trained with COUNT features" '\033[0m')
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train_x2, Y_train)
Y_pred = clf.predict(X_vali_x2)
evaluate(Y_pred, Y_vali.values)

[1mTrained with COUNT features[0m
The accuracy of the predictions is: 0.84230

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.665242  0.056860  0.007239
3          0.207977  0.615023  0.053947
5          0.126781  0.328117  0.938814





In [20]:
X_train, X_vali, Y_train, Y_vali = preprocess("200")

mi = SelectKBest(score_func=mutual_info_classif, k=80)
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)

print('\033[1m' + f"Trained with COUNT features" '\033[0m')
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
evaluate(Y_pred, Y_vali.values)

[1mTrained with COUNT features[0m
The accuracy of the predictions is: 0.80489

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.478632  0.061033  0.008790
3          0.276353  0.518519  0.052223
5          0.245014  0.420449  0.938987





# Random forest with feature selection, via chi^2, MI
Increases performance by 2-4% with some k settings on the doc2vecs and count

In [113]:

print(f"Training using doc2vec_count data set")
X_train, X_vali, Y_train, Y_vali = preprocess("count")
    
x2 = SelectKBest(chi2, k=500)
X_train_x2 = x2.fit_transform(X_train, Y_train)
X_vali_x2 = x2.transform(X_vali)
    
clf = RandomForestClassifier(n_estimators=100).fit(X_train_x2, Y_train)
Y_pred = clf.predict(X_vali_x2)
evaluate(Y_pred, Y_vali.values)

Training using doc2vec_count data set
The accuracy of the predictions is: 0.79017

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.388412  0.017871  0.004403
3          0.287554  0.445998  0.042217
5          0.324034  0.536131  0.953380





In [111]:
print(f"Training using doc2vec_50 data set")
X_train, X_vali, Y_train, Y_vali = preprocess("200")

mi = SelectKBest(score_func=mutual_info_classif, k=len())
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)

clf = RandomForestClassifier(n_estimators=100).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
evaluate(Y_pred, Y_vali.values)

Training using doc2vec_50 data set
The accuracy of the predictions is: 0.74279

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.075107  0.006993  0.000259
3          0.343348  0.272727  0.019684
5          0.581545  0.720280  0.980057





# Using SVMs with a reduced dataset, using feature selection chi2, MI. 
Lets us run SVM classifiers in reasonable time by using a reduced feature set


In [124]:
# Can reduce the time?
print(f"Training using doc2vec_100 data set")
X_train, X_vali, Y_train, Y_vali = preprocess("100")
    
mi = SelectKBest(score_func=mutual_info_classif, k=70)
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)
    
clf = LinearSVC(max_iter=10000).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
evaluate(Y_pred, Y_vali.values)

Training using doc2vec_100 data set
The accuracy of the predictions is: 0.81350

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.405579  0.043512  0.004662
3          0.313305  0.549340  0.044548
5          0.281116  0.407148  0.950790





In [125]:
print(f"Training using doc2vec_count data set")
X_train, X_vali, Y_train, Y_vali = preprocess("count")

x2 = SelectKBest(chi2, k=5000)
X_train_x2 = x2.fit_transform(X_train, Y_train)
X_vali_x2 = x2.transform(X_vali)

clf = SVC(kernel='linear', C=1.0).fit(X_train_x2, Y_train)
Y_pred = clf.predict(X_vali_x2)
evaluate(Y_pred, Y_vali.values)

Training using doc2vec_count data set
The accuracy of the predictions is: 0.83221

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.660944  0.073038  0.011137
3          0.227468  0.613831  0.063196
5          0.111588  0.313131  0.925667





In [126]:
print(f"Training using doc2vec_200 data set")
X_train, X_vali, Y_train, Y_vali = preprocess("count")

mi = SelectKBest(score_func=mutual_info_classif, k=50)
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)

clf = SVC(kernel='rbf', degree=3, gamma='auto', C=1.0).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
evaluate(Y_pred, Y_vali.values)

Training using doc2vec_200 data set


KeyboardInterrupt: 

In [None]:
X_train.shape

# Try use some boosting models now

# XGBOOST

In [11]:
import sys
!{sys.executable} -m pip install xgboost
from xgboost import XGBClassifier



In [29]:

for i in datasets:
    if(i != "count"):
        print(f"Training using doc2vec_{i} with XGBoost")
        X_train, X_vali, Y_train, Y_vali = preprocess(i)
        clf = XGBClassifier(nrounds=100).fit(X_train, Y_train)
        Y_pred = clf.predict(X_vali)
        evaluate(Y_pred, Y_vali.values)

    


Training using doc2vec_50 with XGBoost
Parameters: { nrounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


The accuracy of the predictions is: 0.80905

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.480687  0.046620  0.006216
3          0.309013  0.538462  0.054908
5          0.210300  0.414918  0.938876



Training using doc2vec_100 with XGBoost
Parameters: { nrounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


The accuracy of the predictions is: 0.81600

Confusion M

In [30]:
print(f"Training using doc2vec_count with XGBoost")
X_train, X_vali, Y_train, Y_vali = preprocess("count")
clf = XGBClassifier().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali.values)

Training using doc2vec_count with XGBoost
The accuracy of the predictions is: 0.82900

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.553648  0.037296  0.003367
3          0.208155  0.554779  0.042994
5          0.238197  0.407925  0.953639





# Try stacking?

In [None]:
from sklearn.metrics import accuracy_score

np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                MultinomialNB(),
                #XGBClassifier(),
                #SVC(kernel='linear', C=1.0, probability=True),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'Multinomial NB',
          #'XGBoost',
          #'Linear SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)

In [None]:
X_train, X_vali, Y_train, Y_vali = preprocess("100")

for title,clf in zip(titles,classifiers):
    clf.fit(X_train, Y_train)
    print(title, "Accuracy:",clf.score(X_vali, Y_vali))
    
stacker.fit(X_train, Y_train)
print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

# Random stacking sets

In [None]:
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                MultinomialNB(),
                XGBClassifier(),
                SVC(kernel='linear', C=1.0, probability=True),
                SVC(kernel='rbf', gamma=0.7, C=1.0),
                SVC(kernel='poly', degree=3, gamma='auto', C=1.0),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'Multinomial NB',
          'XGBoost',
          'Linear SVC kernel',
          'RBF SVC kernel',
          'Poly SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

for title,clf in zip(titles,classifiers):
    clf.fit(X_train, Y_train)
    print(title, "Accuracy:",clf.score(X_vali, Y_vali))
    
stacker.fit(X_train, Y_train)
print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

In [None]:
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                MultinomialNB(),
                XGBClassifier(),
                #SVC(kernel='linear', C=1.0, probability=True),
                #SVC(kernel='rbf', gamma=0.7, C=C, probability=True),
                SVC(kernel='poly', degree=3, gamma='auto', C=1.0, probability=True),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'Multinomial NB',
          'XGBoost',
          #'Linear SVC kernel',
          #'RBF SVC kernel',
          'Poly SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

for title,clf in zip(titles,classifiers):
    clf.fit(X_train, Y_train)
    print(title, "Accuracy:",clf.score(X_vali, Y_vali))
    
stacker.fit(X_train, Y_train)
print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

# Use a vectoriser vs "count"


In [108]:
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_vali, Y_train, y_vali = train_test_split(train_data["review"], train_data["rating"], test_size=0.20, random_state=8579)

X_train_txt, y_train = np.array(X_train), np.array(Y_train)
X_test_txt, y_test = np.array(X_vali), np.array(y_vali)

vectoriser = CountVectorizer(ngram_range=(1, 2), stop_words = 'english')
vectoriser.fit(X_train_txt)
X_train = vectoriser.transform(X_train_txt)
X_test = vectoriser.transform(X_test_txt)

clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
evaluate(Y_pred, y_test)


The accuracy of the predictions is: 0.85340

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.628755  0.048951  0.003367
3          0.208155  0.639472  0.044807
5          0.163090  0.311577  0.951826





In [None]:
def splitVectorised(ngram_range = (1,1), randomstate = 8579):
    # Split the reviews and ratings
    X_train, X_vali, Y_train, y_vali = train_test_split(train_data["review"], train_data["rating"], test_size=0.20, random_state=randomstate)
    X_train_txt, y_train = np.array(X_train), np.array(X_train)
    X_test_txt, y_test = np.array(X_vali), np.array(y_vali)
        
    # vectorise the reviews
    vectoriser = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    vectoriser.fit(X_train_txt)
    X_train = vectoriser.transform(X_train_txt)
    X_test = vectoriser.transform(X_test_txt)
    
    return X_train, X_test, Y_train, y_test

# Takeaways:
- SVC with kernel = linear better than rest
- Countvectoriser() > sparse matrix given
- better to use voting features
- feature selection is potential