# Reading in data and imports

In [39]:
# Need
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

#Natural language toolkit. Download if not installed already
import nltk
from nltk import WordNetLemmatizer
#nltk.download('stopwords')
#nltk.download('wordnet')

# For splitting by punctuation and using regex
import re
import string

# Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import LinearSVC, SVC

# Evaluation and feature selection tools
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import chi2, mutual_info_classif

In [40]:
# Test not useful right now
test_meta = pd.read_csv("review_meta_test.csv", sep=",", names=["label", "body_text"])
test_text = pd.read_csv("review_meta_train.csv", sep=',', names=["label", "body_text"])

# Also not useful right now
train_meta = pd.read_csv("review_meta_train.csv", sep=",")
train_text = pd.read_csv("review_text_train.csv", sep=",")

# Ids are not predictive. Consider adding in the date?
train_meta = train_meta.drop(["date", "review_id", "reviewer_id", "business_id"], axis = 1)

# Combine the meta features and text. Move class to end
train_data = pd.concat([train_meta, train_text], axis=1)
class_col = train_data.pop("rating")
train_data["rating"] = class_col
train_data.head()

Unnamed: 0,vote_funny,vote_cool,vote_useful,review,rating
0,0,1,3,dear longman & eagle.......you've left me no c...,1
1,0,0,0,Delish. The hubby and I wanted to do brunch on...,5
2,1,0,1,"yep, I've giving Yolk 5 stars. It's just reall...",5
3,17,3,3,"Meat, meat, meat. It's meat-tastic. So much me...",3
4,0,0,0,I caught up with the law school girls on a Sat...,3


# Preprocessing the review text.
We want to remove punctuation and numbers, tokenise every review's words, lemmatise the words, then remove all the words in the stopwords list

In [41]:
# This code will take extremely long to execute. Might be useful later if we're generating our own features.
# Make sure to run once and export to csv
# Just use the vectors provided for now

punct = string.punctuation
stopwords = nltk.corpus.stopwords.words("english")
wordnet = WordNetLemmatizer()

# Preprocesses each review in the dataframe
def clean_review(review):
    # Reassign the string after every change
    
    # Remove puncutation and numbers
    no_punct = ''.join(char for char in review if char not in punct and not char.isdigit())
    
    # Tokenize into words
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    words = tokenizer.tokenize(no_punct)
    
    # Remove stopwords and lemmatize
    no_stopwords = [wordnet.lemmatize(word) for word in words if word not in stopwords]
    
    return no_stopwords

In [None]:
# Clean each review in the dataframe?
train_data.loc[:3,"review"] = train_data.review.apply(lambda x : clean_review(x))

Just use the vectors given to us...

In [None]:
# OR we can just use the preprocessed text data

# countvec
import pickle
vocab = pickle.load(open("review_text_features_countvec/train_countvectorizer.pkl", "rb"))
vocab_dict = vocab.vocabulary_
sparse_matrix_train = scipy.sparse.load_npz("review_text_features_countvec/review_text_train_vec.npz")
sparse_matrix_test = scipy.sparse.load_npz("review_text_features_countvec/review_text_test_vec.npz")

# doc2vec 50, 100, 200 features vector for training
d2v_50_train = pd.read_csv(r"review_text_features_doc2vec50/review_text_train_doc2vec50.csv", index_col = False, delimiter = ",", header=None)
d2v_100_train = pd.read_csv(r"review_text_features_doc2vec100/review_text_train_doc2vec100.csv", index_col = False, delimiter = ",", header=None)
d2v_200_train = pd.read_csv(r"review_text_features_doc2vec200/review_text_train_doc2vec200.csv", index_col = False, delimiter = ",", header=None)

# doc2vec 50, 100, 200 features vector for testing
d2v_50_test = pd.read_csv(r"review_text_features_doc2vec50/review_text_test_doc2vec50.csv", index_col = False, delimiter = ",", header=None)
d2v_100_test = pd.read_csv(r"review_text_features_doc2vec100/review_text_test_doc2vec100.csv", index_col = False, delimiter = ",", header=None)
d2v_200_test = pd.read_csv(r"review_text_features_doc2vec200/review_text_test_doc2vec200.csv", index_col = False, delimiter = ",", header=None)

# Split data and form train and validation sets


In [33]:
# Here, choose which predefined and given vector to use to represent the text data.
# Text here is already processed

# Function to combine meta features (optional?)  
def preprocess(type):
    # For each of the doc2vec vectors, concat with meta features
    
    if type == "50":
        features = d2v_50_train
        train = pd.concat([features.reset_index(), train_meta.reset_index()], axis = 1)
        
    elif type == "100" :
        features = d2v_100_train
        train = pd.concat([features.reset_index(), train_meta.reset_index()], axis = 1)
        
    elif type == "200" :
        features = d2v_200_train
        train = pd.concat([features.reset_index(), train_meta.reset_index()], axis = 1)
    
    # If we chose a doc2vec vector, split and return the training and validation sets
    if type != "count":
        X_train, X_vali, Y_train, Y_vali = train_test_split(train[train.columns[:-1]],
                                                        train["rating"], test_size=0.30, random_state=12387)
        return X_train, X_vali, Y_train, Y_vali
    
    # Otherwise do something with the sparse matrix
    elif type == "count":
        X_train, X_vali, Y_train, Y_vali = train_test_split()
        return sparse_matrix_train



# Fit and train models

In [34]:
def evaluate(truthlist, predictions):
    # First calculate a crude accuracy score
    correct = 0;
    wrong = 0;
    for i in range(0,len(truthlist)):
        if(truthlist[i] == predictions[i]):
            correct += 1
        else:
            wrong += 1;
    print("The accuracy of the predictions is: {:.5f}\n".format(correct/(correct + wrong)))
        
    # Now construct a confusion matrix of each attribute
    truthSeries = pd.Series(truthlist, name = "Truths")
    predictionSeries = pd.Series(predictions, name = "Predictions")
    
    # Now normalise the confusion matrix so its a percentage of classification performance
    confusionDf = pd.crosstab(truthSeries, predictionSeries, rownames=["Truths"], colnames=["Predicted"], margins=False)
    confusionDfNormalised = confusionDf / confusionDf.sum(axis=0)
    print("Confusion Matrix of Correctly Labeled Classes %'s\n")
    print(confusionDfNormalised)
    print("\n\n")

    
    return


# Gaussian Naive bayes

In [35]:
category = ["1", "3", "5"]
datasets = ["50", "100", "200"] # don't include count for now

for i in datasets:
    if(i != "count"):
        print('\033[1m' + f"Trained with doc2vec_{i} features" '\033[0m')
        X_train, X_vali, Y_train, Y_vali = preprocess(i)
        clf = GaussianNB().fit(X_train, Y_train)
        Y_pred = clf.predict(X_vali)
        evaluate(Y_pred, Y_vali.values)

[1mTrained with doc2vec_50 features[0m
The accuracy of the predictions is: 0.72379

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.390013  0.089777  0.037198
3          0.333333  0.566684  0.143403
5          0.276653  0.343539  0.819399



[1mTrained with doc2vec_100 features[0m
The accuracy of the predictions is: 0.68400

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.431849  0.110535  0.057188
3          0.232119  0.500778  0.164957
5          0.336032  0.388687  0.777855



[1mTrained with doc2vec_200 features[0m
The accuracy of the predictions is: 0.64541

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.443995  0.138557  0.079611
3          0.140351  0.407369  0.169303
5          0.415655  

# Multinomial Naive bayes

In [36]:
#    print('\033[1m' + f"Trained with count vectoriser" '\033[0m')
#    X_train, X_vali, Y_train, Y_vali = preprocess("count")
#    clf = GaussianNB().fit(X_train, Y_train)
#    Y_pred = clf.predict(X_vali)
#    evaluate(Y_pred, Y_vali.values)

# Logistic Regression

In [37]:
for i in datasets:
    print('\033[1m' + f"Trained with doc2vec_{i} features" '\033[0m')
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali.values)

[1mTrained with doc2vec_50 features[0m
The accuracy of the predictions is: 0.77556

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.147099  0.010379  0.000348
3          0.464238  0.464453  0.038936
5          0.388664  0.525169  0.960716



[1mTrained with doc2vec_100 features[0m
The accuracy of the predictions is: 0.80596

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.342780  0.039958  0.003998
3          0.390013  0.576544  0.053537
5          0.267206  0.383498  0.942465



[1mTrained with doc2vec_200 features[0m
The accuracy of the predictions is: 0.82389

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.441296  0.034769  0.003303
3          0.333333  0.644006  0.063271
5          0.225371  