# Imports


In [43]:
# Need these
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

#Natural language toolkit. Download if not installed already
import nltk
from nltk import WordNetLemmatizer
#nltk.download('stopwords')
#nltk.download('wordnet')

# For splitting by punctuation and using regex
import re
import string

# Useful
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Potentially used Models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import LinearSVC, SVC

# Evaluation and feature selection tools
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import chi2, mutual_info_classif

# Files

In [62]:
# Test not useful right now
test_meta = pd.read_csv("review_meta_test.csv", sep=",")
test_text = pd.read_csv("review_text_test.csv", sep=',')

# Also not useful right now
train_meta = pd.read_csv("review_meta_train.csv", sep=",")
train_text = pd.read_csv("review_text_train.csv", sep=",")

# Ids are not predictive. Consider adding in the date?
train_meta = train_meta.drop(["date", "review_id", "reviewer_id", "business_id"], axis = 1)

# Combine the meta features and text. MoveA class to end
train_data = pd.concat([train_meta, train_text], axis=1)
class_col = train_data.pop("rating")
train_data["rating"] = class_col
train_data.head()

Unnamed: 0,review
0,It is 10am on a Monday morning and my wife say...
1,I came here with a friend for her work thing -...
2,ATTENTION!!! DO NOT GO TO THIS RESTAURANT EVER...
3,"I agree, with Jonathan S. - this place is a 3...."
4,"First visit to Chicago, and a friend recommend..."
5,I was so impressed with this place! The servic...
6,"Fantastic little place! The food was so CHEAP,..."
7,One of the best dogs I've ever had. EVER! To c...
8,As a New Yorker visiting Chicago I went to The...
9,had the pleasure of going back to the gage thi...


In [3]:
# Seems like manually using countvectoriser() is better than sparse matrix

# countvec
import pickle
vocab = pickle.load(open("review_text_features_countvec/train_countvectorizer.pkl", "rb"))
vocab_dict = vocab.vocabulary_
sparse_matrix_train = scipy.sparse.load_npz("review_text_features_countvec/review_text_train_vec.npz")
sparse_matrix_test = scipy.sparse.load_npz("review_text_features_countvec/review_text_test_vec.npz")

# doc2vec 50, 100, 200 features vector for training
d2v_50_train = pd.read_csv(r"review_text_features_doc2vec50/review_text_train_doc2vec50.csv", index_col = False, delimiter = ",", header=None)
d2v_100_train = pd.read_csv(r"review_text_features_doc2vec100/review_text_train_doc2vec100.csv", index_col = False, delimiter = ",", header=None)
d2v_200_train = pd.read_csv(r"review_text_features_doc2vec200/review_text_train_doc2vec200.csv", index_col = False, delimiter = ",", header=None)

# doc2vec 50, 100, 200 features vector for testing
d2v_50_test = pd.read_csv(r"review_text_features_doc2vec50/review_text_test_doc2vec50.csv", index_col = False, delimiter = ",", header=None)
d2v_100_test = pd.read_csv(r"review_text_features_doc2vec100/review_text_test_doc2vec100.csv", index_col = False, delimiter = ",", header=None)
d2v_200_test = pd.read_csv(r"review_text_features_doc2vec200/review_text_test_doc2vec200.csv", index_col = False, delimiter = ",", header=None)
sparse_matrix_train.shape



(28068, 41648)

# Preprocessing
consider in countvectoriser: not using stopwords param, adding lemmatizer param

In [64]:
# Here, choose which predefined and given vector to use to represent the text data.
# Text here is already processed

d2vtrain = [d2v_50_train, d2v_100_train, d2v_200_train]
d2v_50_train.name = "d2v_50"
d2v_100_train.name = "d2v_100"
d2v_200_train.name = "d2v_200"

# Function to split a doc2Vec vector into training and testing
def splitDoc2Vec(doc, randomstate = 8579):
    # For each of the doc2vec vectors, concat with meta features
    train = doc
    train = pd.concat([train.reset_index(), train_meta.reset_index()], axis = 1)
    
    # Above concat method causes duplicating index column
    train = train.loc[:,~train.columns.duplicated()]
    train = train.drop(columns = ['index'])
    
    # Split into training and validation sets
    X_train, X_test, Y_train, Y_test = train_test_split(train[train.columns[:-1]],
                                                        train["rating"], test_size=0.20, random_state=randomstate)
    return X_train, X_test, Y_train, np.array(Y_test)

# Function to split the given sparse matrix into training and testing
def splitSparse(randomstate = 8579):
    X_train, X_vali, Y_train, Y_vali = train_test_split(sparse_matrix_train, 
                                                            train_data["rating"], test_size=0.20, random_state=randomstate)
    return X_train, X_vali, Y_train, np.array(Y_vali)

# Function to split the training data into train and test, but first vectorising the reviews
def splitVectorised(ngram_range = (1,1), randomstate = 8579):
    # Split the reviews and ratings
    X_train, X_vali, Y_train, y_vali = train_test_split(train_data["review"], train_data["rating"], test_size=0.20, random_state=randomstate)
    X_train_txt, y_train = np.array(X_train), np.array(X_train)
    X_test_txt, y_test = np.array(X_vali), np.array(y_vali)
        
    # vectorise the reviews
    vectoriser = CountVectorizer(ngram_range=ngram_range) #optional stop_words = 'english', tokenizer=LemmaTokenizer()
    vectoriser.fit(X_train_txt)
    X_train = vectoriser.transform(X_train_txt)
    X_test = vectoriser.transform(X_test_txt)
    
    # Need to return vectoriser fit external test set
    return X_train, X_test, Y_train, y_test, vectoriser

# Run classifiers using a single train test split to get some crude results. Cross validation comes later

In [5]:
# Crude accuracy score and confusion matrix of predictions with % correct. Matrix is "Predicted label x for an instance
# with Class label y"

def evaluate(truthlist, predictions):
    # First calculate a crude accuracy score
    correct = 0;
    wrong = 0;
    for i in range(0,len(truthlist)):
        if(truthlist[i] == predictions[i]):
            correct += 1
        else:
            wrong += 1;
    print("The accuracy of the predictions is: {:.5f}\n".format(correct/(correct + wrong)))
        
    # Now construct a confusion matrix of each attribute
    truthSeries = pd.Series(truthlist, name = "Truths")
    predictionSeries = pd.Series(predictions, name = "Predictions")
    
    # Now normalise the confusion matrix so its a percentage of classification performance
    confusionDf = pd.crosstab(truthSeries, predictionSeries, rownames=["Truths"], colnames=["Predicted"], margins=False)
    confusionDfNormalised = confusionDf / confusionDf.sum(axis=0)
    print("Confusion Matrix of Correctly Labeled Classes %'s\n")
    print(confusionDfNormalised)
    print("\n\n")
    
    return

# Lemmatizer if wanted in countvectoriser()

In [6]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import nltk
nltk.download('punkt')
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alecy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# MNB
MNB runs ~84% with ngrams (1,1). ~72% with ngrams(1,2), ~69% with ngrams (1,3)

In [None]:
print('\033[1m' + f"Trained with count vectoriser" '\033[0m')
X_train, X_vali, Y_train, Y_vali = splitVectorised((1,1))
clf = MultinomialNB().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali)

print('\033[1m' + f"Trained with sparse matrix" '\033[0m')
X_train, X_vali, Y_train, Y_vali = splitSparse()
clf = MultinomialNB().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali)

# Logistic regression

In [65]:
# Doc 2 vecs
for d2v in d2vtrain:
    print('\033[1m' + f"Trained with {d2v.name} features" '\033[0m')
    X_train, X_vali, Y_train, Y_vali = splitDoc2Vec(d2v)
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali)
    
# using count vectoriser
print('\033[1m' + f"Trained with vectorised features" '\033[0m')
X_train, X_vali, Y_train, Y_vali = splitVectorised((1,2))
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali)

[1mTrained with d2v_50 features[0m
The accuracy of the predictions is: 0.82027

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.532189  0.062937  0.006475
3          0.276824  0.566434  0.053872
5          0.190987  0.370629  0.939653



[1mTrained with d2v_100 features[0m
The accuracy of the predictions is: 0.83203

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.575107  0.059052  0.006734
3          0.244635  0.604507  0.054390
5          0.180258  0.336441  0.938876



[1mTrained with d2v_200 features[0m
The accuracy of the predictions is: 0.83060

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.570815  0.059829  0.008547
3          0.244635  0.606838  0.054908
5          0.184549  0.333333  0.

ValueError: too many values to unpack (expected 4)

# SVM linear kernel with feature selection. not feasible otherwise

In [None]:
#Doc2vecs
for d2v in d2vtrain:
    # Can reduce the time?
    print(f"Training using {d2v.name} data set")
    X_train, X_vali, Y_train, Y_vali = splitDoc2Vec(d2v)

    mi = SelectKBest(score_func=mutual_info_classif, k=50)
    mi.fit(X_train, Y_train)
    X_train_mi = mi.transform(X_train)
    X_test_mi = mi.transform(X_vali)

    clf = LinearSVC(max_iter=10000).fit(X_train_mi, Y_train)
    Y_pred = clf.predict(X_test_mi)
    evaluate(Y_pred, Y_vali)
    

In [None]:
#Countvectoriser. takes ~15 minutes for 1k best
print(f"Training using countvectoriser data set")
X_train, X_vali, Y_train, Y_vali = splitVectorised()

x2 = SelectKBest(chi2, k=500)
X_train_x2 = x2.fit_transform(X_train, Y_train)
X_vali_x2 = x2.transform(X_vali)

clf = SVC(kernel='linear', C=1.0).fit(X_train_x2, Y_train)
Y_pred = clf.predict(X_vali_x2)
evaluate(Y_pred, Y_vali)

# XGBOOST

In [8]:
import sys
!{sys.executable} -m pip install xgboost
from xgboost import XGBClassifier



In [None]:
# Doc2vecs
for d2v in d2vtrain:
    print(f"Training using {d2v.name} with XGBoost")
    X_train, X_vali, Y_train, Y_vali = splitDoc2Vec(d2v)
    clf = XGBClassifier(nrounds=200).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    evaluate(Y_pred, Y_vali)


In [None]:
# Count vectoriser
print(f"Training using count_vectorised with XGBoost")
X_train, X_vali, Y_train, Y_vali = splitVectorised()
clf = XGBClassifier().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali)

#Sparse
print(f"Training using sparse with XGBoost")
X_train, X_vali, Y_train, Y_vali = splitSparse()
clf = XGBClassifier().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali)

# Random forest with feature selection.
much better with feature selection


In [None]:
for d2v in d2vtrain:
    print(f"Training using {d2v.name} data set")
    X_train, X_vali, Y_train, Y_vali = splitDoc2Vec(d2v)

    mi = SelectKBest(score_func=mutual_info_classif, k=(int)(len(d2v.columns)/2))
    mi.fit(X_train, Y_train)
    X_train_mi = mi.transform(X_train)
    X_test_mi = mi.transform(X_vali)

    clf = RandomForestClassifier(n_estimators=100).fit(X_train_mi, Y_train)
    Y_pred = clf.predict(X_test_mi)
    evaluate(Y_pred, Y_vali)


In [78]:
#Vectorised
print(f"Training using count_vectorised data set")
X_train, X_vali, Y_train, Y_vali = splitVectorised()
    
x2 = SelectKBest(chi2, k=500)
X_train_x2 = x2.fit_transform(X_train, Y_train)
X_vali_x2 = x2.transform(X_vali)
    
clf = RandomForestClassifier().fit(X_train_x2, Y_train)
Y_pred = clf.predict(X_vali_x2)
evaluate(Y_pred, Y_vali)

Training using count_vectorised data set
The accuracy of the predictions is: 0.79498

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.291845  0.007770  0.000518
3          0.210300  0.422688  0.019684
5          0.497854  0.569542  0.979798





# CROSS VALIDATION FOR ALL MODELS

In [9]:
from sklearn.utils import shuffle
import time

classifiers = {
    #"MNB": MultinomialNB(),
    "RFC": RandomForestClassifier(max_depth = 50),
    "XGB": XGBClassifier(),
    "LR": LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, C = 0.1),
    "SVMLINEAR":  SVC(kernel='linear', C=1.0)
}


Training using Doc2vecs. Note that MNB doesn't work here as there are negative and continuous values


In [None]:
for d2v in d2vtrain:
    
    # Include the meta features with the doc2vec
    dataframe = pd.concat([d2v.reset_index(), train_meta.reset_index()], axis = 1)
    # Above concat method causes duplicating index column
    dataframe = dataframe.loc[:,~dataframe.columns.duplicated()]
    dataframe = dataframe.drop(columns = ['index'])
    
    train = shuffle(dataframe, random_state = 0)
    folds = np.array_split(train, 5)
    
    for classifier in list(classifiers.keys()):
        print(f"TESTING ON {d2v.name}: ")
        clf = classifiers[classifier]

        test_scores = []

        for fold_n in range(5):

            train_folds = [folds[i] for i in range(len(folds)) if i != fold_n]
            train_fold = pd.concat(train_folds, axis = 0)
            test_fold = folds[fold_n]

            # split and train the model. Include all columns other than rating in training
            X_train, y_train = train_fold[train_fold.columns.difference(["rating"])], np.array(train_fold["rating"])
            X_test, y_test = test_fold[train_fold.columns.difference(["rating"])], np.array(test_fold["rating"])

            # Record some stats
            before = time.time()
            clf.fit(X_train, y_train)
            after = time.time()
            train_time = after - before

            train_score = clf.score(X_train, y_train)
            test_score = clf.score(X_test, y_test)

            test_scores.append(test_score)

            print(
                '''
                {} took {:.2f} seconds to train
                and has training accuracy {:.3f} and testing accuracy {:.3f}
                - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
                '''.format(classifier, train_time, train_score, test_score)
            )

        mean_test_score = np.mean(test_scores)

        print("the mean test score for {} was {:.3f}".format(classifier, mean_test_score))

TESTING ON d2v_50: 

                RFC took 14.20 seconds to train
                and has training accuracy 1.000 and testing accuracy 0.780
                - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
                

                RFC took 14.00 seconds to train
                and has training accuracy 1.000 and testing accuracy 0.774
                - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
                

                RFC took 13.96 seconds to train
                and has training accuracy 1.000 and testing accuracy 0.777
                - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
                

                RFC took 14.57 seconds to train
                and has training accuracy 1.000 and testing accuracy 0.773
                - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
                

                RFC took 15.31 seconds to train
                and has training accuracy 1


                SVMLINEAR took 189.04 seconds to train
                and has training accuracy 0.837 and testing accuracy 0.828
                - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
                

                SVMLINEAR took 185.77 seconds to train
                and has training accuracy 0.836 and testing accuracy 0.830
                - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
                


training using count_vectorised

In [95]:
train = shuffle(train_data[['rating', 'review']], random_state = 0)
folds = np.array_split(train, 5)

In [None]:
print("TESTING ON COUNTVECTORIZER")

for classifier in list(classifiers.keys()):
    
    clf = classifiers[classifier]
    
    test_scores = []
    
    for fold_n in range(5):
        
        train_folds = [folds[i] for i in range(len(folds)) if i != fold_n]
        train_fold = pd.concat(train_folds, axis = 0)
        test_fold = folds[fold_n]

        X_train_txt, y_train = train_fold["review"], np.array(train_fold["rating"])
        X_test_txt, y_test = test_fold["review"], np.array(test_fold["rating"])
        
        vectorser = CountVectorizer(ngram_range=(1, 1))
        vectoriser.fit(X_train_txt)
        X_train = vectoriser.transform(X_train_txt)
        X_test = vectoriser.transform(X_test_txt)
        
        # If its the RFC, optimise it by feature selecting
        if(classifier == "RFC"):
            x2 = SelectKBest(chi2, k=1000)
            X_train = x2.fit_transform(X_train, y_train)
            X_test = x2.transform(X_test)

        before = time.time()
        clf.fit(X_train, y_train)
        after = time.time()
        train_time = after - before

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        
        test_scores.append(test_score)
        
        print(
            '''
            {} took {:.2f} seconds to train
            and has training accuracy {:.3f} and testing accuracy {:.3f}
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            '''.format(classifier, train_time, train_score, test_score)
        )
    
    mean_test_score = np.mean(test_scores)
    
    print("the mean test score for {} was {:.3f}".format(classifier, mean_test_score))

TESTING ON COUNTVECTORIZER

            RFC took 36.19 seconds to train
            and has training accuracy 0.987 and testing accuracy 0.786
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            

            RFC took 36.41 seconds to train
            and has training accuracy 0.987 and testing accuracy 0.784
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            

            RFC took 36.90 seconds to train
            and has training accuracy 0.987 and testing accuracy 0.786
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            

            RFC took 36.54 seconds to train
            and has training accuracy 0.987 and testing accuracy 0.781
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            

            RFC took 37.18 seconds to train
            and has training accuracy 0.987 and testing accuracy 0.780
            - - - - - - - - - - -

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



            LR took 4.41 seconds to train
            and has training accuracy 0.950 and testing accuracy 0.862
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



            LR took 4.43 seconds to train
            and has training accuracy 0.948 and testing accuracy 0.858
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



            LR took 4.01 seconds to train
            and has training accuracy 0.950 and testing accuracy 0.857
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



            LR took 4.46 seconds to train
            and has training accuracy 0.950 and testing accuracy 0.855
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



            LR took 4.94 seconds to train
            and has training accuracy 0.954 and testing accuracy 0.859
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            
the mean test score for LR was 0.858

            SVMLINEAR took 618.65 seconds to train
            and has training accuracy 0.995 and testing accuracy 0.842
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            

            SVMLINEAR took 600.04 seconds to train
            and has training accuracy 0.994 and testing accuracy 0.839
            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
            


# Stacking
seems like the doc2vec vectors aren't as informative as the countVectoriser  
best to use ngrams(1,1) for doc2vecs? (1,2) causes marginal difference in other models, but causes around a 10% accuracy dip for multinomial  
RFC gains around 7-8% accuracy if trained using chi2 features  
of course, chi2 can't be used for d2v inputs as they contain negative values  
MNB also can't be used for d2v inputs

All except MNB:  
first do for d2v_50_train  84~  
stacking ~ 85% for 50  
86.2% for 200

In [85]:
vectoriser = CountVectorizer(ngram_range=(1, 2))

scores = []

classifiers = {
    #"MNB": MultinomialNB(),
    "RFC": RandomForestClassifier(max_depth = 50),
    "ADA": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "LR": LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, C = 0.1),
    "SVMLINEAR":  SVC(kernel='linear', C=1.0)
}

# Include the meta features with the doc2vec
dataframe = pd.concat([d2v_200_train.reset_index(), train_meta.reset_index()], axis = 1)
# Above concat method causes duplicating index column
dataframe = dataframe.loc[:,~dataframe.columns.duplicated()]
dataframe = dataframe.drop(columns = ['index'])
    
train = shuffle(dataframe, random_state = 0)
folds = np.array_split(train, 5)


# loop

for fold_n in range(5):
    
    print(fold_n)
    
    train_folds = [folds[i] for i in range(len(folds)) if i != fold_n]
    train_fold = pd.concat(train_folds, axis = 0)
    test_fold = folds[fold_n]
    
    meta_train, meta_test = train_test_split(dataframe)
    
    # split and train the model. Include all columns other than rating in training
    X_train, y_train = train_fold[train_fold.columns.difference(["rating"])], np.array(train_fold["rating"])
    X_test, y_test = test_fold[train_fold.columns.difference(["rating"])], np.array(test_fold["rating"])

    before = time.time()
    clfs = []
    predictions_list = []
    
    for classifier in list(classifiers.keys()):

        clf = classifiers[classifier]
        clfs.append(clf)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print("{} accuracy {}".format(classifier, score))
        predictions = clf.predict(X_test)
        predictions_list.append(predictions)

    after = time.time()
    print("TIME: ", after - before)
    meta_clf = RandomForestClassifier()
    meta_clf.fit(np.stack(predictions_list).transpose(), y_test)

    # lets test the meta classifier on the meta test data

    reviews = meta_test
    new_predictions = []
    
    # Predict the test set without rating column
    for clf in clfs:
        predictions = clf.predict(meta_test[meta_test.columns.difference(["rating"])])
        new_predictions.append(predictions)

    score = meta_clf.score(np.stack(new_predictions).transpose(), meta_test.rating)
    scores.append(score)
    print("stacking accuracy is {}".format(score))

0
RFC accuracy 0.7278232988956181
ADA accuracy 0.7502671891699323
XGB accuracy 0.8120769504809405
LR accuracy 0.8350552190951194
SVMLINEAR accuracy 0.8339864624153901
TIME:  706.2092406749725
stacking accuracy is 0.8626193529998575
1


KeyboardInterrupt: 

In [15]:
np.mean(scores)

0.8459455607809605

now try for splitVectorised()
same as mike, but small changes to parameters in model. also trains random forest classifier differently to others (feature selection)   
Using meta_clf = Logit:  
all except mnb give 86.5  
all except svm give 86.7, with lemmatokenizer and stop words 85%   
all except mnb and svm gives 86.6    
all except rfc = 86.5  
using metaclf = XBG:  
all except svm sometimes 77 most times 87.2  

In [36]:
from sklearn.dummy import DummyClassifier

In [90]:
vectoriser = CountVectorizer(ngram_range=(1, 2))

scores = []

classifiers = {
    "MNB": MultinomialNB(),
    "RFC": RandomForestClassifier(max_depth = 50),
    "ADA": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "LR": LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, C = 0.1),
    "SVMLINEAR":  SVC(kernel='linear', C=0.1)
}

# loop

train = shuffle(train_data[['rating', 'review']], random_state = 0)
folds = np.array_split(train, 5)

for fold_n in range(5):
    
    print(fold_n)
    
    meta_train, meta_test = train_test_split(train_data[['rating', 'review']])
    train = shuffle(meta_train)
    
    folds = np.array_split(train, 5)

    train_folds = [folds[i] for i in range(len(folds)) if i != fold_n]
    train_fold = pd.concat(train_folds, axis = 0)
    test_fold = folds[fold_n]

    X_train_txt, y_train = train_fold["review"], np.array(train_fold["rating"])
    X_test_txt, y_test = test_fold["review"], np.array(test_fold["rating"])

    vectoriser.fit(X_train_txt)
    X_train = vectoriser.transform(X_train_txt)
    X_test = vectoriser.transform(X_test_txt)

    before = time.time()
    
    clfs = []
    
    predictions_list = []
    
    for classifier in list(classifiers.keys()):

        clf = classifiers[classifier]
        clfs.append(clf)
        
        # could so some feature selection here potentially, or train separate models differently
        if("RFC" in classifiers and classifier == "RFC"):
            # Optimise RFC with feature selection. Train the model using these k features
            x2 = SelectKBest(chi2, k=1000)
            X_train_rfc = x2.fit_transform(X_train, y_train)
            X_test_rfc = x2.transform(X_test)
            
            clf.fit(X_train_rfc, y_train)
            
            score = clf.score(X_test_rfc, y_test)
            print("{} accuracy {}".format(classifier, score))
            
            predictions = clf.predict(X_test_rfc)
            
        else:
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            print("{} accuracy {}".format(classifier, score))
            predictions = clf.predict(X_test)
            
        predictions_list.append(predictions)

    after = time.time()
    print("TIME", after - before)
    
    #meta_clf = RandomForestClassifier()
    #meta_clf = DummyClassifier(strategy="most_frequent")
    #meta_clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, C = 0.1)
    #meta_clf = SVC(kernel='linear', C=1.0)
    meta_clf = XGBClassifier()
    
    meta_clf.fit(np.stack(predictions_list).transpose(), y_test)

    # lets test the meta classifier on the meta test data

    reviews = vectoriser.transform(meta_test.review)
    new_predictions = []
    
    for clf in clfs:
        if("RFC" in classifiers and clf == classifiers["RFC"]):
            # Select best k features in meta_test
            x2 = SelectKBest(chi2, k=1000)
            reviews_rfc = x2.fit_transform(reviews, meta_test.rating)
            predictions = clf.predict(reviews_rfc)
        else:
            predictions = clf.predict(reviews)
            
        new_predictions.append(predictions)

    score = meta_clf.score(np.stack(new_predictions).transpose(), meta_test.rating)
    scores.append(score)
    print("stacking accuracy is {}".format(score))

0
MNB accuracy 0.7537402042270245
RFC accuracy 0.7964853953930183
ADA accuracy 0.7891237235810972
XGB accuracy 0.8537164568985989
LR accuracy 0.8727143196390406
SVMLINEAR accuracy 0.865590121111375
TIME 788.207319021225
stacking accuracy is 0.8540686903235001
1


KeyboardInterrupt: 

# try some kaggle

In [68]:
# Just normal Logistic regression model
# using count vectoriser
print('\033[1m' + f"Trained with vectorised features" '\033[0m')
X_train, X_vali, Y_train, Y_vali, vectoriser = splitVectorised((1,2))
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
evaluate(Y_pred, Y_vali)

[1mTrained with vectorised features[0m
The accuracy of the predictions is: 0.87068

Confusion Matrix of Correctly Labeled Classes %'s

Predicted         1         3         5
Truths                                 
1          0.669528  0.053613  0.003885
3          0.193133  0.692308  0.041699
5          0.137339  0.254079  0.954416





In [69]:
X_test = vectoriser.transform(test_text.review)
Y_pred = clf.predict(X_test)
Y_pred

array([5, 5, 1, ..., 3, 5, 5], dtype=int64)

In [73]:
predictionsdf = pd.DataFrame(list(enumerate(Y_pred, start = 1)))
predictionsdf = predictionsdf.rename(columns = {0: 'instance_id', 1: 'rating'})
predictionsdf.to_csv("preds1.csv", index = False)
predictionsdf

Unnamed: 0,instance_id,rating
0,1,5
1,2,5
2,3,1
3,4,3
4,5,5
5,6,5
6,7,5
7,8,5
8,9,5
9,10,5


In [80]:
# Try the stacking model now

vectoriser = CountVectorizer(ngram_range=(1, 2))

scores = []

classifiers = {
    #"MNB": MultinomialNB(),
    #"RFC": RandomForestClassifier(max_depth = 50),
    #"ADA": AdaBoostClassifier(),
    #"XGB": XGBClassifier(),
    "LR": LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, C = 0.1)
    #"SVMLINEAR":  SVC(kernel='linear', C=1.0)
}

# loop


meta_train, meta_test = train_test_split(train_data[['rating', 'review']])
train = shuffle(meta_train)
    
vectoriser.fit(X_train_txt)
X_train = vectoriser.transform(meta_train.review)
Y_train = meta_train.rating

X_test = vectoriser.transform(meta_test.review)
y_test = meta_test.rating
    
clfs = []

predictions_list = []
    
for classifier in list(classifiers.keys()):

    clf = classifiers[classifier]
    clfs.append(clf)
        
    # could so some feature selection here potentially, or train separate models differently
    if("RFC" in classifiers and classifier == "RFC"):
        # Optimise RFC with feature selection. Train the model using these k features
        x2 = SelectKBest(chi2, k=1000)
        X_train_rfc = x2.fit_transform(X_train, y_train)
        X_test_rfc = x2.transform(X_test)
            
        clf.fit(X_train_rfc, Y_train)
            
        score = clf.score(X_test_rfc, y_test)
        print("{} accuracy {}".format(classifier, score))
            
        predictions = clf.predict(X_test_rfc)
            
    else:
        clf.fit(X_train, Y_train)
        score = clf.score(X_test, y_test)
        print("{} accuracy {}".format(classifier, score))
        predictions = clf.predict(X_test)
            
    predictions_list.append(predictions)

#meta_clf = RandomForestClassifier()
#meta_clf = DummyClassifier(strategy="most_frequent")
meta_clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000, C = 0.1)
#meta_cld = SVC(kernel='linear', C=1.0)
    
meta_clf.fit(np.stack(predictions_list).transpose(), y_test)


LR accuracy 0.8818583440216616


LogisticRegression(C=0.1, max_iter=1000, multi_class='multinomial')

In [82]:
X_test = vectoriser.transform(test_text.review)

predictions_list = []

for classifier in list(classifiers.keys()):

    clf = classifiers[classifier]

    predictions = clf.predict(X_test)

    predictions_list.append(predictions)
    
predictions = meta_clf.predict(np.stack(predictions_list).transpose())

predictions



array([5, 5, 1, ..., 5, 5, 5], dtype=int64)

In [83]:
predictionsdf = pd.DataFrame(list(enumerate(Y_pred, start = 1)))
predictionsdf = predictionsdf.rename(columns = {0: 'instance_id', 1: 'rating'})
predictionsdf.to_csv("preds2.csv", index = False)
predictionsdf

Unnamed: 0,instance_id,rating
0,1,5
1,2,5
2,3,1
3,4,3
4,5,5
5,6,5
6,7,5
7,8,5
8,9,5
9,10,5
