# Reading in data and imports

In [1]:
# Need these
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from scipy.sparse import hstack


#Natural language toolkit. Download if not installed already
import nltk
from nltk import WordNetLemmatizer
#nltk.download('stopwords')
#nltk.download('wordnet')

# For splitting by punctuation and using regex
import re
import string

# Useful
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Potentially used Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import LinearSVC, SVC

# Evaluation and feature selection tools
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import chi2, mutual_info_classif

In [2]:
# Test-set frames (not used yet). They are read with the files' own header row, like the
# training files; forcing names=["label", "body_text"] turned the header into a data row.
test_meta = pd.read_csv("review_meta_test.csv", sep=",")
# NOTE(review): this previously re-read review_meta_train.csv. The file name below assumes the
# test reviews mirror review_text_train.csv - confirm it before relying on test_text.
test_text = pd.read_csv("review_text_test.csv", sep=",")

# Training meta features and raw review text
train_meta = pd.read_csv("review_meta_train.csv", sep=",")
train_text = pd.read_csv("review_text_train.csv", sep=",")

# Ids are not predictive. Consider adding in the date?
train_meta = train_meta.drop(["date", "review_id", "reviewer_id", "business_id"], axis = 1)

# Combine the meta features and text, then move the class column ("rating") to the end
train_data = pd.concat([train_meta, train_text], axis=1)
class_col = train_data.pop("rating")
train_data["rating"] = class_col
train_data.head()

Unnamed: 0,vote_funny,vote_cool,vote_useful,review,rating
0,0,1,3,dear longman & eagle.......you've left me no c...,1
1,0,0,0,Delish. The hubby and I wanted to do brunch on...,5
2,1,0,1,"yep, I've giving Yolk 5 stars. It's just reall...",5
3,17,3,3,"Meat, meat, meat. It's meat-tastic. So much me...",3
4,0,0,0,I caught up with the law school girls on a Sat...,3


# Preprocessing the review text. Extension after building models on vectors provided
We want to remove punctuation and numbers, tokenise every review's words, lemmatise the words, then remove all the words in the stopwords list.


In [3]:
# This code will take extremely long to execute. Might be useful later if we're generating our own features.
# Make sure to run once and export to csv
# Just use the vectors provided for now

# Characters stripped from every review by clean_review (Python's string.punctuation set).
punct = string.punctuation
# NLTK's English stop-word list; needs the 'stopwords' corpus (see the commented-out
# nltk.download lines in the import cell).
stopwords = nltk.corpus.stopwords.words("english")
# Single lemmatizer instance reused by clean_review for every token.
wordnet = WordNetLemmatizer()

# Preprocesses each review in the dataframe
def clean_review(review):
    """Turn one raw review string into a list of cleaned, lemmatised tokens.

    Steps: drop punctuation and digits, lower-case, tokenise on word characters,
    remove English stop words, lemmatise what is left.

    The text is lower-cased *before* stop-word filtering and lemmatisation. The
    previous version lower-cased last, so capitalised stop words ("The", "And")
    were never removed and capitalised tokens were passed to the lemmatizer as-is.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list of str
        Lower-case lemmatised tokens with stop words removed.
    """
    # Remove punctuation and numbers
    no_punct = ''.join(char for char in review if char not in punct and not char.isdigit())

    # Tokenize the lower-cased text into words
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    words = tokenizer.tokenize(no_punct.lower())

    # Set membership is O(1); the stop-word list would be scanned for every token otherwise.
    # NOTE(review): punctuation is stripped first, so contraction stop words such as
    # "don't" arrive here as "dont" and are not filtered - decide whether that matters.
    stopword_set = set(stopwords)

    return [wordnet.lemmatize(word) for word in words if word not in stopword_set]

In [None]:
# Clean reviews in the dataframe. Only rows 0-1 are overwritten (.loc[:1] is label-inclusive),
# so only those rows are cleaned here; the previous version ran clean_review over every
# review (~10 mins) and then discarded all but the first two results.
# To clean everything, drop the ":1" slice on both sides.
rows_to_clean = train_data.loc[:1, "review"]
train_data.loc[:1, "review"] = rows_to_clean.apply(clean_review)
train_data.head()

# Using already preprocessed vectors for the reviews

In [None]:
# OR we can just use the preprocessed text data

# countvec
import pickle

# Load the fitted CountVectorizer. The context manager closes the file handle, which the
# bare pickle.load(open(...)) call left open.
# NOTE: unpickling runs arbitrary code - only load vectoriser files from a trusted source.
with open("review_text_features_countvec/train_countvectorizer.pkl", "rb") as vocab_file:
    vocab = pickle.load(vocab_file)
vocab_dict = vocab.vocabulary_
sparse_matrix_train = scipy.sparse.load_npz("review_text_features_countvec/review_text_train_vec.npz")
sparse_matrix_test = scipy.sparse.load_npz("review_text_features_countvec/review_text_test_vec.npz")

# doc2vec 50, 100, 200 features vector for training (files have no header row)
d2v_50_train = pd.read_csv(r"review_text_features_doc2vec50/review_text_train_doc2vec50.csv", index_col = False, delimiter = ",", header=None)
d2v_100_train = pd.read_csv(r"review_text_features_doc2vec100/review_text_train_doc2vec100.csv", index_col = False, delimiter = ",", header=None)
d2v_200_train = pd.read_csv(r"review_text_features_doc2vec200/review_text_train_doc2vec200.csv", index_col = False, delimiter = ",", header=None)

# doc2vec 50, 100, 200 features vector for testing
d2v_50_test = pd.read_csv(r"review_text_features_doc2vec50/review_text_test_doc2vec50.csv", index_col = False, delimiter = ",", header=None)
d2v_100_test = pd.read_csv(r"review_text_features_doc2vec100/review_text_test_doc2vec100.csv", index_col = False, delimiter = ",", header=None)
d2v_200_test = pd.read_csv(r"review_text_features_doc2vec200/review_text_test_doc2vec200.csv", index_col = False, delimiter = ",", header=None)

# Split data and form train and validation sets


In [None]:
# Here, choose which predefined and given vector to use to represent the text data.
# Text here is already processed

# Function to combine meta features (optional?)  
def preprocess(type, randomstate = 12587):
    """Build a 70/30 train/validation split for one of the provided text representations.

    Parameters
    ----------
    type : str
        "50", "100" or "200" selects the matching doc2vec training frame, joined
        column-wise with the columns of ``train_meta``; "count" selects the sparse
        count-vector matrix on its own (no meta features).
    randomstate : int, optional
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    X_train, X_vali, Y_train, Y_vali
        Feature and label splits, in the order returned by ``train_test_split``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the four supported names (previously this
        surfaced as a NameError on ``train``).
    """
    # Count vectors: split the sparse matrix directly against the ratings
    if type == "count":
        X_train, X_vali, Y_train, Y_vali = train_test_split(sparse_matrix_train,
                                                            train_data["rating"], test_size=0.30, random_state=randomstate)
        return X_train, X_vali, Y_train, Y_vali

    if type == "50":
        features = d2v_50_train
    elif type == "100":
        features = d2v_100_train
    elif type == "200":
        features = d2v_200_train
    else:
        raise ValueError(f"Unknown feature set {type!r}; expected '50', '100', '200' or 'count'")

    # drop=True discards the old index instead of inserting it as an "index" column.
    # The previous reset_index() kept one copy of that column, so the row number was
    # fed to every model as a feature.
    train = pd.concat([features.reset_index(drop=True), train_meta.reset_index(drop=True)], axis = 1)

    # doc2vec columns are labelled with ints and meta columns with strings; use string
    # labels throughout so estimators that validate feature names accept the frame.
    train.columns = train.columns.astype(str)

    # Select the label by name rather than assuming it is the last column.
    X_train, X_vali, Y_train, Y_vali = train_test_split(train.drop(columns="rating"),
                                                        train["rating"], test_size=0.30, random_state=randomstate)
    return X_train, X_vali, Y_train, Y_vali



# Fit and train models and Crudely evaluate

In [None]:
# Crude accuracy score and confusion matrix of predictions with % correct. Matrix is "Predicted label x for an instance
# with Class label y"

def evaluate(truthlist, predictions):
    """Print the accuracy and a column-normalised confusion matrix.

    Parameters
    ----------
    truthlist : array-like
        Ground-truth labels.
    predictions : array-like
        Predicted labels, in the same order as ``truthlist``.

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Work on plain positional arrays. A pandas Series with a shuffled index (e.g. a
    # train_test_split label split) would otherwise be looked up by label, not position.
    truths = np.asarray(truthlist).ravel()
    preds = np.asarray(predictions).ravel()

    if truths.shape[0] != preds.shape[0]:
        raise ValueError("truthlist and predictions must have the same length "
                         f"(got {truths.shape[0]} and {preds.shape[0]})")
    if truths.shape[0] == 0:
        raise ValueError("cannot evaluate an empty set of predictions")

    # Crude accuracy score: fraction of positions where the labels agree
    accuracy = float(np.mean(truths == preds))
    print("The accuracy of the predictions is: {:.5f}\n".format(accuracy))

    # Confusion matrix: rows are true labels, columns are predicted labels
    truthSeries = pd.Series(truths, name = "Truths")
    predictionSeries = pd.Series(preds, name = "Predictions")
    confusionDf = pd.crosstab(truthSeries, predictionSeries, rownames=["Truths"], colnames=["Predicted"], margins=False)

    # Normalise each column by its total, so a column shows how the instances given
    # that predicted label are spread over the true labels
    confusionDfNormalised = confusionDf / confusionDf.sum(axis=0)
    print("Confusion Matrix of Correctly Labeled Classes %'s\n")
    print(confusionDfNormalised)
    print("\n\n")

    return

import csv

# Also need a function to export the predictions to a CSV
def export(instanceid, predictions):
    """Write predictions to ``output.csv`` in the working directory.

    The file has the header ``Instance_id,rating`` followed by one row per
    (id, prediction) pair; pairing stops at the shorter of the two inputs.

    Parameters
    ----------
    instanceid : iterable
        Instance identifiers.
    predictions : iterable
        Predicted rating for each instance, in the same order.
    """
    # The with-block closes the file even if writing fails. Every row goes through the
    # csv writer so values are escaped and the header and data rows share one line ending
    # (the header used "\r\n" and the hand-written rows "\n" before).
    with open("output.csv", "w", newline='') as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["Instance_id", "rating"])
        writer.writerows(zip(instanceid, predictions))

In [None]:
# Presumably the rating classes as strings. NOTE(review): not referenced by any cell visible here.
category = ["1", "3", "5"]
# Feature-set names accepted by preprocess(). "count" IS in this list, so any loop over it
# also trains on the count vectors unless the loop skips it explicitly.
datasets = ["50", "100", "200", "count"]

# Bernoulli Naive Bayes
Bernoulli NB is designed for binary/discrete features, so it is trained on the count vectors here. Possibly discretise the doc2vec vectors?

In [None]:
print('\033[1m' + f"Trained with count vectoriser" '\033[0m')
X_train, X_vali, Y_train, Y_vali = preprocess("count")
clf = BernoulliNB().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
# evaluate() takes the ground truth first, then the predictions; the arguments were swapped,
# which mislabelled the confusion-matrix axes.
evaluate(Y_vali.values, Y_pred)

# Gaussian Naive bayes
Probably use the doc2vec continuous vectors for this

In [None]:
for i in datasets:
    if i == "count":
        print("Haven't used count vector for GNB")
        continue

    print('\033[1m' + f"Trained with doc2vec_{i} features" '\033[0m')
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = GaussianNB().fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)

# Multinomial Naive bayes
As with Bernoulli NB, use the count vectors, or discretise the continuous doc2vec data.

In [None]:
print('\033[1m' + f"Trained with count vectoriser" '\033[0m')
X_train, X_vali, Y_train, Y_vali = preprocess("count")
clf = MultinomialNB().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
# evaluate() takes the ground truth first, then the predictions; the arguments were swapped,
# which mislabelled the confusion-matrix axes.
evaluate(Y_vali.values, Y_pred)


# Logistic Regression
Works and runs for all of the vectors provided.

In [None]:
for i in datasets:
    print('\033[1m' + f"Trained with doc2vec_{i} features" '\033[0m')
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)

# Linear SVC

In [None]:
# Each SVM Takes around 15-20 minutes to run...
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = LinearSVC(max_iter=5000).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)

# SVC with Linear kernel

In [None]:
# Regularisation strength; kept as a module-level name because later cells read it.
C = 1.0

for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = SVC(kernel='linear', C=C).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)

# SVC with RBF Kernel

In [None]:
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    # C is written out (same value as the C = 1.0 set in the linear-kernel cell) so this cell
    # runs even when that slow cell is skipped.
    clf = SVC(kernel='rbf', gamma=0.7, C=1.0).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)

# SVC with poly kernel

In [None]:
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = SVC(kernel='poly', degree=3, gamma='auto', C=1.0).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)

# Random forest Classifier

In [None]:
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = RandomForestClassifier(n_estimators=100).fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)


# Decision Tree Classifier

In [None]:
for i in datasets:
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = DecisionTreeClassifier(max_depth=None, criterion="entropy").fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)

# Logistic regression using feature selection via chi^2, MI
Marginally increases performance, possibly because logistic regression already gives the most informative features the largest weights.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

X_train, X_vali, Y_train, Y_vali = preprocess("count")

# Keep the 2000 count features with the highest chi^2 score. The selector is fitted on the
# training split only and then applied unchanged to the validation split.
x2 = SelectKBest(chi2, k=2000)
X_train_x2 = x2.fit_transform(X_train, Y_train)
X_vali_x2 = x2.transform(X_vali)

print('\033[1m' + f"Trained with COUNT features" '\033[0m')
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train_x2, Y_train)
Y_pred = clf.predict(X_vali_x2)
# evaluate() takes the ground truth first, then the predictions (they were swapped).
evaluate(Y_vali.values, Y_pred)

In [None]:
X_train, X_vali, Y_train, Y_vali = preprocess("200")

# Keep the 80 features with the highest mutual information with the rating
mi = SelectKBest(score_func=mutual_info_classif, k=80)
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)

# This cell trains on the doc2vec_200 features; the banner previously said "COUNT".
print('\033[1m' + f"Trained with doc2vec_200 features" '\033[0m')
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
# evaluate() takes the ground truth first, then the predictions (they were swapped).
evaluate(Y_vali.values, Y_pred)

# Random forest with feature selection, via chi^2, MI
Increases performance by 2-4% with some k settings on the doc2vecs and count

In [None]:

print(f"Training using doc2vec_count data set")
X_train, X_vali, Y_train, Y_vali = preprocess("count")

# Keep the 500 count features with the highest chi^2 score (fitted on the training split only)
x2 = SelectKBest(chi2, k=500)
X_train_x2 = x2.fit_transform(X_train, Y_train)
X_vali_x2 = x2.transform(X_vali)

clf = RandomForestClassifier(n_estimators=100).fit(X_train_x2, Y_train)
Y_pred = clf.predict(X_vali_x2)
# evaluate() takes the ground truth first, then the predictions (they were swapped).
evaluate(Y_vali.values, Y_pred)

In [None]:
print(f"Training using doc2vec_50 data set")
X_train, X_vali, Y_train, Y_vali = preprocess("50")

# Keep the 25 features with the highest mutual information with the rating
mi = SelectKBest(score_func=mutual_info_classif, k=25)
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)

clf = RandomForestClassifier(n_estimators=100).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
# evaluate() takes the ground truth first, then the predictions (they were swapped).
evaluate(Y_vali.values, Y_pred)

# Using SVMs with a reduced dataset, using feature selection chi2, MI. 
Lets us run SVM classifiers in reasonable time by using a reduced feature set


In [None]:
# Can reduce the time?
print(f"Training using doc2vec_100 data set")
X_train, X_vali, Y_train, Y_vali = preprocess("100")

# Keep the 40 features with the highest mutual information with the rating
mi = SelectKBest(score_func=mutual_info_classif, k=40)
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)

clf = LinearSVC(max_iter=5000).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
# evaluate() takes the ground truth first, then the predictions (they were swapped).
evaluate(Y_vali.values, Y_pred)

In [None]:
print(f"Training using doc2vec_200 data set")
X_train, X_vali, Y_train, Y_vali = preprocess("200")

# Keep the 50 features with the highest mutual information with the rating
mi = SelectKBest(score_func=mutual_info_classif, k=50)
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)

clf = SVC(kernel='poly', degree=3, gamma='auto', C=1.0).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
# evaluate() takes the ground truth first, then the predictions (they were swapped).
evaluate(Y_vali.values, Y_pred)

In [None]:
print(f"Training using doc2vec_200 data set")
X_train, X_vali, Y_train, Y_vali = preprocess("200")

# Keep the 50 features with the highest mutual information with the rating
mi = SelectKBest(score_func=mutual_info_classif, k=50)
mi.fit(X_train, Y_train)
X_train_mi = mi.transform(X_train)
X_test_mi = mi.transform(X_vali)

# `degree` only applies to the polynomial kernel, so it is not passed for RBF.
clf = SVC(kernel='rbf', gamma='auto', C=1.0).fit(X_train_mi, Y_train)
Y_pred = clf.predict(X_test_mi)
# evaluate() takes the ground truth first, then the predictions (they were swapped).
evaluate(Y_vali.values, Y_pred)

# Try some boosting models now

# XGBOOST

In [None]:
import sys
!{sys.executable} -m pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:

# XGBoost on each doc2vec feature set; the count vectors are handled in the next cell.
for i in datasets:
    if i == "count":
        continue

    print(f"Training using doc2vec_{i} with XGBoost")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)
    clf = XGBClassifier().fit(X_train, Y_train)
    Y_pred = clf.predict(X_vali)
    # evaluate() takes the ground truth first, then the predictions (they were swapped).
    evaluate(Y_vali.values, Y_pred)

    


In [None]:
print(f"Training using doc2vec_count with XGBoost")
X_train, X_vali, Y_train, Y_vali = preprocess("count")
clf = XGBClassifier().fit(X_train, Y_train)
Y_pred = clf.predict(X_vali)
# evaluate() takes the ground truth first, then the predictions; the arguments were swapped,
# which mislabelled the confusion-matrix axes.
evaluate(Y_vali.values, Y_pred)

# Try stacking?

In [None]:
from sklearn.metrics import accuracy_score

np.random.seed(1)

class StackingClassifier():
    """Minimal stacking ensemble.

    Every base classifier is fitted on the training data; their per-sample outputs
    are concatenated column-wise and used as the features of the meta-classifier.

    NOTE(review): the meta-classifier is trained on the base models' outputs for the
    same rows those models were fitted on, so its inputs are optimistic. Out-of-fold
    predictions would avoid this - left unchanged here.

    Parameters
    ----------
    classifiers : list
        Base estimators exposing ``fit`` and at least one of ``predict_proba``,
        ``decision_function`` or ``predict``.
    metaclassifier : estimator
        Estimator exposing ``fit`` and ``predict``, trained on the stacked outputs.
    """

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        """Fit all base classifiers, then the meta-classifier; returns ``self``."""
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
        return self

    @staticmethod
    def _base_output(clf, X):
        """Return one base model's per-sample output as a 2-D array."""
        # Prefer probabilities, then decision scores, then hard labels. Previously only
        # predict_proba was tried, so e.g. an SVC built without probability=True crashed.
        for method_name in ("predict_proba", "decision_function", "predict"):
            method = getattr(clf, method_name, None)
            if method is not None:
                output = np.asarray(method(X))
                break
        else:
            raise AttributeError(
                f"{clf.__class__.__name__} has none of predict_proba, decision_function or predict")

        # Binary decision scores and hard labels come back 1-D; make them a single column
        if output.ndim == 1:
            output = output.reshape(-1, 1)
        return output

    def _predict_base(self, X):
        """Stack the outputs of every base classifier side by side."""
        yhats = [self._base_output(clf, X) for clf in self.classifiers]
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats

    def predict(self, X):
        """Predict labels with the meta-classifier from the stacked base outputs."""
        X_meta = self._predict_base(X)
        yhat = self.metaclassifier.predict(X_meta)
        return yhat

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


# Base learners for the stack, each paired with the label used when its accuracy is reported.
# XGBoost and a linear-kernel SVC (probability=True) were also tried here and are disabled.
_base_models = [
    ('Logistic Regression', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)),
    ('Multinomial NB', MultinomialNB()),
    ('Forest', RandomForestClassifier(n_estimators=100)),
]
classifiers = [model for _, model in _base_models]
titles = [label for label, _ in _base_models]

# The meta-classifier is trained on the base learners' stacked outputs
meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)

In [None]:
X_train, X_vali, Y_train, Y_vali = preprocess("100")

# NOTE(review): the base models defined above include MultinomialNB, which rejects negative
# feature values; doc2vec features are presumably signed - confirm this combination runs.

# stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
# separately first trained each model twice.
stacker.fit(X_train, Y_train)

for title, clf in zip(titles, classifiers):
    print(title, "Accuracy:", clf.score(X_vali, Y_vali))

print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

# Random stacking sets

In [None]:
# probability=True makes each SVC expose predict_proba, which the stacker uses to build its
# meta-features; the RBF and poly SVCs were missing it.
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                MultinomialNB(),
                XGBClassifier(),
                SVC(kernel='linear', C=1.0, probability=True),
                SVC(kernel='rbf', gamma=0.7, C=1.0, probability=True),
                SVC(kernel='poly', degree=3, gamma='auto', C=1.0, probability=True),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'Multinomial NB',
          'XGBoost',
          'Linear SVC kernel',
          'RBF SVC kernel',
          'Poly SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

# stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
# separately first trained each model twice.
stacker.fit(X_train, Y_train)

for title, clf in zip(titles, classifiers):
    print(title, "Accuracy:", clf.score(X_vali, Y_vali))

print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

In [None]:
# Stack: logistic regression, multinomial NB, XGBoost, poly-kernel SVC, random forest
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                MultinomialNB(),
                XGBClassifier(),
                SVC(kernel='poly', degree=3, gamma='auto', C=1.0, probability=True),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'Multinomial NB',
          'XGBoost',
          'Poly SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

# stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
# separately first trained each model twice.
stacker.fit(X_train, Y_train)

for title, clf in zip(titles, classifiers):
    print(title, "Accuracy:", clf.score(X_vali, Y_vali))

print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

In [None]:
# Stack: logistic regression, multinomial NB, XGBoost
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                MultinomialNB(),
                XGBClassifier()]

titles = ['Logistic Regression',
          'Multinomial NB',
          'XGBoost']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

# stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
# separately first trained each model twice.
stacker.fit(X_train, Y_train)

for title, clf in zip(titles, classifiers):
    print(title, "Accuracy:", clf.score(X_vali, Y_vali))

print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

In [None]:
# Stack: logistic regression, XGBoost, RBF-kernel SVC, random forest
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                XGBClassifier(),
                SVC(kernel='rbf', gamma=0.7, C=1.0, probability=True),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'XGBoost',
          'RBF SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

# stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
# separately first trained each model twice.
stacker.fit(X_train, Y_train)

for title, clf in zip(titles, classifiers):
    print(title, "Accuracy:", clf.score(X_vali, Y_vali))

print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

In [None]:
# Stack: logistic regression, multinomial NB, XGBoost, linear-kernel SVC, random forest
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                MultinomialNB(),
                XGBClassifier(),
                SVC(kernel='linear', C=1.0, probability=True),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'Multinomial NB',
          'XGBoost',
          'Linear SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

# stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
# separately first trained each model twice.
stacker.fit(X_train, Y_train)

for title, clf in zip(titles, classifiers):
    print(title, "Accuracy:", clf.score(X_vali, Y_vali))

print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

In [None]:
# Stack: logistic regression, XGBoost, poly-kernel SVC
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                XGBClassifier(),
                SVC(kernel='poly', degree=3, gamma='auto', C=1.0, probability=True)]

titles = ['Logistic Regression',
          'XGBoost',
          'Poly SVC kernel']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

# stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
# separately first trained each model twice.
stacker.fit(X_train, Y_train)

for title, clf in zip(titles, classifiers):
    print(title, "Accuracy:", clf.score(X_vali, Y_vali))

print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

In [None]:
# Stack: logistic regression, multinomial NB, RBF-kernel SVC, random forest.
# The SVC now sets probability=True so it exposes predict_proba, which the stacker uses to
# build its meta-features, and spells out C=1.0 instead of reading `C` from an earlier cell.
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                MultinomialNB(),
                SVC(kernel='rbf', gamma=0.7, C=1.0, probability=True),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'Multinomial NB',
          'RBF SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)


X_train, X_vali, Y_train, Y_vali = preprocess("count")

# stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
# separately first trained each model twice.
stacker.fit(X_train, Y_train)

for title, clf in zip(titles, classifiers):
    print(title, "Accuracy:", clf.score(X_vali, Y_vali))

print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))

In [None]:
# Stack: logistic regression, XGBoost, poly-kernel SVC, random forest, run on every feature set.
# The SVC now sets probability=True so it exposes predict_proba, which the stacker uses to
# build its meta-features.
classifiers = [LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
                XGBClassifier(),
                SVC(kernel='poly', degree=3, gamma='auto', C=1.0, probability=True),
                RandomForestClassifier(n_estimators=100)]

titles = ['Logistic Regression',
          'XGBoost',
          'Poly SVC kernel',
          'Forest']

meta_classifier = LogisticRegression(solver='lbfgs')
stacker = StackingClassifier(classifiers, meta_classifier)

for i in datasets:
    # Label each block of results; the four runs were indistinguishable in the output before
    print(f"Training using doc2vec_{i} data set")
    X_train, X_vali, Y_train, Y_vali = preprocess(i)

    # stacker.fit() trains every model in `classifiers` (the same objects), so fitting them
    # separately first trained each model twice.
    stacker.fit(X_train, Y_train)

    for title, clf in zip(titles, classifiers):
        print(title, "Accuracy:", clf.score(X_vali, Y_vali))

    print('\nStacker Accuracy:', stacker.score(X_vali, Y_vali))