In [4]:
import json
import string
import pandas as pd
import numpy as np

In [5]:
# Read Summarisation Model results from json file.

with open("../data/Model_Results/summarisation_results.jsonl", 'r') as json_file:
    json_list = list(json_file)

    
summarisationResults = {}
for json_str in json_list:
    
    result = json.loads(json_str)
    summaryList = sorted(result["sentence_scores"], key=lambda x:x[1], reverse=True)[:4]
    summary = ""
    for sentence in summaryList:
        summary += sentence[0] + ' '
    
    summarisationResults[result["id"]] = summary

In [6]:
summarisationData = pd.DataFrame(data=summarisationResults, index=[0])

In [7]:
summarisationDataDict = {}

for json_id in summarisationData.columns:
    
    features = {}
    
    features["summary"] = summarisationData[json_id].values[0]
    
    summarisationDataDict[json_id] = features

In [8]:
summarisationData = pd.DataFrame().from_dict(data=summarisationDataDict)

display(summarisationData)

Unnamed: 0,11972.json,11685.json,11096.json,5209.json,9524.json,5962.json,7070.json,1046.json,12849.json,13270.json,...,11576.json,3461.json,9464.json,10227.json,11707.json,3425.json,2977.json,294.json,3580.json,8384.json
summary,"""Building a wall"" on the border ""will take lit...","Those numbers show that as of October 2015, th...","If he did, he would have known that Senator Mc...","""…She supports taking $500 billion away from M...","Scott Walker helped run a ""criminal scheme"" to...",The campaign accurately quoted a figure whose ...,"So $30-31 million per year would, in fact, be ...",The Obama administration has emphasized many o...,"""It matters who’s leading the country, and it ...","Pence described the donors as major. ""The nati...",...,We’re not sure Sanders made that entirely clea...,"The $20 million designated for Cuba ""focuses o...",There also are no Asian or Pacific Islander Re...,"""The United States is in the longest stretch o...","""Secretary Clinton changes her position on thi...","Under the header ""New jobs created by the Stre...","They said low pay, increased work demands and ...","• Comstock, an adviser and frequent spokeswoma...","""House Republicans under Paul Ryan's leadershi...","""I will work in a bipartisan way to get it don..."


In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [10]:
def addSummary(data, summaries):
    
    for index, row in data.iterrows():
        if row["json_id"] in summaries.columns:
            row["summary"] = summaries[row["json_id"]].values[0]

In [11]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/vassilis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vassilis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def cleanAndStemm(sentence):
    
    stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    sentenceList = []
    for token in sentence.split(' '):
        
        token = token.lower()
        token = token.translate(str.maketrans('', '', string.punctuation))
        
        if token in stop_words:
            continue
        
        token = wordnet_lemmatizer.lemmatize(token)
        
        sentenceList.append(token)
        
    newSentence = ' '.join(sentenceList)
    return newSentence

In [13]:
def TenFoldValidation_BowTfIdf(summarisationData, classifier):
    
    accuracyResults = np.zeros([10])
        
    for counter in range(10):
        x_trainDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/train' + str(counter + 1) + '.tsv', sep='\t')
        x_trainDf.columns = ["json_id", "claim", "justification_label"]

        x_testDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/test' + str(counter + 1) + '.tsv', sep='\t')
        x_testDf.columns = ["json_id", "claim", "justification_label"]
        
        x_trainDf["summary"] = ""
        x_testDf["summary"]  = ""
        
        addSummary(x_trainDf, summarisationData)
        addSummary(x_testDf, summarisationData)
        
        y_trainDf = pd.DataFrame(data=x_trainDf["justification_label"])
        y_testDf  = pd.DataFrame(data=x_testDf["justification_label"])
        
        trainClaimSumm  = []
        yTrainLabels   = y_trainDf['justification_label'].tolist()

        testClaimSumm   = []
        yTestLabels    = y_testDf['justification_label'].tolist()

        for index, row in x_trainDf.iterrows():
            trainClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))

        for index, row in x_testDf.iterrows():
            testClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))

            
        # Convert words to number using Bag of Words (Bow)
        vectorizer = CountVectorizer(max_features=30)
        vectorizedTrain = vectorizer.fit_transform(trainClaimSumm).toarray()
        vectorizedTest  = vectorizer.fit_transform(testClaimSumm).toarray()

        # Convert Bow values according to TfIdf
        tfidfconverter = TfidfTransformer()
        XTrain = tfidfconverter.fit_transform(vectorizedTrain).toarray()
        XTest  = tfidfconverter.fit_transform(vectorizedTest).toarray()

        classifier.fit(XTrain,yTrainLabels)

        yPredLabels = classifier.predict(XTest)
        
        accuracyScore = accuracy_score(yTestLabels,yPredLabels)
        accuracyResults[counter] = accuracyScore

        # ConfusionMatrixDisplay.from_predictions(yTestLabels, yPredLabels, cmap='Greens')
#         print(classification_report(yTestLabels,yPredLabels))
        print("Iteration:",counter+1," Accuracy Score: ",accuracyScore)
    
    return accuracyResults, np.mean(accuracyResults)

In [20]:
def HyperParameterTuningBowTfIdf(summarisationData, classifier):

    bestAccuracyVal  = 0.0
    bestAccuracyTest = 0.0
    bestClassifier = None

    for counter in range(1):
        print("Fold ", counter + 1, " of tuning.")
        x_trainDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/train' + str(counter + 1) + '.tsv', sep='\t')
        x_trainDf.columns = ["json_id", "claim", "justification_label"]

        x_valDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/val' + str(counter + 1) + '.tsv', sep='\t')
        x_valDf.columns = ["json_id", "claim", "justification_label"]
        
        x_trainDf["summary"] = ""
        x_valDf["summary"] = ""
        
        addSummary(x_trainDf, summarisationData)
        addSummary(x_valDf, summarisationData)
        
        y_trainDf = pd.DataFrame(data=x_trainDf["justification_label"])
        y_valDf = pd.DataFrame(data=x_valDf["justification_label"])
        
        trainClaimSumm  = []
        yTrainLabels   = y_trainDf['justification_label'].tolist()

        valClaimSumm  = []
        yValLabels   = y_valDf['justification_label'].tolist()

        for index, row in x_trainDf.iterrows():
            trainClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))

        for index, row in x_valDf.iterrows():
            valClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))

            
        # Convert words to number using Bag of Words (Bow)
        vectorizer = CountVectorizer(max_features=30)
        vectorizedTrain = vectorizer.fit_transform(trainClaimSumm).toarray()
        vectorizedVal = vectorizer.fit_transform(valClaimSumm).toarray()

        # Convert Bow values according to TfIdf
        tfidfconverter = TfidfTransformer()
        XTrain = tfidfconverter.fit_transform(vectorizedTrain).toarray()
        XVal = tfidfconverter.fit_transform(vectorizedVal).toarray()

#         X_train, X_test, y_train, y_test = train_test_split(XTrain, yTrainLabels,stratify=yTrainLabels, test_size=0.15)

        classifier.fit(XTrain,yTrainLabels)

#         yPredTest = classifier.best_estimator_.predict(X_test)

        yPredVal = classifier.best_estimator_.predict(XVal)
        
#         accuracyTest = accuracy_score(y_test,yPredTest)
        accuracyVal  = accuracy_score(yValLabels, yPredVal)

        if accuracyVal >= bestAccuracyVal:
#             bestAccuracyTest = accuracyTest
            bestAccuracyVal = accuracyVal
            bestModel = classifier.best_estimator_

    return bestModel, bestAccuracyTest, bestAccuracyVal

In [15]:
def TenFoldValidation_Word2Vec(summarisationData, classifier):
    
    accuracyResults = np.zeros([10])

    for counter in range(10):
        x_trainDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/train' + str(counter + 1) + '.tsv', sep='\t')
        x_trainDf.columns = ["json_id", "claim", "justification_label"]

        x_testDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/test' + str(counter + 1) + '.tsv', sep='\t')
        x_testDf.columns = ["json_id", "claim", "justification_label"]
        
        x_trainDf["summary"] = ""
        x_testDf["summary"]  = ""
        
        addSummary(x_trainDf, summarisationData)
        addSummary(x_testDf, summarisationData)
        
        y_trainDf = pd.DataFrame(data=x_trainDf["justification_label"])
        y_testDf  = pd.DataFrame(data=x_testDf["justification_label"])
        
        yTrainLabels   = y_trainDf['justification_label'].tolist()
        yTestLabels    = y_testDf['justification_label'].tolist()
        
        # Transform the data into a list of lists for the Word2Vec model
        w2vTrainClaimSum = []
        w2vTestClaimSum  = []

        for index, row in x_trainDf.iterrows():
            w2vTrainClaimSum.append((cleanAndStemm(row['claim']) + cleanAndStemm(row['summary'])).split(" "))

        for index, row in x_testDf.iterrows():
            w2vTestClaimSum.append((cleanAndStemm(row['claim']) + cleanAndStemm(row['summary'])).split(" "))

            
        # Vectorize using Word2Vec

        # min_count = 3 & size = 40 & window=8 & sg=1 

        model = Word2Vec(sentences=w2vTrainClaimSum, min_count=3,size=40,workers=4, window=8, sg=1)

        words = set(model.wv.index2word)
        X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                                 for ls in w2vTrainClaimSum])
        X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                                 for ls in w2vTestClaimSum])

        X_train_vect_avg = []
        for v in X_train_vect:
            if v.size:
                X_train_vect_avg.append(v.mean(axis=0))
            else:
                X_train_vect_avg.append(np.zeros(100, dtype=float))

        X_test_vect_avg = []
        for v in X_test_vect:
            if v.size:
                X_test_vect_avg.append(v.mean(axis=0))
            else:
                X_test_vect_avg.append(np.zeros(100, dtype=float))
        
        classifier.fit(X_train_vect_avg,yTrainLabels)

        yPredLabels = classifier.predict(X_test_vect_avg)

#         ConfusionMatrixDisplay.from_predictions(yTestLabels, yPredLabels, cmap='Greens')
#         print(classification_report(yTestLabels,yPredLabels))

        accuracyScore = accuracy_score(yTestLabels,yPredLabels)
        accuracyResults[counter] = accuracyScore
        
        print("Iteration:",counter+1," Accuracy Score: ",accuracyScore)
        
    return accuracyResults, np.mean(accuracyResults)

In [16]:
def HyperParameterTuningWord2Vec(summarisationData, classifier):

    bestAccuracyVal  = 0.0
    bestAccuracyTest = 0.0
    bestClassifier = None

    for counter in range(10):
        print("Fold ", counter + 1, " of tuning.")
        x_trainDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/train' + str(counter + 1) + '.tsv', sep='\t')
        x_trainDf.columns = ["json_id", "claim", "justification_label"]

        x_valDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/val' + str(counter + 1) + '.tsv', sep='\t')
        x_valDf.columns = ["json_id", "claim", "justification_label"]
        
        x_trainDf["summary"] = ""
        x_valDf["summary"]  = ""
        
        addSummary(x_trainDf, summarisationData)
        addSummary(x_valDf, summarisationData)
        
        y_trainDf = pd.DataFrame(data=x_trainDf["justification_label"])
        y_valDf  = pd.DataFrame(data=x_valDf["justification_label"])
        
        yTrainLabels   = y_trainDf['justification_label'].tolist()
        yValLabels    = y_valDf['justification_label'].tolist()
        
        # Transform the data into a list of lists for the Word2Vec model
        w2vTrainClaimSum = []
        w2vValClaimSum  = []

        for index, row in x_trainDf.iterrows():
            w2vTrainClaimSum.append((cleanAndStemm(row['claim']) + cleanAndStemm(row['summary'])).split(" "))

        for index, row in x_valDf.iterrows():
            w2vValClaimSum.append((cleanAndStemm(row['claim']) + cleanAndStemm(row['summary'])).split(" "))

            
        # Vectorize using Word2Vec

        # min_count = 3 & size = 40 & window=8 & sg=1 

        model = Word2Vec(sentences=w2vTrainClaimSum, min_count=3,size=40,workers=4, window=8, sg=1)

        words = set(model.wv.index2word)
        X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                                 for ls in w2vTrainClaimSum])
        
        
        model = Word2Vec(sentences=w2vValClaimSum, min_count=3,size=40,workers=4, window=8, sg=1)

        words = set(model.wv.index2word)
        X_val_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                                 for ls in w2vValClaimSum])

        X_train_vect_avg = []
        for v in X_train_vect:
            if v.size:
                X_train_vect_avg.append(v.mean(axis=0))
            else:
                X_train_vect_avg.append(np.zeros(100, dtype=float))

        X_val_vect_avg = []
        for v in X_val_vect:
            if v.size:
                X_val_vect_avg.append(v.mean(axis=0))
            else:
                X_val_vect_avg.append(np.zeros(100, dtype=float))

#         X_train, X_test, y_train, y_test = train_test_split(XTrain, yTrainLabels,stratify=yTrainLabels, test_size=0.15)

        classifier.fit(X_train_vect_avg,yTrainLabels)

#         yPredTest = classifier.best_estimator_.predict(X_test)

        yPredVal = classifier.best_estimator_.predict(X_val_vect_avg)
        
#         accuracyTest = accuracy_score(y_test,yPredTest)
        accuracyVal  = accuracy_score(yValLabels, yPredVal)

        if accuracyVal >= bestAccuracyVal:
#             bestAccuracyTest = accuracyTest
            bestAccuracyVal = accuracyVal
            bestModel = classifier.best_estimator_

    return bestModel, bestAccuracyTest, bestAccuracyVal

In [17]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
# from sklearn.dummy import DummyClassifier

# # Dummy Classifier With Method Stratified

# # Bow - Tf-Idf
# print("\n---- Bow - Tf-Idf Results----\n")
# dummy_clf = DummyClassifier(strategy="stratified")
# accuracyPerIteration, meanAccuracy = TenFoldValidation_BowTfIdf(summarisationData, dummy_clf)
# print("Mean Accuracy: ", meanAccuracy)

# # Word 2 Vec
# print("\n---- Word2Vec Results----\n")
# dummy_clf = DummyClassifier(strategy="stratified")
# accuracyPerIteration, meanAccuracy = TenFoldValidation_Word2Vec(summarisationData, dummy_clf)
# print("Mean Accuracy: ",meanAccuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Evaluation
randForestClass = RandomForestClassifier()
parameters = {
              'criterion':('gini', 'entropy'), 
              'max_features':('auto', 'sqrt', 'log2'),
              'n_estimators':[100, 1000, 1500, 2000, 2500, 3000, 4000, 5000]
              # ,'max_depth': range(1,10)
             }
randForestClassifier = GridSearchCV(randForestClass, parameters, n_jobs = -1, cv=5, scoring='accuracy')
# randForestClassifier.fit(XTrainVal, ytrainValLabels)
# print(randForestClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuningBowTfIdf(summarisationData, randForestClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# Random Forest Classification Method

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
# randForestClas = RandomForestClassifier(n_estimators=1000, random_state=0)
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

# from scipy.stats import wilcoxon
# stat, p = wilcoxon(accuracyArrayX, accuracyArrayY)
# print('Statistics=%.3f, p=%.3f' % (stat, p))

Fold  1  of tuning.


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Evaluation
randForestClass = RandomForestClassifier()
parameters = {
              'criterion':('gini', 'entropy'), 
              'max_features':('auto', 'sqrt', 'log2'),
              'n_estimators':[100, 1000, 1500, 2000, 2500, 3000, 4000, 5000]
             }
randForestClassifier = GridSearchCV(randForestClass, parameters, n_jobs = -1, verbose=20, cv=10, scoring='accuracy')
randForestClassifier.fit(X_trainVal_vect, ytrainValLabels)
print(randForestClassifier.best_estimator_)

# Random Forest Classification Method

# Word2Vec
print("\n---- Word2Vec Results----\n")
# randForestClas = RandomForestClassifier(n_estimators=1000, random_state=0)
accuracyArrayY, meanAccuracyY = TenFoldValidation_Word2Vec(summarisationData, randForestClassifier.best_estimator_)
print("Mean Accuracy: ", meanAccuracyY)

# from scipy.stats import wilcoxon
# stat, p = wilcoxon(accuracyArrayX, accuracyArrayY)
# print('Statistics=%.3f, p=%.3f' % (stat, p))

Fitting 10 folds for each of 48 candidates, totalling 480 fits


ValueError: setting an array element with a sequence.

In [26]:
from sklearn.neighbors import KNeighborsClassifier

# # K-Neighbors Evaluation
kNeighborsClas = KNeighborsClassifier()
parameters = {
              'weights':('uniform', 'distance'), 
              'algorithm':('auto', 'ball_tree', 'kd_tree', 'brute'),
              'n_neighbors':range(1,50)
             }
kNClassifier = GridSearchCV(kNeighborsClas, parameters, n_jobs=-1, cv=10, scoring='accuracy')
# kNClassifier.fit(XTrainVal, ytrainValLabels)
# print(kNClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuningBowTfIdf(summarisationData, kNClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)
# K-Neighbors Classifier

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
# kNeighborsClas = KNeighborsClassifier(n_neighbors=4)
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

Fold  1  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.3076923076923077

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.38461538461538464
Iteration: 2  Accuracy Score:  0.38461538461538464
Iteration: 3  Accuracy Score:  0.3076923076923077
Iteration: 4  Accuracy Score:  0.4166666666666667
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.5833333333333334
Iteration: 7  Accuracy Score:  0.25
Iteration: 8  Accuracy Score:  0.4166666666666667
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.4166666666666667
Mean Accuracy:  0.366025641025641


In [92]:
from sklearn.neighbors import KNeighborsClassifier

# K-Neighbors Evaluation
kNeighborsClas = KNeighborsClassifier()
parameters = {
              'weights':('uniform', 'distance'), 
              'algorithm':('auto', 'ball_tree', 'kd_tree', 'brute'),
              'n_neighbors':range(1,150)
             }
kNClassifier = GridSearchCV(kNeighborsClas, parameters, n_jobs=-1, cv=5, scoring='accuracy')
# kNClassifier.fit(X_trainVal_vect_avg, ytrainValLabels)
# print(kNClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, kNClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# K-Neighbors Classifier

print("\n---- Word2Vec Results----\n")
# # Word2Vec
# kNeighborsClas = KNeighborsClassifier(n_neighbors=3)
accuracyArrayY, meanAccuracyY = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.4166666666666667

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.23076923076923078
Iteration: 2  Accuracy Score:  0.07692307692307693
Iteration: 3  Accuracy Score:  0.38461538461538464
Iteration: 4  Accuracy Score:  0.4166666666666667
Iteration: 5  Accuracy Score:  0.4166666666666667
Iteration: 6  Accuracy Score:  0.25
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.25
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.5833333333333334
Mean Accuracy:  0.31923076923076926


In [18]:
from sklearn.svm import SVC

# SVM Classifier Evaluation
svmClass = SVC()
parameters = {
                'kernel':('linear', 'rbf', 'poly', 'sigmoid'),
                'gamma': ('scale', 'auto'),
                'C': [0.01, 0.1, 1, 2, 10, 100]
             }
svmClassifier = GridSearchCV(svmClass, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# svmClassifier.fit(XTrainVal, ytrainValLabels)
# print(svmClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, svmClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# SVM Classifier

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
svmClas = SVC()
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

NameError: name 'HyperParameterTuning' is not defined

In [48]:
from sklearn.svm import LinearSVC

# Linear SVC Classifier Evaluation
linearSVC = LinearSVC()
parameters = {
                'loss': ('hinge', 'squared_hinge'),
                'multi_class': ('ovr', 'crammer_singer'),
                'C': [0.01, 0.1, 1.0, 2.0, 10.0, 100.0],
                'max_iter': [1000, 2000, 5000, 10000]
             }
linearSVCClassifier = GridSearchCV(linearSVC, parameters, n_jobs = -1, cv=5, scoring='accuracy')
# linearSVCClassifier.fit(XTrainVal, ytrainValLabels)
# print(linearSVCClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, linearSVCClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# SVM Classifier

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.5384615384615384

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.5384615384615384
Iteration: 2  Accuracy Score:  0.5384615384615384
Iteration: 3  Accuracy Score:  0.23076923076923078
Iteration: 4  Accuracy Score:  0.5
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.4166666666666667
Iteration: 7  Accuracy Score:  0.25
Iteration: 8  Accuracy Score:  0.5833333333333334
Iteration: 9  Accuracy Score:  0.4166666666666667
Iteration: 10  Accuracy Score:  0.3333333333333333
Mean Accuracy:  0.40576923076923077


In [96]:
from sklearn.svm import SVC

# SVM Classifier Evaluation
svmClass = SVC()
parameters = {
                'kernel':('linear', 'rbf', 'poly', 'sigmoid'),
                'gamma': ('scale', 'auto'),
                'C': [0.01, 0.1, 1, 2, 10, 100]
             }
svmClassifier = GridSearchCV(svmClass, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# svmClassifier.fit(X_trainVal_vect_avg, ytrainValLabels)
# print(svmClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, svmClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# SVM Classifier

print("\n---- Word2Vec Results----\n")
# Word2Vec
svmClas = SVC()
accuracyArrayY, meanAccuracyY = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.5384615384615384

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.46153846153846156
Iteration: 2  Accuracy Score:  0.3076923076923077
Iteration: 3  Accuracy Score:  0.38461538461538464
Iteration: 4  Accuracy Score:  0.4166666666666667
Iteration: 5  Accuracy Score:  0.4166666666666667
Iteration: 6  Accuracy Score:  0.4166666666666667
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.08333333333333333
Iteration: 9  Accuracy Score:  0.16666666666666666
Iteration: 10  Accuracy Score:  0.3333333333333333
Mean Accuracy:  0.3320512820512821


In [97]:
from sklearn.svm import LinearSVC

# Linear SVC Classifier Evaluation
linearSVC = LinearSVC()
parameters = {
                'loss': ('hinge', 'squared_hinge'),
                'multi_class': ('ovr', 'crammer_singer'),
                'C': [0.01, 0.1, 1.0, 2.0, 10.0, 100.0],
                'max_iter': [1000, 2000, 5000, 10000]
             }
linearSVCClassifier = GridSearchCV(linearSVC, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# linearSVCClassifier.fit(X_trainVal_vect_avg, ytrainValLabels)
# print(linearSVCClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, linearSVCClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# SVM Classifier

print("\n---- Word2Vec Results----\n")
# Word2Vec
accuracyArrayY, meanAccuracyY = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.46153846153846156

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.46153846153846156
Iteration: 2  Accuracy Score:  0.38461538461538464
Iteration: 3  Accuracy Score:  0.46153846153846156
Iteration: 4  Accuracy Score:  0.4166666666666667
Iteration: 5  Accuracy Score:  0.4166666666666667
Iteration: 6  Accuracy Score:  0.25
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.25
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.25
Mean Accuracy:  0.3474358974358974


In [51]:
from sklearn.gaussian_process import GaussianProcessClassifier

# Gaussian Process Classifier Evaluation
gaussianProcClas = GaussianProcessClassifier()
parameters = {
                'multi_class': ('one_vs_rest', 'one_vs_one'),
                'max_iter_predict': [100, 200, 300, 400, 500]
             }
gaussianProcClassifier = GridSearchCV(gaussianProcClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# gaussianProcClassifier.fit(XTrainVal, ytrasinValLabels)
# print(gaussianProcClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, gaussianProcClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# Gaussian Process Classifier

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
# gaussianProcClas = GaussianProcessClassifier()
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.46153846153846156

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.46153846153846156
Iteration: 2  Accuracy Score:  0.46153846153846156
Iteration: 3  Accuracy Score:  0.3076923076923077
Iteration: 4  Accuracy Score:  0.4166666666666667
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.5
Iteration: 7  Accuracy Score:  0.16666666666666666
Iteration: 8  Accuracy Score:  0.5833333333333334
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.3333333333333333
Mean Accuracy:  0.37307692307692314


In [98]:
from sklearn.gaussian_process import GaussianProcessClassifier

# Gaussian Process Classifier Evaluation
gaussianProcClas = GaussianProcessClassifier()
parameters = {
                'multi_class': ('one_vs_rest', 'one_vs_one'),
                'max_iter_predict': [100, 200, 300, 400, 500]
             }
gaussianProcClassifier = GridSearchCV(gaussianProcClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# gaussianProcClassifier.fit(X_trainVal_vect_avg, ytrainValLabels)
# print(gaussianProcClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, gaussianProcClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# Gaussian Process Classifier

print("\n---- Word2Vec Results----\n")
# Word2Vec
# gaussianProcClas = GaussianProcessClassifier()
accuracyArrayY, meanAccuracyY = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.46153846153846156

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.3076923076923077
Iteration: 2  Accuracy Score:  0.3076923076923077
Iteration: 3  Accuracy Score:  0.3076923076923077
Iteration: 4  Accuracy Score:  0.16666666666666666
Iteration: 5  Accuracy Score:  0.4166666666666667
Iteration: 6  Accuracy Score:  0.5833333333333334
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.25
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.25
Mean Accuracy:  0.31730769230769235


In [60]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier Evaluation

decisionTreeClas = DecisionTreeClassifier()
parameters = {
                'criterion': ('gini', 'entropy'),
                'splitter': ('best', 'random'),
                'max_features':  ('auto', 'sqrt', 'log2', 'None')
             }
decisionTreeClassifier = GridSearchCV(decisionTreeClas, parameters, n_jobs = -1, cv=20, scoring='accuracy')
# decisionTreeClassifier.fit(XTrainVal, ytrainValLabels)
# # print(decisionTreeClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, decisionTreeClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# Decision Tree Classifier

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
decisionTreeClas = DecisionTreeClassifier()
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.5

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.23076923076923078
Iteration: 2  Accuracy Score:  0.46153846153846156
Iteration: 3  Accuracy Score:  0.07692307692307693
Iteration: 4  Accuracy Score:  0.5833333333333334
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.3333333333333333
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.4166666666666667
Iteration: 9  Accuracy Score:  0.4166666666666667
Iteration: 10  Accuracy Score:  0.3333333333333333
Mean Accuracy:  0.3435897435897436


In [99]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier Evaluation

decisionTreeClas = DecisionTreeClassifier()
parameters = {
                'criterion': ('gini', 'entropy'),
                'splitter': ('best', 'random'),
                'max_features':  ('auto', 'sqrt', 'log2', 'None')
             }
decisionTreeClassifier = GridSearchCV(decisionTreeClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# decisionTreeClassifier.fit(X_trainVal_vect_avg, ytrainValLabels)
# print(decisionTreeClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, decisionTreeClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# Decision Tree Classifier

print("\n---- Word2Vec Results----\n")
# Word2Vec
decisionTreeClas = DecisionTreeClassifier()
accuracyArrayY, meanAccuracyY = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.5833333333333334

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.5384615384615384
Iteration: 2  Accuracy Score:  0.23076923076923078
Iteration: 3  Accuracy Score:  0.46153846153846156
Iteration: 4  Accuracy Score:  0.3333333333333333
Iteration: 5  Accuracy Score:  0.6666666666666666
Iteration: 6  Accuracy Score:  0.4166666666666667
Iteration: 7  Accuracy Score:  0.25
Iteration: 8  Accuracy Score:  0.3333333333333333
Iteration: 9  Accuracy Score:  0.5833333333333334
Iteration: 10  Accuracy Score:  0.4166666666666667
Mean Accuracy:  0.4230769230769231


In [64]:
from sklearn.ensemble import AdaBoostClassifier

# Ada Boost Classifier Evaluation

adaBoostClas = AdaBoostClassifier()
parameters = {
                'n_estimators':  [10, 50, 100, 150, 200, 250, 300]
             }
adaBoostClassifier = GridSearchCV(adaBoostClas, parameters, n_jobs = -1, cv=2, scoring='accuracy')
# adaBoostClassifier.fit(XTrainVal, ytrainValLabels)
# print(adaBoostClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, adaBoostClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# Ada Boost Classifier

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
adaBoostClas = AdaBoostClassifier()
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.5384615384615384

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.5384615384615384
Iteration: 2  Accuracy Score:  0.38461538461538464
Iteration: 3  Accuracy Score:  0.15384615384615385
Iteration: 4  Accuracy Score:  0.3333333333333333
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.5833333333333334
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.6666666666666666
Iteration: 9  Accuracy Score:  0.4166666666666667
Iteration: 10  Accuracy Score:  0.25
Mean Accuracy:  0.391025641025641


In [100]:
from sklearn.ensemble import AdaBoostClassifier

# Ada Boost Classifier Evaluation

adaBoostClas = AdaBoostClassifier()
parameters = {
                'n_estimators':  [50, 100, 150, 200]
             }
adaBoostClassifier = GridSearchCV(adaBoostClas, parameters, n_jobs = -1, cv=5, scoring='accuracy')
# adaBoostClassifier.fit(X_trainVal_vect_avg, ytrainValLabels)
# print(adaBoostClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, adaBoostClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

# Ada Boost Classifier

print("\n---- Word2Vec Results----\n")
# Word2Vec
adaBoostClas = AdaBoostClassifier()
accuracyArrayY, meanAccuracyY = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.6923076923076923

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.5384615384615384
Iteration: 2  Accuracy Score:  0.23076923076923078
Iteration: 3  Accuracy Score:  0.46153846153846156
Iteration: 4  Accuracy Score:  0.5
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.25
Iteration: 7  Accuracy Score:  0.5
Iteration: 8  Accuracy Score:  0.4166666666666667
Iteration: 9  Accuracy Score:  0.6666666666666666
Iteration: 10  Accuracy Score:  0.6666666666666666
Mean Accuracy:  0.4480769230769231


In [73]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes Classifier
multiNBClas = MultinomialNB()
parameters = {
                'alpha': [1, 0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
             }
multiNBClassifier = GridSearchCV(multiNBClas, parameters, n_jobs = -1, cv=2, scoring='accuracy')
# multiNBClassifier.fit(XTrainVal, ytrainValLabels)
# print(multiNBClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, multiNBClassifier)
print("Best AccuracyVal: ",bestAccuracyVal)

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best AccuracyVal:  0.46153846153846156

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.46153846153846156
Iteration: 2  Accuracy Score:  0.46153846153846156
Iteration: 3  Accuracy Score:  0.23076923076923078
Iteration: 4  Accuracy Score:  0.6666666666666666
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.5
Iteration: 7  Accuracy Score:  0.25
Iteration: 8  Accuracy Score:  0.5833333333333334
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.3333333333333333
Mean Accuracy:  0.39871794871794874


In [77]:
from sklearn.linear_model import SGDClassifier

# SGD Classifier

sgdClas = SGDClassifier()
parameters = {
                'penalty': ('l2', 'l1', 'elasticnet'),
                'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
                'max_iter' : [500, 1000, 10000],
                'tol': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
                'n_iter_no_change': [5, 10, 15, 20],
                'early_stopping': (True, False)
             }
sgdClassifier = GridSearchCV(sgdClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# sgdClassifier.fit(XTrainVal, ytrainValLabels)
# print(adaBoostClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, sgdClassifier)
print("Best AccuracyVal: ",bestAccuracyVal)

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
# sgdClas = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=500, tol=None)
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best AccuracyVal:  0.6153846153846154

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.46153846153846156
Iteration: 2  Accuracy Score:  0.5384615384615384
Iteration: 3  Accuracy Score:  0.3076923076923077
Iteration: 4  Accuracy Score:  0.6666666666666666
Iteration: 5  Accuracy Score:  0.08333333333333333
Iteration: 6  Accuracy Score:  0.4166666666666667
Iteration: 7  Accuracy Score:  0.25
Iteration: 8  Accuracy Score:  0.5833333333333334
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.4166666666666667
Mean Accuracy:  0.39743589743589747


In [101]:
from sklearn.linear_model import SGDClassifier

# SGD Classifier

sgdClas = SGDClassifier()
parameters = {
                'penalty': ('l2', 'l1', 'elasticnet'),
                'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
                'max_iter' : [500, 1000, 10000],
                'tol': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
                'n_iter_no_change': [5, 10, 15, 20],
                'early_stopping': (True, False)
             }
sgdClassifier = GridSearchCV(sgdClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# sgdClassifier.fit(X_trainVal_vect_avg, ytrainValLabels)
# print(adaBoostClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, sgdClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

print("\n---- Word2Vec Results----\n")
# Word2Vec
# sgdClas = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=500, tol=None)
accuracyArrayY, meanAccuracyY = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.5

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.3076923076923077
Iteration: 2  Accuracy Score:  0.3076923076923077
Iteration: 3  Accuracy Score:  0.3076923076923077
Iteration: 4  Accuracy Score:  0.3333333333333333
Iteration: 5  Accuracy Score:  0.3333333333333333
Iteration: 6  Accuracy Score:  0.3333333333333333
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.3333333333333333
Iteration: 9  Accuracy Score:  0.3333333333333333
Iteration: 10  Accuracy Score:  0.3333333333333333
Mean Accuracy:  0.32564102564102565


In [90]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression Evaluation

logRegClas = LogisticRegression(multi_class='ovr')
parameters = {
                'penalty': ('l2', 'none'),
                'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
                'max_iter': [100, 200, 500, 1000]
             }
logRegClassifier = GridSearchCV(logRegClas, parameters, n_jobs = -1, cv=20, scoring='accuracy')
# logRegClassifier.fit(XTrainVal, ytrainValLabels)
# print(logRegClassifier.best_estimator_)


bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, logRegClassifier)
print("Best AccuracyVal: ",bestAccuracyVal)

# Logistic Regression

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
logRegClas = LogisticRegression()
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

# print("\n---- Word2Vec Results----\n")
# # Word2Vec
# logRegClas = LogisticRegression()
# accuracyArrayX, meanAccuracyX = TenFoldValidation_Word2Vec(summarisationData, logRegClassifier.best_estimator_)
# print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best AccuracyVal:  0.46153846153846156

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.46153846153846156
Iteration: 2  Accuracy Score:  0.5384615384615384
Iteration: 3  Accuracy Score:  0.23076923076923078
Iteration: 4  Accuracy Score:  0.3333333333333333
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.4166666666666667
Iteration: 7  Accuracy Score:  0.25
Iteration: 8  Accuracy Score:  0.5833333333333334
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.4166666666666667
Mean Accuracy:  0.37307692307692303


In [102]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression Evaluation

logRegClas = LogisticRegression(multi_class='ovr')
parameters = {
                'penalty': ('l2', 'none'),
                'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
                'max_iter': [100, 200, 500, 1000]
             }
logRegClassifier = GridSearchCV(logRegClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, logRegClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

print("\n---- Word2Vec Results----\n")
# Word2Vec
accuracyArrayX, meanAccuracyX = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.46153846153846156

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.3076923076923077
Iteration: 2  Accuracy Score:  0.3076923076923077
Iteration: 3  Accuracy Score:  0.3076923076923077
Iteration: 4  Accuracy Score:  0.3333333333333333
Iteration: 5  Accuracy Score:  0.4166666666666667
Iteration: 6  Accuracy Score:  0.4166666666666667
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.3333333333333333
Iteration: 9  Accuracy Score:  0.25
Iteration: 10  Accuracy Score:  0.4166666666666667
Mean Accuracy:  0.32564102564102565


In [80]:
from sklearn.linear_model import RidgeClassifier

# Ridge Classifier Evaluation

ridgeClas = RidgeClassifier()
parameters = {
                'alpha': [0.05, 0.1, 0.5, 1, 2, 5, 10, 20],
                'tol' : [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
                'solver': ('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs')
             }
ridgeClassifier = GridSearchCV(ridgeClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# ridgeClassifier.fit(XTrainVal, ytrainValLabels)
# print(ridgeClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, ridgeClassifier)
print("Best AccuracyVal: ",bestAccuracyVal)

# Logistic Regression

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

# print("\n---- Word2Vec Results----\n")
# Word2Vec
# accuracyArrayX, meanAccuracyX = TenFoldValidation_Word2Vec(summarisationData, ridgeClassifier.best_estimator_)
# print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best AccuracyVal:  0.38461538461538464

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.5384615384615384
Iteration: 2  Accuracy Score:  0.46153846153846156
Iteration: 3  Accuracy Score:  0.15384615384615385
Iteration: 4  Accuracy Score:  0.3333333333333333
Iteration: 5  Accuracy Score:  0.16666666666666666
Iteration: 6  Accuracy Score:  0.3333333333333333
Iteration: 7  Accuracy Score:  0.16666666666666666
Iteration: 8  Accuracy Score:  0.5
Iteration: 9  Accuracy Score:  0.3333333333333333
Iteration: 10  Accuracy Score:  0.3333333333333333
Mean Accuracy:  0.3320512820512821


In [103]:
from sklearn.linear_model import RidgeClassifier

# Ridge Classifier Evaluation

ridgeClas = RidgeClassifier()
parameters = {
                'alpha': [0.05, 0.1, 0.5, 1, 2, 5, 10, 20],
                'tol' : [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
                'solver': ('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs')
             }
ridgeClassifier = GridSearchCV(ridgeClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, ridgeClassifier)
print("Best Accuracy Test: ",bestAccuracyTest)
print("Best Accuracy Val: ",bestAccuracyVal)

print("\n---- Word2Vec Results----\n")
Word2Vec
accuracyArrayX, meanAccuracyX = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best Accuracy Test:  0.0
Best Accuracy Val:  0.38461538461538464

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.38461538461538464
Iteration: 2  Accuracy Score:  0.38461538461538464
Iteration: 3  Accuracy Score:  0.46153846153846156
Iteration: 4  Accuracy Score:  0.16666666666666666
Iteration: 5  Accuracy Score:  0.4166666666666667
Iteration: 6  Accuracy Score:  0.5833333333333334
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.25
Iteration: 9  Accuracy Score:  0.16666666666666666
Iteration: 10  Accuracy Score:  0.25
Mean Accuracy:  0.32564102564102565


In [83]:
from sklearn.linear_model import Perceptron

# Perceptro Evaluation

perceptronClas = Perceptron()
parameters = {
                'penalty': ('l2', 'l1', 'elasticnet'),
                'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
                'max_iter' : [500, 1000, 10000],
                'tol': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
                'n_iter_no_change': [5, 10, 15, 20]
             }
perceptronClassifier = GridSearchCV(perceptronClas, parameters, n_jobs = -1, cv=5, scoring='accuracy')
# perceptronClassifier.fit(XTrainVal, ytrainValLabels)
# print(perceptronClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, perceptronClassifier)
print("Best AccuracyVal: ",bestAccuracyVal)

# Logistic Regression

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
accuracyArrayX, meanAccuracyX = TenFoldValidation_BowTfIdf(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyX)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best AccuracyVal:  0.4166666666666667

---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.5384615384615384
Iteration: 2  Accuracy Score:  0.46153846153846156
Iteration: 3  Accuracy Score:  0.23076923076923078
Iteration: 4  Accuracy Score:  0.5833333333333334
Iteration: 5  Accuracy Score:  0.25
Iteration: 6  Accuracy Score:  0.3333333333333333
Iteration: 7  Accuracy Score:  0.16666666666666666
Iteration: 8  Accuracy Score:  0.5
Iteration: 9  Accuracy Score:  0.16666666666666666
Iteration: 10  Accuracy Score:  0.16666666666666666
Mean Accuracy:  0.3397435897435897


In [105]:
from sklearn.linear_model import Perceptron

# Perceptro Evaluation

perceptronClas = Perceptron()
parameters = {
                'penalty': ('l2', 'l1', 'elasticnet'),
                'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
                'max_iter' : [500, 1000, 10000],
                'tol': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
                'n_iter_no_change': [5, 10, 15, 20]
             }
perceptronClassifier = GridSearchCV(perceptronClas, parameters, n_jobs = -1, cv=10, scoring='accuracy')
# perceptronClassifier.fit(X_trainVal_vect_avg, ytrainValLabels)
# print(perceptronClassifier.best_estimator_)

bestModel, bestAccuracyTest, bestAccuracyVal = HyperParameterTuning(summarisationData, perceptronClassifier)
print("Best AccuracyVal: ",bestAccuracyVal)

# Logistic Regression

print("\n---- Word2Vec Results----\n")
# Word2Vec
accuracyArrayX, meanAccuracyX = TenFoldValidation_Word2Vec(summarisationData, bestModel)
print("Mean Accuracy: ", meanAccuracyY)

Fold  1  of tuning.
Fold  2  of tuning.
Fold  3  of tuning.
Fold  4  of tuning.
Fold  5  of tuning.
Fold  6  of tuning.
Fold  7  of tuning.
Fold  8  of tuning.
Fold  9  of tuning.
Fold  10  of tuning.
Best AccuracyVal:  0.5833333333333334

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.38461538461538464
Iteration: 2  Accuracy Score:  0.3076923076923077
Iteration: 3  Accuracy Score:  0.38461538461538464
Iteration: 4  Accuracy Score:  0.3333333333333333
Iteration: 5  Accuracy Score:  0.3333333333333333
Iteration: 6  Accuracy Score:  0.3333333333333333
Iteration: 7  Accuracy Score:  0.3333333333333333
Iteration: 8  Accuracy Score:  0.3333333333333333
Iteration: 9  Accuracy Score:  0.3333333333333333
Iteration: 10  Accuracy Score:  0.3333333333333333
Mean Accuracy:  0.32564102564102565
