In [12]:
import json
import pandas as pd
import numpy as np
from nltk.stem import SnowballStemmer

In [13]:
# Read Summarisation Model results from json file.

with open("../data/Model_Results/summarisation_results.jsonl", 'r') as json_file:
    json_list = list(json_file)

    
summarisationResults = {}
for json_str in json_list:
    
    result = json.loads(json_str)
    summaryList = sorted(result["sentence_scores"], key=lambda x:x[1], reverse=True)[:4]
    summary = ""
    for sentence in summaryList:
        summary += sentence[0] + ' '
    
    summarisationResults[result["id"]] = summary

In [14]:
summarisationData = pd.DataFrame(data=summarisationResults, index=[0])

In [15]:
summarisationDataDict = {}

for json_id in summarisationData.columns:
    
    features = {}
    
    features["summary"] = summarisationData[json_id].values[0]
    
    summarisationDataDict[json_id] = features

In [16]:
summarisationData = pd.DataFrame().from_dict(data=summarisationDataDict)

display(summarisationData)

Unnamed: 0,11972.json,11685.json,11096.json,5209.json,9524.json,5962.json,7070.json,1046.json,12849.json,13270.json,...,11576.json,3461.json,9464.json,10227.json,11707.json,3425.json,2977.json,294.json,3580.json,8384.json
summary,"""Building a wall"" on the border ""will take lit...","Those numbers show that as of October 2015, th...","If he did, he would have known that Senator Mc...","""…She supports taking $500 billion away from M...","Scott Walker helped run a ""criminal scheme"" to...",The campaign accurately quoted a figure whose ...,"So $30-31 million per year would, in fact, be ...",The Obama administration has emphasized many o...,"""It matters who’s leading the country, and it ...","Pence described the donors as major. ""The nati...",...,We’re not sure Sanders made that entirely clea...,"The $20 million designated for Cuba ""focuses o...",There also are no Asian or Pacific Islander Re...,"""The United States is in the longest stretch o...","""Secretary Clinton changes her position on thi...","Under the header ""New jobs created by the Stre...","They said low pay, increased work demands and ...","• Comstock, an adviser and frequent spokeswoma...","""House Republicans under Paul Ryan's leadershi...","""I will work in a bipartisan way to get it don..."


In [17]:
x_trainDf = pd.read_csv ('../data/Train_Eval_Test_Data/train.tsv', sep='\t')
x_trainDf.columns = ["json_id", "claim", "justification_label"]

x_testDf = pd.read_csv ('../data/Train_Eval_Test_Data/test.tsv', sep='\t')
x_testDf.columns = ["json_id", "claim", "justification_label"]

x_valDf = pd.read_csv ('../data/Train_Eval_Test_Data/val.tsv', sep='\t')
x_valDf.columns = ["json_id", "claim", "justification_label"]

FileNotFoundError: [Errno 2] No such file or directory: '../data/Train_Eval_Test_Data/train.tsv'

In [452]:
x_trainDf["summary"] = ""
x_testDf["summary"]  = ""
x_valDf["summary"]   = ""

In [18]:
def addSummary(data, summaries):
    
    for index, row in data.iterrows():
        if row["json_id"] in summaries.columns:
            row["summary"] = summaries[row["json_id"]].values[0]

In [454]:
addSummary(x_trainDf, summarisationData)
addSummary(x_testDf, summarisationData)
addSummary(x_valDf, summarisationData)

In [455]:
y_trainDf = pd.DataFrame(data=x_trainDf["justification_label"])
y_testDf  = pd.DataFrame(data=x_testDf["justification_label"])
y_valDf   = pd.DataFrame(data=x_valDf["justification_label"])

In [19]:
def cleanAndStemm(sentence):
    
    sb = SnowballStemmer("english")
    
    sentenceList = []
    for token in sentence.split(' '):
        
        token = token.lower()
        token = sb.stem(token)
        
        sentenceList.append(token)
        
    newSentence = ' '.join(sentenceList)
    return newSentence

In [457]:
# Create list containing claims + attention explanations

trainClaimSumm  = []
yTrainLabels   = y_trainDf['justification_label'].tolist()

testClaimSumm   = []
yTestLabels    = y_testDf['justification_label'].tolist()

evalClaimSumm   = []
yEvalLabels    = y_valDf['justification_label'].tolist()

for index, row in x_trainDf.iterrows():
    trainClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))

for index, row in x_testDf.iterrows():
    testClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))

for index, row in x_valDf.iterrows():
    evalClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))
    
totalClaimSumm = trainClaimSumm + testClaimSumm + evalClaimSumm
yTotalLabels = np.array(yTrainLabels + yTestLabels + yEvalLabels)

In [20]:
import nltk
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report, accuracy_score

In [459]:
# Convert words to number using Bag of Words (Bow)
vectorizer = CountVectorizer(max_features=30)
vectorizedTrain = vectorizer.fit_transform(trainClaimSumm).toarray()
vectorizedTest  = vectorizer.fit_transform(testClaimSumm).toarray()
vectorizedEval  = vectorizer.fit_transform(evalClaimSumm).toarray()
vectorizedTotal = vectorizer.fit_transform(totalClaimSumm).toarray()

# Convert Bow values according to TfIdf
tfidfconverter = TfidfTransformer()
XTrain = tfidfconverter.fit_transform(vectorizedTrain).toarray()
XTest  = tfidfconverter.fit_transform(vectorizedTest).toarray()
XEval  = tfidfconverter.fit_transform(vectorizedEval).toarray()
XTotal = tfidfconverter.fit_transform(vectorizedTotal).toarray()

In [460]:
# Transform the data into a list of lists for the Word2Vec model

w2vTrainClaimSum = []
w2vTestClaimSum  = []
w2vEvalClaimSum  = []

for index, row in x_trainDf.iterrows():
    w2vTrainClaimSum.append((row['claim'] + row['summary']).split(" "))

for index, row in x_testDf.iterrows():
    w2vTestClaimSum.append((row['claim'] + row['summary']).split(" "))

for index, row in x_valDf.iterrows():
    w2vEvalClaimSum.append((row['claim'] + row['summary']).split(" "))


In [461]:
# Vectorize using Word2Vec

# min_count = 3 & size = 40 & window=8 & sg=1 

model = Word2Vec(sentences=w2vTrainClaimSum, min_count=3,size=40,workers=4, window=8, sg=1)

words = set(model.wv.index2word)
X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                         for ls in w2vTrainClaimSum])
X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                         for ls in w2vTestClaimSum])

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

  X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words])


In [102]:
def TenFoldValidation_BowTfIdf(summarisationData, classifier):
    
    accuracyResults = np.zeros([10])
    
    for counter in range(10):
        x_trainDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/train' + str(counter + 1) + '.tsv', sep='\t')
        x_trainDf.columns = ["json_id", "claim", "justification_label"]

        x_testDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/test' + str(counter + 1) + '.tsv', sep='\t')
        x_testDf.columns = ["json_id", "claim", "justification_label"]
        
        x_trainDf["summary"] = ""
        x_testDf["summary"]  = ""
        
        addSummary(x_trainDf, summarisationData)
        addSummary(x_testDf, summarisationData)
        
        y_trainDf = pd.DataFrame(data=x_trainDf["justification_label"])
        y_testDf  = pd.DataFrame(data=x_testDf["justification_label"])
        
        trainClaimSumm  = []
        yTrainLabels   = y_trainDf['justification_label'].tolist()

        testClaimSumm   = []
        yTestLabels    = y_testDf['justification_label'].tolist()

        for index, row in x_trainDf.iterrows():
            trainClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))

        for index, row in x_testDf.iterrows():
            testClaimSumm.append(cleanAndStemm(row['claim']) + cleanAndStemm(row['summary']))

            
        # Convert words to number using Bag of Words (Bow)
        vectorizer = CountVectorizer(max_features=30)
        vectorizedTrain = vectorizer.fit_transform(trainClaimSumm).toarray()
        vectorizedTest  = vectorizer.fit_transform(testClaimSumm).toarray()

        # Convert Bow values according to TfIdf
        tfidfconverter = TfidfTransformer()
        XTrain = tfidfconverter.fit_transform(vectorizedTrain).toarray()
        XTest  = tfidfconverter.fit_transform(vectorizedTest).toarray()

        classifier.fit(XTrain,yTrainLabels)

        yPredLabels = classifier.predict(XTest)
        
        accuracyScore = accuracy_score(yTestLabels,yPredLabels)
        accuracyResults[counter] = accuracyScore

#         ConfusionMatrixDisplay.from_predictions(yTestLabels, yPredLabels, cmap='Greens')
#         print(classification_report(yTestLabels,yPredLabels))
        print("Iteration:",counter+1," Accuracy Score: ",accuracyScore)
    
    return accuracyResults

In [92]:
def TenFoldValidation_Word2Vec(summarisationData, classifier):
    
    accuracyResults = np.zeros([10])

    for counter in range(10):
        x_trainDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/train' + str(counter + 1) + '.tsv', sep='\t')
        x_trainDf.columns = ["json_id", "claim", "justification_label"]

        x_testDf = pd.read_csv ('../data/Train_Eval_Test_Data/Iteration' + str(counter + 1) + '/test' + str(counter + 1) + '.tsv', sep='\t')
        x_testDf.columns = ["json_id", "claim", "justification_label"]
        
        x_trainDf["summary"] = ""
        x_testDf["summary"]  = ""
        
        addSummary(x_trainDf, summarisationData)
        addSummary(x_testDf, summarisationData)
        
        y_trainDf = pd.DataFrame(data=x_trainDf["justification_label"])
        y_testDf  = pd.DataFrame(data=x_testDf["justification_label"])
        
        yTrainLabels   = y_trainDf['justification_label'].tolist()
        yTestLabels    = y_testDf['justification_label'].tolist()
        
        # Transform the data into a list of lists for the Word2Vec model
        w2vTrainClaimSum = []
        w2vTestClaimSum  = []

        for index, row in x_trainDf.iterrows():
            w2vTrainClaimSum.append((row['claim'] + row['summary']).split(" "))

        for index, row in x_testDf.iterrows():
            w2vTestClaimSum.append((row['claim'] + row['summary']).split(" "))

            
        # Vectorize using Word2Vec

        # min_count = 3 & size = 40 & window=8 & sg=1 

        model = Word2Vec(sentences=w2vTrainClaimSum, min_count=3,size=40,workers=4, window=8, sg=1)

        words = set(model.wv.index2word)
        X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                                 for ls in w2vTrainClaimSum])
        X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words])
                                 for ls in w2vTestClaimSum])

        X_train_vect_avg = []
        for v in X_train_vect:
            if v.size:
                X_train_vect_avg.append(v.mean(axis=0))
            else:
                X_train_vect_avg.append(np.zeros(100, dtype=float))

        X_test_vect_avg = []
        for v in X_test_vect:
            if v.size:
                X_test_vect_avg.append(v.mean(axis=0))
            else:
                X_test_vect_avg.append(np.zeros(100, dtype=float))
        
        classifier.fit(X_train_vect_avg,yTrainLabels)

        yPredLabels = classifier.predict(X_test_vect_avg)

#         ConfusionMatrixDisplay.from_predictions(yTestLabels, yPredLabels, cmap='Greens')
#         print(classification_report(yTestLabels,yPredLabels))

        accuracyScore = accuracy_score(yTestLabels,yPredLabels)
        accuracyResults[counter] = accuracyScore
        
        print("Iteration:",counter+1," Accuracy Score: ",accuracyScore)
        
    return accuracyResults

In [58]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
from sklearn.dummy import DummyClassifier

# Dummy Classifier With Method Stratified
dummy_clf = DummyClassifier(strategy="stratified")
TenFoldValidation_BowTfIdf(summarisationData, dummy_clf)

Iteration: 1  Accuracy Score:  0.4
Iteration: 2  Accuracy Score:  0.4
Iteration: 3  Accuracy Score:  0.4
Iteration: 4  Accuracy Score:  0.5
Iteration: 5  Accuracy Score:  0.4
Iteration: 6  Accuracy Score:  0.3
Iteration: 7  Accuracy Score:  0.4
Iteration: 8  Accuracy Score:  0.4
Iteration: 9  Accuracy Score:  0.2
Iteration: 10  Accuracy Score:  0.4


In [76]:
# Word 2 Vec
from sklearn.dummy import DummyClassifier

# Dummy Classifier With Method Stratified
dummy_clf = DummyClassifier(strategy="stratified")
TenFoldValidation_Word2Vec(summarisationData, dummy_clf)

Iteration: 1  Accuracy Score:  0.3
Iteration: 2  Accuracy Score:  0.4
Iteration: 3  Accuracy Score:  0.5
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.2
Iteration: 6  Accuracy Score:  0.4
Iteration: 7  Accuracy Score:  0.2
Iteration: 8  Accuracy Score:  0.2
Iteration: 9  Accuracy Score:  0.3
Iteration: 10  Accuracy Score:  0.3


In [103]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classification Method

# Bow - Tf-Idf
print("\n---- Bow - Tf-Idf Results----\n")
randForestClas = RandomForestClassifier(n_estimators=1000, random_state=0)
accuracyArrayX = TenFoldValidation_BowTfIdf(summarisationData, randForestClas)
print(accuracyArrayX)
# Word2Vec
print("\n---- Word2Vec Results----\n")
randForestClas = RandomForestClassifier(n_estimators=1000, random_state=0)
accuracyArrayY = TenFoldValidation_Word2Vec(summarisationData, randForestClas)

# from scipy.stats import wilcoxon
# stat, p = wilcoxon(accuracyArrayX, accuracyArrayY)
# print('Statistics=%.3f, p=%.3f' % (stat, p))


---- Bow - Tf-Idf Results----

Iteration: 1  Accuracy Score:  0.5
Iteration: 2  Accuracy Score:  0.4
Iteration: 3  Accuracy Score:  0.1
Iteration: 4  Accuracy Score:  0.4
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.3
Iteration: 7  Accuracy Score:  0.2
Iteration: 8  Accuracy Score:  0.4
Iteration: 9  Accuracy Score:  0.1
Iteration: 10  Accuracy Score:  0.2
[0.5 0.4 0.1 0.4 0.3 0.3 0.2 0.4 0.1 0.2]

---- Word2Vec Results----

Iteration: 1  Accuracy Score:  0.3
Iteration: 2  Accuracy Score:  0.3
Iteration: 3  Accuracy Score:  0.4
Iteration: 4  Accuracy Score:  0.1
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.3
Iteration: 7  Accuracy Score:  0.4
Iteration: 8  Accuracy Score:  0.5
Iteration: 9  Accuracy Score:  0.4
Iteration: 10  Accuracy Score:  0.2
Statistics=11.500, p=0.670


In [42]:
from sklearn.neighbors import KNeighborsClassifier

# K-Neighbors Classifier
kNeighborsClas = KNeighborsClassifier(n_neighbors=10)
TenFoldValidation_BowTfIdf(summarisationData, kNeighborsClas)

Iteration: 1  Accuracy Score:  0.3
Iteration: 2  Accuracy Score:  0.6
Iteration: 3  Accuracy Score:  0.4
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.5
Iteration: 6  Accuracy Score:  0.4
Iteration: 7  Accuracy Score:  0.3
Iteration: 8  Accuracy Score:  0.5
Iteration: 9  Accuracy Score:  0.2
Iteration: 10  Accuracy Score:  0.3


In [63]:
# Word2Vec

from sklearn.neighbors import KNeighborsClassifier

# K-Neighbors Classifier
kNeighborsClas = KNeighborsClassifier(n_neighbors=2)
TenFoldValidation_Word2Vec(summarisationData, kNeighborsClas)

Iteration: 1  Accuracy Score:  0.2
Iteration: 2  Accuracy Score:  0.3
Iteration: 3  Accuracy Score:  0.5
Iteration: 4  Accuracy Score:  0.5
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.3
Iteration: 7  Accuracy Score:  0.5
Iteration: 8  Accuracy Score:  0.4
Iteration: 9  Accuracy Score:  0.5
Iteration: 10  Accuracy Score:  0.2


In [43]:
from sklearn.svm import SVC

# SVM Classifier
svmClas = SVC()
TenFoldValidation_BowTfIdf(summarisationData, svmClas)

Iteration: 1  Accuracy Score:  0.5
Iteration: 2  Accuracy Score:  0.7
Iteration: 3  Accuracy Score:  0.1
Iteration: 4  Accuracy Score:  0.4
Iteration: 5  Accuracy Score:  0.2
Iteration: 6  Accuracy Score:  0.2
Iteration: 7  Accuracy Score:  0.2
Iteration: 8  Accuracy Score:  0.3
Iteration: 9  Accuracy Score:  0.2
Iteration: 10  Accuracy Score:  0.3


In [64]:
# Word2Vec

from sklearn.svm import SVC

# SVM Classifier
svmClas = SVC()
TenFoldValidation_Word2Vec(summarisationData, svmClas)

Iteration: 1  Accuracy Score:  0.2
Iteration: 2  Accuracy Score:  0.3
Iteration: 3  Accuracy Score:  0.3
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.1
Iteration: 7  Accuracy Score:  0.3
Iteration: 8  Accuracy Score:  0.3
Iteration: 9  Accuracy Score:  0.1
Iteration: 10  Accuracy Score:  0.3


In [44]:
from sklearn.gaussian_process import GaussianProcessClassifier

# Gaussian Process Classifier
gaussianProcClas = GaussianProcessClassifier()
TenFoldValidation_BowTfIdf(summarisationData, gaussianProcClas)

Iteration: 1  Accuracy Score:  0.6
Iteration: 2  Accuracy Score:  0.7
Iteration: 3  Accuracy Score:  0.2
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.2
Iteration: 7  Accuracy Score:  0.1
Iteration: 8  Accuracy Score:  0.3
Iteration: 9  Accuracy Score:  0.2
Iteration: 10  Accuracy Score:  0.1


In [65]:
# Word2Vec

from sklearn.gaussian_process import GaussianProcessClassifier

# Gaussian Process Classifier
gaussianProcClas = GaussianProcessClassifier()
TenFoldValidation_Word2Vec(summarisationData, gaussianProcClas)

Iteration: 1  Accuracy Score:  0.2
Iteration: 2  Accuracy Score:  0.3
Iteration: 3  Accuracy Score:  0.3
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.1
Iteration: 7  Accuracy Score:  0.3
Iteration: 8  Accuracy Score:  0.3
Iteration: 9  Accuracy Score:  0.1
Iteration: 10  Accuracy Score:  0.3


In [45]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier
decisionTreeClas = DecisionTreeClassifier()
TenFoldValidation_BowTfIdf(summarisationData, decisionTreeClas)

Iteration: 1  Accuracy Score:  0.5
Iteration: 2  Accuracy Score:  0.5
Iteration: 3  Accuracy Score:  0.1
Iteration: 4  Accuracy Score:  0.4
Iteration: 5  Accuracy Score:  0.4
Iteration: 6  Accuracy Score:  0.3
Iteration: 7  Accuracy Score:  0.1
Iteration: 8  Accuracy Score:  0.2
Iteration: 9  Accuracy Score:  0.5
Iteration: 10  Accuracy Score:  0.2


In [66]:
# Word2Vec

from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier
decisionTreeClas = DecisionTreeClassifier()
TenFoldValidation_Word2Vec(summarisationData, decisionTreeClas)

Iteration: 1  Accuracy Score:  0.6
Iteration: 2  Accuracy Score:  0.0
Iteration: 3  Accuracy Score:  0.5
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.5
Iteration: 6  Accuracy Score:  0.4
Iteration: 7  Accuracy Score:  0.4
Iteration: 8  Accuracy Score:  0.3
Iteration: 9  Accuracy Score:  0.4
Iteration: 10  Accuracy Score:  0.4


In [46]:
from sklearn.ensemble import AdaBoostClassifier

# Ada Boost Classifier
adaBoostClas = AdaBoostClassifier()
TenFoldValidation_BowTfIdf(summarisationData, adaBoostClas)

Iteration: 1  Accuracy Score:  0.0
Iteration: 2  Accuracy Score:  0.5
Iteration: 3  Accuracy Score:  0.2
Iteration: 4  Accuracy Score:  0.3
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.1
Iteration: 7  Accuracy Score:  0.1
Iteration: 8  Accuracy Score:  0.4
Iteration: 9  Accuracy Score:  0.3
Iteration: 10  Accuracy Score:  0.4


In [67]:
# Word2Vec
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier
decisionTreeClas = DecisionTreeClassifier()
TenFoldValidation_Word2Vec(summarisationData, decisionTreeClas)

Iteration: 1  Accuracy Score:  0.5
Iteration: 2  Accuracy Score:  0.1
Iteration: 3  Accuracy Score:  0.4
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.5
Iteration: 6  Accuracy Score:  0.4
Iteration: 7  Accuracy Score:  0.2
Iteration: 8  Accuracy Score:  0.3
Iteration: 9  Accuracy Score:  0.3
Iteration: 10  Accuracy Score:  0.3


In [47]:
from sklearn.naive_bayes import GaussianNB

# Naive Bayes Classifier
gaussianNBClas = GaussianNB()
TenFoldValidation_BowTfIdf(summarisationData, gaussianNBClas)

Iteration: 1  Accuracy Score:  0.3
Iteration: 2  Accuracy Score:  0.5
Iteration: 3  Accuracy Score:  0.2
Iteration: 4  Accuracy Score:  0.3
Iteration: 5  Accuracy Score:  0.2
Iteration: 6  Accuracy Score:  0.4
Iteration: 7  Accuracy Score:  0.3
Iteration: 8  Accuracy Score:  0.4
Iteration: 9  Accuracy Score:  0.2
Iteration: 10  Accuracy Score:  0.2


In [48]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes Classifier
multiNBClas = MultinomialNB()
TenFoldValidation_BowTfIdf(summarisationData, multiNBClas)

Iteration: 1  Accuracy Score:  0.3
Iteration: 2  Accuracy Score:  0.4
Iteration: 3  Accuracy Score:  0.2
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.2
Iteration: 6  Accuracy Score:  0.4
Iteration: 7  Accuracy Score:  0.1
Iteration: 8  Accuracy Score:  0.4
Iteration: 9  Accuracy Score:  0.2
Iteration: 10  Accuracy Score:  0.1


In [49]:
from sklearn.linear_model import SGDClassifier
# SGD Classifier
sgdClas = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)
TenFoldValidation_BowTfIdf(summarisationData, sgdClas)

Iteration: 1  Accuracy Score:  0.3
Iteration: 2  Accuracy Score:  0.5
Iteration: 3  Accuracy Score:  0.2
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.3
Iteration: 7  Accuracy Score:  0.2
Iteration: 8  Accuracy Score:  0.5
Iteration: 9  Accuracy Score:  0.5
Iteration: 10  Accuracy Score:  0.3


In [68]:
# Word2Vec
from sklearn.linear_model import SGDClassifier
# SGD Classifier
sgdClas = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)
TenFoldValidation_Word2Vec(summarisationData, sgdClas)

Iteration: 1  Accuracy Score:  0.4
Iteration: 2  Accuracy Score:  0.3
Iteration: 3  Accuracy Score:  0.3
Iteration: 4  Accuracy Score:  0.4
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.1
Iteration: 7  Accuracy Score:  0.3
Iteration: 8  Accuracy Score:  0.3
Iteration: 9  Accuracy Score:  0.1
Iteration: 10  Accuracy Score:  0.3


In [50]:
from sklearn.linear_model import LogisticRegression
# Logistic Regression
logRegClas = LogisticRegression()
TenFoldValidation_BowTfIdf(summarisationData, logRegClas)

Iteration: 1  Accuracy Score:  0.3
Iteration: 2  Accuracy Score:  0.6
Iteration: 3  Accuracy Score:  0.2
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.4
Iteration: 6  Accuracy Score:  0.3
Iteration: 7  Accuracy Score:  0.1
Iteration: 8  Accuracy Score:  0.4
Iteration: 9  Accuracy Score:  0.2
Iteration: 10  Accuracy Score:  0.1


In [69]:
# Word2Vec
from sklearn.linear_model import LogisticRegression
# Logistic Regression
logRegClas = LogisticRegression()
TenFoldValidation_Word2Vec(summarisationData, logRegClas)

Iteration: 1  Accuracy Score:  0.2
Iteration: 2  Accuracy Score:  0.3
Iteration: 3  Accuracy Score:  0.3
Iteration: 4  Accuracy Score:  0.2
Iteration: 5  Accuracy Score:  0.3
Iteration: 6  Accuracy Score:  0.1
Iteration: 7  Accuracy Score:  0.3
Iteration: 8  Accuracy Score:  0.3
Iteration: 9  Accuracy Score:  0.1
Iteration: 10  Accuracy Score:  0.3
