In [141]:
import os
import json
import time
import random

import multiprocessing as mp


import io
import os.path
import re
import tarfile
import sys
import gensim
import pandas as pd
import numpy as np

from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords


from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from nltk import pos_tag


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm, naive_bayes, neural_network
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

from nltk.tokenize import word_tokenize


In [2]:
# Directory two levels above the current working directory; the sibling repository
# checkouts are located relative to it.
_two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
repos_path = os.path.abspath(_two_levels_up)

# Checkout that holds the dataset CSVs and the `misc_utils` helpers used below.
gutenberg_corpus_analysis_repo = os.path.join(repos_path, 'gutenberg_corpus_analysis')

In [3]:
# Locations of the two sibling Gutenberg checkouts, resolved relative to `repos_path`.
gutenberg_repo_path = os.path.join(repos_path, 'gutenberg')
gutenberg_analysis_repo = os.path.join(repos_path, 'gutenberg-analysis')

# `data_io` is imported from gutenberg-analysis/src, so that directory has to be on
# sys.path before the import statement runs.
src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(src_dir)
from data_io import get_book


# Same pattern for gutenberg/src, which provides `metaquery`.
gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

from metaquery import meta_query

# The analysis repo itself goes on sys.path so its `misc_utils` package can be imported.
sys.path.append(gutenberg_corpus_analysis_repo)
import misc_utils.dataset_filtering as dataset_filtering

In [5]:
# Name of the dataset sub-directory inside the analysis repo that holds the split CSVs.
dataset = 'nikita_dataset'

In [6]:
# Train / test / validation split files all live in the same dataset directory.
_dataset_dir = os.path.join(gutenberg_corpus_analysis_repo, dataset)
train_csv = os.path.join(_dataset_dir, 'final_train.csv')
test_csv = os.path.join(_dataset_dir, 'final_test.csv')
val_csv = os.path.join(_dataset_dir, 'final_val.csv')

# Catalogue file inside the gutenberg checkout's metadata directory.
pg_catalog_filepath = os.path.join(gutenberg_repo_path, 'metadata', 'pg_catalog.csv')

In [7]:
# Seed every random number generator the notebook draws from so results are reproducible.
# NumPy's global generator is seeded first; make_random_chunks uses the standard-library
# `random` module, which keeps its own state and therefore needs its own seed.
np.random.seed(500)
random.seed(500)

In [8]:
# Load the three splits; each CSV stores its original row index in the 'Unnamed: 0' column.
train_df, test_df, val_df = (
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in (train_csv, test_csv, val_csv)
)

train_df.head()

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects
2439,PG12810,"Uncle Sam's Boys with Pershing's Troops: Or, D...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],78,"{'World War, 1914-1918 -- Juvenile fiction', '..."
2446,PG12819,"Dick Prescott's Second Year at West Point: Or,...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],94,{'United States Military Academy -- Juvenile f...
25920,PG40605,"The Motor Boat Club at Nantucket; or, The Myst...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],189,"{'Motorboats -- Juvenile fiction', 'Nantucket ..."
55435,PG8153,"The Young Engineers in Arizona; or, Laying Tra...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],190,"{'Civil engineers -- Fiction', 'Arizona -- Fic..."
32899,PG48863,"The Motor Boat Club off Long Island; or, A Dar...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],85,"{'Motorboats -- Juvenile fiction', 'Long Islan..."


In [9]:
# Number of distinct author labels in the training split (a missing value counts as one).
train_df['author'].nunique(dropna=False)

80

In [10]:
# Convert each stringified subject set into one whitespace-separated "document".
# Rows whose subjects are the literal 'set()' become NaN and are passed through unchanged.
subj = train_df['subjects'].replace('set()', np.nan)


def _subjects_to_doc(raw):
    """Flatten one stringified subject set into a space-separated string.

    Braces and the outermost quote characters are removed, ' -- ' becomes '-',
    the "', '" separators mark the boundaries between subjects, spaces inside a
    subject are dropped and commas turn into spaces. Values without ``strip``
    (e.g. NaN) are returned unchanged.
    """
    try:
        inner = raw.strip("{}")[1:-1]
    except AttributeError:
        return raw
    pieces = inner.replace(' -- ', '-').replace("', '", "_").split('_')
    return ' '.join(piece.replace(' ', '').replace(',', ' ') for piece in pieces)


subj_docs = [_subjects_to_doc(h) for h in subj]

In [11]:
# Attach the flattened subject strings to the training frame (same row order as `subj`).
train_df['subj_str'] = subj_docs

In [12]:
#train_df['subject_str'] = train_df['subjects'].apply(lambda x: split_subjects(x))

In [13]:
#train_df = train_df.sample(500)

In [14]:
def _load_book_text(book_id):
    """Fetch one book through `get_book` with level='text' from the gutenberg checkout."""
    return get_book(book_id, path_gutenberg=gutenberg_repo_path, level='text')


# Load the text of every book in each split and time the whole step.
start = time.time()

for _frame in (train_df, test_df, val_df):
    _frame['text'] = _frame['id'].apply(_load_book_text)

end = time.time()
print(f'Time elapsed: {end-start} seconds')

Time elapsed: 5.267264366149902 seconds


In [15]:
def skip_start_and_end(text, num_words=50):
    """Drop the first and last ``num_words`` space-separated words of ``text``.

    The text is split on single spaces only, so newlines and tabs stay attached to
    the neighbouring "word" and consecutive spaces produce empty words.

    Args:
        text: String to trim.
        num_words: How many words to remove from each end. Defaults to 50, the
            value that was previously hard-coded.

    Returns:
        The remaining words joined by single spaces; an empty string when the
        text holds no more than ``2 * num_words`` words.
    """
    words = text.split(' ')
    # Use an explicit end index: a ``-num_words`` slice bound would wrongly empty
    # the result when num_words is 0.
    stop = max(len(words) - num_words, 0)
    return ' '.join(words[num_words:stop])

# Trim both ends of every book in all three splits.
# This overwrites 'text' in place, so re-running the cell trims the texts again.
for _frame in (train_df, test_df, val_df):
    _frame['text'] = _frame['text'].apply(skip_start_and_end)


In [16]:
def make_random_chunks(text, num_chunks=10, chunk_size=1000, overlap=False):
    """Sample ``num_chunks`` random, non-overlapping chunks of ``chunk_size`` words.

    The text is split on single spaces. For each chunk a random start position is
    drawn with the standard-library ``random`` module, ``chunk_size`` consecutive
    words are copied out, and those words are removed from the pool so that later
    chunks cannot reuse them. All chunks are concatenated in the order drawn.

    Args:
        text: Source text.
        num_chunks: Number of chunks to draw.
        chunk_size: Number of words per chunk.
        overlap: Accepted for interface compatibility; currently ignored, chunks
            never overlap.

    Returns:
        The sampled words joined by single spaces. If the text has fewer than
        ``num_chunks * chunk_size`` words it is returned unchanged.
    """
    words = text.split(' ')

    # Not enough material for the requested sample: hand the text back untouched.
    if num_chunks * chunk_size > len(words):
        return text

    chunk = []
    for _ in range(num_chunks):
        num_words = len(words)
        if chunk_size > num_words:
            # Defensive fallback: take whatever is left and stop.
            chunk.extend(words)
            break
        # randint is inclusive on both ends, so the last start that still leaves a
        # full chunk is num_words - chunk_size. (The previous upper bound of
        # num_words allowed truncated or even empty chunks.)
        start = random.randint(0, num_words - chunk_size)
        chunk.extend(words[start:start + chunk_size])
        # Remove the sampled span so the next draw cannot overlap it.
        words = words[:start] + words[start + chunk_size:]
    return ' '.join(chunk)

In [17]:
# Draw 10 random 1000-word chunks per book for each split (train, then test, then val,
# so the sequence of random draws is the same as before).
for _frame in (train_df, test_df, val_df):
    _frame['chunks'] = _frame['text'].apply(
        lambda book_text: make_random_chunks(book_text, num_chunks=10, chunk_size=1000, overlap=False)
    )

In [23]:
# Tokenize the training chunks in parallel on 11 worker processes and time the step.
start = time.time()
with mp.Pool(11) as pool:
    _train_tokens = pool.map(word_tokenize, train_df['chunks'])
    train_df['tokenized'] = _train_tokens
end = time.time()
print(f'Took {end-start} seconds')


Took 12.728698253631592 seconds


In [24]:
# Tokenize the validation chunks in parallel on 11 worker processes and time the step.
# (`mp` comes from the import cell at the top of the notebook; the repeated import that
# used to sit here was redundant.)
start = time.time()
with mp.Pool(11) as pool:
    val_df['tokenized'] = pool.map(word_tokenize, val_df['chunks'])
end = time.time()
print(f'Took {end-start} seconds')



Took 2.776623249053955 seconds


In [25]:
# Tokenize the test chunks in parallel on 11 worker processes and time the step.
# (`mp` comes from the import cell at the top of the notebook; the repeated import that
# used to sit here was redundant.)
start = time.time()
with mp.Pool(11) as pool:
    test_df['tokenized'] = pool.map(word_tokenize, test_df['chunks'])
end = time.time()
print(f'Took {end-start} seconds')



Took 3.5133683681488037 seconds


In [26]:
# Persist the tokenized frames so the tokenization step does not have to be repeated.
tokenized_dir = os.path.join(gutenberg_corpus_analysis_repo, 'tokenized')
# Make sure the target directory exists; writing a pickle into a missing directory fails.
os.makedirs(tokenized_dir, exist_ok=True)

# Saved in train, val, test order, so `outfile` ends up holding the test path as before.
for outfile, _frame in (
    (os.path.join(tokenized_dir, 'train_df_chunks_tokenized.pkl'), train_df),
    (os.path.join(tokenized_dir, 'val_df_chunks_tokenized.pkl'), val_df),
    (os.path.join(tokenized_dir, 'test_df_chunks_tokenized.pkl'), test_df),
):
    _frame.to_pickle(outfile)



In [None]:
# Reload the tokenized *training* frame. After the save cell above, `outfile` points at
# the test pickle, so reusing it here would replace train_df with test data; the training
# path is therefore built explicitly.
train_df = pd.read_pickle(
    os.path.join(gutenberg_corpus_analysis_repo, 'tokenized', 'train_df_chunks_tokenized.pkl')
)

In [31]:
# Sanity check: show training rows whose 'tokenized' value is missing (expected: none).
_train_missing_tokens = train_df['tokenized'].isnull()
train_df[_train_missing_tokens]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,subj_str,text,chunks,tokenized


In [32]:
# Sanity check: show test rows whose 'tokenized' value is missing (expected: none).
_test_missing_tokens = test_df['tokenized'].isnull()
test_df[_test_missing_tokens]

Unnamed: 0_level_0,Unnamed: 0.1,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,text,chunks,tokenized
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [33]:
# Sanity check: show validation rows whose 'tokenized' value is missing (expected: none).
_val_missing_tokens = val_df['tokenized'].isnull()
val_df[_val_missing_tokens]

Unnamed: 0_level_0,Unnamed: 0.1,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,text,chunks,tokenized
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [None]:
# Distinct author labels present in the training split.
train_df['author'].unique()

In [30]:
# Maps the first letter of a POS tag to a WordNet part of speech:
# J -> adjective, V -> verb, R -> adverb; every other letter falls back to noun.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

In [39]:
def lemmatize_text(tokenized_text):
    """POS-tag, filter and lemmatize one tokenized document.

    Tokens are tagged with ``pos_tag``; a token is kept only if it is not in the
    English stop-word list and consists solely of alphabetic characters. Kept
    tokens are lemmatized with the WordNet part of speech that ``tag_map`` assigns
    to the first letter of their tag.

    Args:
        tokenized_text: Sequence of token strings.

    Returns:
        ``str()`` of the list of lemmas, i.e. the textual representation of a
        Python list rather than the list itself.
    """
    # Build the stop-word lookup once per document. The previous version re-read the
    # stop-word list and scanned it linearly for every single token.
    # NOTE(review): the membership test is case-sensitive, so capitalised stop words
    # presumably pass the filter - confirm whether lower-casing is intended.
    stop_words = set(stopwords.words('english'))
    word_lemmatized = WordNetLemmatizer()
    final_words = [
        word_lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokenized_text)
        if word not in stop_words and word.isalpha()
    ]
    return str(final_words)

In [41]:
# Lemmatize the training documents in parallel on 11 worker processes and time the step.
start = time.time()
with mp.Pool(11) as pool:
    lemmatized = pool.map(lemmatize_text, train_df['tokenized'])
end = time.time()
print(f'Took {end-start} seconds')

# Store the results only after the timing has been reported.
train_df['lemmatized'] = lemmatized

Took 265.2570126056671 seconds


In [42]:
# Lemmatize the validation documents in parallel on 11 worker processes and time the step.
# (`mp` comes from the import cell at the top of the notebook; the repeated import that
# used to sit here was redundant.)
start = time.time()
with mp.Pool(11) as pool:
    lemmatized = pool.map(lemmatize_text, val_df['tokenized'])
end = time.time()
print(f'Took {end-start} seconds')
val_df['lemmatized'] = lemmatized

Took 34.088324546813965 seconds


In [43]:
# Lemmatize the test documents in parallel on 11 worker processes and time the step.
# (`mp` comes from the import cell at the top of the notebook; the repeated import that
# used to sit here was redundant.)
start = time.time()
with mp.Pool(11) as pool:
    lemmatized = pool.map(lemmatize_text, test_df['tokenized'])
end = time.time()
print(f'Took {end-start} seconds')
test_df['lemmatized'] = lemmatized

Took 34.880149364471436 seconds


In [None]:
# (rows, columns) of the training frame at this point in the notebook.
train_df.shape

In [47]:
# Features are the lemmatized documents; the label is the author.
Train_X = train_df['lemmatized']
Train_Y = train_df['author']

Test_X = test_df['lemmatized']
Test_Y = test_df['author']

Val_X = val_df['lemmatized']
Val_Y = val_df['author']

In [None]:
# Train_X, _, Train_Y, _ = model_selection.train_test_split(train_df['lemmatized'], train_df['author'],test_size=0.3)
# Val_X, _, Val_Y, _ = model_selection.train_test_split(val_df['lemmatized'], val_df['author'],test_size=0.3)
# Test_X, _, Test_Y, _ = model_selection.train_test_split(test_df['lemmatized'], test_df['author'],test_size=0.3)


In [49]:
# Learn the author -> integer mapping from the training labels only and reuse that single
# mapping for every split. Re-fitting the encoder on each split (as was done before)
# derives the codes from that split's own label set, so the same integer could stand for
# different authors in different splits.
Encoder = LabelEncoder()
Train_Y_e = Encoder.fit_transform(Train_Y)
# transform() raises ValueError for an author that never occurs in the training labels.
Test_Y_e = Encoder.transform(Test_Y)
Val_Y_e = Encoder.transform(Val_Y)


In [52]:
# Fit the TF-IDF vocabulary (capped at 5000 features) on the training documents only,
# then project all three splits into that same feature space.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(train_df['lemmatized'])

Train_X_Tfidf, Test_X_Tfidf, Val_X_Tfidf = (
    Tfidf_vect.transform(documents) for documents in (Train_X, Test_X, Val_X)
)


# Support Vector Machines

In [98]:
# SVC with its default hyper-parameters, trained on the TF-IDF training matrix.
SVM = svm.SVC()
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric in its
# arguments, so the numbers match the earlier (y_pred, y_true) calls.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))



#train_class_rpt = classification_report(Train_Y, predictions_SVM_train)
#test_class_rpt = classification_report(Val_Y, predictions_SVM_val)


SVM Accuracy Score: Training Data ->  99.53125
SVM Accuracy Score: Validation Data ->  81.66666666666667
SVM Accuracy Score: Test Data ->  82.08333333333333


In [99]:
# SVC with a linear kernel (C=1.0, gamma='auto').
# NOTE(review): `degree` and `gamma` presumably have no effect with a linear kernel - the
# recorded results for gamma='auto' and gamma='scale' are identical; confirm in the SVC docs.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))


SVM Accuracy Score: Training Data ->  98.33333333333333
SVM Accuracy Score: Validation Data ->  85.41666666666666
SVM Accuracy Score: Test Data ->  85.0


In [100]:
# SVC with a linear kernel (C=1.0, gamma='scale').
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='scale')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))

SVM Accuracy Score: Training Data ->  98.33333333333333
SVM Accuracy Score: Validation Data ->  85.41666666666666
SVM Accuracy Score: Test Data ->  85.0


In [103]:
# SVC with an RBF kernel (C=1.0, gamma='auto').
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))

SVM Accuracy Score: Training Data ->  86.77083333333333
SVM Accuracy Score: Validation Data ->  73.75
SVM Accuracy Score: Test Data ->  68.33333333333333


In [104]:
# SVC with an RBF kernel (C=1.0, gamma='scale').
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='scale')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))


SVM Accuracy Score: Training Data ->  99.53125
SVM Accuracy Score: Validation Data ->  81.66666666666667
SVM Accuracy Score: Test Data ->  82.08333333333333


In [70]:
# SVC with a degree-4 polynomial kernel (C=1.0, gamma='scale').
SVM = svm.SVC(C=1.0, kernel='poly', degree=4, gamma='scale')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))


SVM Accuracy Score: Training Data ->  99.94791666666667
SVM Accuracy Score: Test Data ->  72.5
SVM Accuracy Score: Validation Data ->  70.41666666666667


In [105]:
# SVC with a degree-2 polynomial kernel (C=1.0, gamma='scale').
SVM = svm.SVC(C=1.0, kernel='poly', degree=2, gamma='scale')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))


SVM Accuracy Score: Training Data ->  99.6875
SVM Accuracy Score: Validation Data ->  82.08333333333333
SVM Accuracy Score: Test Data ->  81.66666666666667


In [106]:
# SVC with a degree-3 polynomial kernel (C=1.0, gamma='scale').
SVM = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='scale')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))


SVM Accuracy Score: Training Data ->  99.94791666666667
SVM Accuracy Score: Validation Data ->  76.25
SVM Accuracy Score: Test Data ->  75.41666666666667


In [107]:
# SVC with a degree-3 polynomial kernel (C=1.0, gamma='auto').
SVM = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))

SVM Accuracy Score: Training Data ->  93.59375
SVM Accuracy Score: Validation Data ->  64.16666666666667
SVM Accuracy Score: Test Data ->  61.25000000000001


In [108]:
# SVC with a sigmoid kernel (C=1.0, gamma='scale').
SVM = svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma='scale')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))


SVM Accuracy Score: Training Data ->  96.5625
SVM Accuracy Score: Validation Data ->  85.0
SVM Accuracy Score: Test Data ->  82.5


In [109]:
# SVC with a sigmoid kernel (C=1.0, gamma='auto').
SVM = svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_SVM_train = SVM.predict(Train_X_Tfidf)
predictions_SVM_val = SVM.predict(Val_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("SVM Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_SVM_train))
print("SVM Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_SVM_val))
print("SVM Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_SVM))


SVM Accuracy Score: Training Data ->  86.77083333333333
SVM Accuracy Score: Validation Data ->  73.75
SVM Accuracy Score: Test Data ->  68.33333333333333


In [117]:
# LinearSVC with its default hyper-parameters, trained on the TF-IDF training matrix.
LSVC = svm.LinearSVC()
LSVC.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_LSVC_train = LSVC.predict(Train_X_Tfidf)
predictions_LSVC_val = LSVC.predict(Val_X_Tfidf)
predictions_LSVC = LSVC.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("LSVC Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_LSVC_train))
print("LSVC Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_LSVC_val))
print("LSVC Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_LSVC))


LSVC Accuracy Score: Training Data ->  100.0
LSVC Accuracy Score: Validation Data ->  91.66666666666666
LSVC Accuracy Score: Test Data ->  89.58333333333334


In [120]:
# LinearSVC with C=10, trained on the TF-IDF training matrix.
LSVC = svm.LinearSVC(C=10)
LSVC.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_LSVC_train = LSVC.predict(Train_X_Tfidf)
predictions_LSVC_val = LSVC.predict(Val_X_Tfidf)
predictions_LSVC = LSVC.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("LSVC Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_LSVC_train))
print("LSVC Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_LSVC_val))
print("LSVC Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_LSVC))


LSVC Accuracy Score: Training Data ->  100.0
LSVC Accuracy Score: Validation Data ->  92.5
LSVC Accuracy Score: Test Data ->  91.66666666666666


In [123]:
# LinearSVC with C=10, the 'crammer_singer' multi-class strategy and up to 5000 iterations.
LSVC = svm.LinearSVC(C=10, multi_class='crammer_singer', max_iter=5000)
LSVC.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_LSVC_train = LSVC.predict(Train_X_Tfidf)
predictions_LSVC_val = LSVC.predict(Val_X_Tfidf)
predictions_LSVC = LSVC.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("LSVC Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_LSVC_train))
print("LSVC Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_LSVC_val))
print("LSVC Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_LSVC))


LSVC Accuracy Score: Training Data ->  100.0
LSVC Accuracy Score: Validation Data ->  92.08333333333333
LSVC Accuracy Score: Test Data ->  91.25




In [126]:
# NuSVC with its default hyper-parameters, trained on the TF-IDF training matrix.
NuSVC_model = svm.NuSVC()
NuSVC_model.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_NuSVC_train = NuSVC_model.predict(Train_X_Tfidf)
predictions_NuSVC_val = NuSVC_model.predict(Val_X_Tfidf)
predictions_NuSVC = NuSVC_model.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("NuSVC Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_NuSVC_train))
print("NuSVC Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_NuSVC_val))
print("NuSVC Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_NuSVC))


NuSVC Accuracy Score: Training Data ->  99.47916666666666
NuSVC Accuracy Score: Validation Data ->  83.75
NuSVC Accuracy Score: Test Data ->  82.5


# Naive Bayes

In [152]:
# Multinomial Naive Bayes with default hyper-parameters, trained on the TF-IDF training matrix.
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_NB_trainData = Naive.predict(Train_X_Tfidf)
predictions_NB_val = Naive.predict(Val_X_Tfidf)
predictions_NB = Naive.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
# The training label previously read "Daata"; the typo is corrected here.
print("Naive Bayes Accuracy Score : Training Data-> ", 100 * accuracy_score(Train_Y, predictions_NB_trainData))
print("Naive Bayes Accuracy Score : Validation Data-> ", 100 * accuracy_score(Val_Y, predictions_NB_val))
print("Naive Bayes Accuracy Score : Test Data-> ", 100 * accuracy_score(Test_Y, predictions_NB))



Naive Bayes Accuracy Score : Training Daata->  90.05208333333333
Naive Bayes Accuracy Score : Validation Data->  67.08333333333333
Naive Bayes Accuracy Score : Test Data->  70.41666666666667


# Neural Networks

## Multi Layer Perceptron

In [151]:
# Multi-layer perceptron with default hyper-parameters, trained on the TF-IDF training matrix.
MLP = neural_network.MLPClassifier()
MLP.fit(Train_X_Tfidf, Train_Y)

# Display the per-class probability estimates for the validation documents.
MLP.predict_proba(Val_X_Tfidf)

array([[1.11137537e-04, 6.23465007e-06, 3.47699031e-05, ...,
        3.18122873e-06, 7.78901107e-06, 2.59659859e-05],
       [4.95418578e-05, 4.20379544e-06, 8.88819664e-06, ...,
        1.70205114e-06, 3.86623892e-06, 1.49457982e-05],
       [7.27488943e-06, 4.42181578e-07, 9.78729249e-06, ...,
        5.80907526e-07, 3.55594515e-07, 5.65996427e-06],
       ...,
       [6.25602187e-03, 4.06119969e-04, 3.12391001e-06, ...,
        6.60446844e-06, 9.35429358e-04, 5.38171368e-05],
       [8.55918370e-04, 1.77732301e-03, 9.69034078e-06, ...,
        2.49627144e-04, 4.60137924e-04, 3.96444624e-04],
       [3.89523492e-03, 7.57347706e-05, 2.89311624e-07, ...,
        1.86764567e-04, 1.27687691e-04, 7.49962118e-04]])

In [158]:
# Predict author labels for every split with the MLP fitted in the previous cell.
predictions_MLP_train = MLP.predict(Train_X_Tfidf)
predictions_MLP_val = MLP.predict(Val_X_Tfidf)
predictions_MLP = MLP.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("MLP Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_MLP_train))
print("MLP Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_MLP_val))
print("MLP Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_MLP))


MLP Accuracy Score: Training Data ->  100.0
MLP Accuracy Score: Validation Data ->  90.41666666666667
MLP Accuracy Score: Test Data ->  90.83333333333333


In [160]:
# MLP with a single hidden layer of 200 units and up to 1000 training iterations.
MLP = neural_network.MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000)
MLP.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_MLP_train = MLP.predict(Train_X_Tfidf)
predictions_MLP_val = MLP.predict(Val_X_Tfidf)
predictions_MLP = MLP.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("MLP Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_MLP_train))
print("MLP Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_MLP_val))
print("MLP Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_MLP))


MLP Accuracy Score: Training Data ->  100.0
MLP Accuracy Score: Validation Data ->  92.5
MLP Accuracy Score: Test Data ->  91.25


In [161]:
# MLP with logistic activation and up to 500 training iterations.
MLP = neural_network.MLPClassifier(activation='logistic', max_iter=500)
MLP.fit(Train_X_Tfidf, Train_Y)

# Predict author labels for every split with the fitted model.
predictions_MLP_train = MLP.predict(Train_X_Tfidf)
predictions_MLP_val = MLP.predict(Val_X_Tfidf)
predictions_MLP = MLP.predict(Test_X_Tfidf)

# Accuracy per split in percent, passed as (y_true, y_pred); accuracy is symmetric.
print("MLP Accuracy Score: Training Data -> ", 100 * accuracy_score(Train_Y, predictions_MLP_train))
print("MLP Accuracy Score: Validation Data -> ", 100 * accuracy_score(Val_Y, predictions_MLP_val))
print("MLP Accuracy Score: Test Data -> ", 100 * accuracy_score(Test_Y, predictions_MLP))


MLP Accuracy Score: Training Data ->  100.0
MLP Accuracy Score: Validation Data ->  91.25
MLP Accuracy Score: Test Data ->  90.83333333333333
