In [1]:
import os
import json
import time
import random

import multiprocessing as mp


import io
import os.path
import re
import tarfile
import sys
import gensim
import pandas as pd
import numpy as np

from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords


from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from nltk import pos_tag


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm, naive_bayes, neural_network
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, make_scorer)

from nltk.tokenize import word_tokenize


In [3]:
repos_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

gutenberg_corpus_analysis_repo = os.path.join(repos_path, 'gutenberg_corpus_analysis')

In [5]:
gutenberg_repo_path = os.path.join(repos_path, 'gutenberg')
gutenberg_analysis_repo = os.path.join(repos_path, 'gutenberg-analysis')

src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(src_dir)
from data_io import get_book


gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

from metaquery import meta_query

sys.path.append(gutenberg_corpus_analysis_repo)
import misc_utils.dataset_filtering as dataset_filtering

In [9]:
dataset='nikita_dataset'

In [11]:
train_csv = os.path.join(gutenberg_corpus_analysis_repo, dataset, 'final_train.csv')
test_csv = os.path.join(gutenberg_corpus_analysis_repo, dataset, 'final_test.csv')
val_csv = os.path.join(gutenberg_corpus_analysis_repo, dataset, 'final_val.csv')

pg_catalog_filepath=os.path.join(gutenberg_repo_path, 'metadata', 'pg_catalog.csv')

In [14]:
train_df = pd.read_csv(train_csv, index_col='Unnamed: 0')
test_df = pd.read_csv(test_csv, index_col='Unnamed: 0')
val_df = pd.read_csv(val_csv, index_col='Unnamed: 0')

train_df.head()

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects
2439,PG12810,"Uncle Sam's Boys with Pershing's Troops: Or, D...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],78,"{'World War, 1914-1918 -- Juvenile fiction', '..."
2446,PG12819,"Dick Prescott's Second Year at West Point: Or,...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],94,{'United States Military Academy -- Juvenile f...
25920,PG40605,"The Motor Boat Club at Nantucket; or, The Myst...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],189,"{'Motorboats -- Juvenile fiction', 'Nantucket ..."
55435,PG8153,"The Young Engineers in Arizona; or, Laying Tra...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],190,"{'Civil engineers -- Fiction', 'Arizona -- Fic..."
32899,PG48863,"The Motor Boat Club off Long Island; or, A Dar...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],85,"{'Motorboats -- Juvenile fiction', 'Long Islan..."


In [16]:
len(train_df['author'].unique())

80

In [18]:
subj = train_df['subjects'].replace('set()',np.nan)
subj_docs = []
for h in subj:
    try:
        h = h.strip("{}")[1:-1]
    except AttributeError:
        subj_docs.append(h)
        continue
    h = h.replace(' -- ', '-')
    h = h.replace("', '","_")
    h = h.split('_')
    h = [item.replace(' ','').replace(',', ' ') for item in h]
    h = ' '.join(h)
    subj_docs.append(h)

In [20]:
train_df['subj_str']=subj_docs

In [22]:
#train_df['subject_str'] = train_df['subjects'].apply(lambda x: split_subjects(x))

In [24]:
#train_df = train_df.sample(500)

In [26]:
start = time.time()

train_df['text'] = train_df['id'].apply(lambda x: get_book(x, path_gutenberg=gutenberg_repo_path,level='text'))
test_df['text'] = test_df['id'].apply(lambda x: get_book(x, path_gutenberg=gutenberg_repo_path,level='text'))
val_df['text'] = val_df['id'].apply(lambda x: get_book(x, path_gutenberg=gutenberg_repo_path,level='text'))

end = time.time()
print(f'Time elapsed: {end-start} seconds')

Time elapsed: 5.520370244979858 seconds


In [73]:
# Define a function to apply the word, line and token counts
def enrich_dataframe(df):
    count_path = os.path.join(gutenberg_repo_path, 'data', 'counts')
    text_path = os.path.join(gutenberg_repo_path, 'data', 'text')
    token_path = os.path.join(gutenberg_repo_path, 'data', 'tokens')

    df['word_count'] = df['id'].apply(lambda pid: dataset_filtering.get_word_count(pid, count_path))
    df['unique_word_count'] = df['id'].apply(lambda pid: dataset_filtering.get_unique_word_count(pid, count_path))
    df['line_count'] = df['id'].apply(lambda pid: dataset_filtering.get_line_count(pid, text_path))
    df['token_count'] = df['id'].apply(lambda pid: dataset_filtering.get_token_count(pid, token_path))

    return df

In [74]:
train_df = enrich_dataframe(train_df)
val_df = enrich_dataframe(val_df)
test_df = enrich_dataframe(test_df)

In [27]:
def skip_start_and_end(text):
    text = text.split(' ')
    text = text[50:-50]
    return ' '.join(text)

train_df['text'] = train_df['text'].apply(skip_start_and_end)
test_df['text'] = test_df['text'].apply(skip_start_and_end)
val_df['text'] = val_df['text'].apply(skip_start_and_end)


In [28]:
def make_random_chunks(text, num_chunks=10, chunk_size=1000, overlap=False):
    chunk = []
    words = text.split(' ')

    if num_chunks * chunk_size > len(words):
        return text
    for i in range(num_chunks):
        new_words = []
        num_words = len(words)
        if chunk_size > num_words:
            chunk = chunk + words
            words = []
            return ' '.join(chunk)
        start = random.randint(0, num_words)
        chunk = [*chunk,  *words[start:start+chunk_size]]
        #print(chunk)
        if start == 0:
            words = words[chunk_size:]
        elif start == num_words - chunk_size:
            words = words[0:start]
        else:
            words = words[0:start] + words[start+chunk_size:]
    return ' '.join(chunk)

In [48]:
train_df['chunks'] = train_df['text'].apply(lambda x: make_random_chunks(x, num_chunks=10, chunk_size = 500, overlap=False))
test_df['chunks'] = test_df['text'].apply(lambda x: make_random_chunks(x, num_chunks=10, chunk_size = 500, overlap=False))
val_df['chunks'] = val_df['text'].apply(lambda x: make_random_chunks(x, num_chunks=10, chunk_size = 500, overlap=False))

In [49]:
start = time.time()
with mp.Pool(11) as pool:
    train_df['tokenized'] = pool.map(word_tokenize, train_df['chunks'])
end = time.time()
print(f'Took {end-start} seconds')


Took 7.0299530029296875 seconds


In [50]:
start = time.time()
with mp.Pool(11) as pool:
    val_df['tokenized'] = pool.map(word_tokenize, val_df['chunks'])
end = time.time()
print(f'Took {end-start} seconds')



Took 2.0713865756988525 seconds


In [51]:
start = time.time()
with mp.Pool(11) as pool:
    test_df['tokenized'] = pool.map(word_tokenize, test_df['chunks'])
end = time.time()
print(f'Took {end-start} seconds')



Took 2.064713716506958 seconds


In [26]:
outfile=os.path.join(gutenberg_corpus_analysis_repo, 'tokenized', 'train_df_chunks_tokenized.pkl')
train_df.to_pickle(outfile)

outfile=os.path.join(gutenberg_corpus_analysis_repo, 'tokenized', 'val_df_chunks_tokenized.pkl')
val_df.to_pickle(outfile)

outfile=os.path.join(gutenberg_corpus_analysis_repo, 'tokenized', 'test_df_chunks_tokenized.pkl')
test_df.to_pickle(outfile)



In [None]:
train_df = pd.read_pickle(outfile)

In [31]:
train_df[train_df['tokenized'].isnull()]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,subj_str,text,chunks,tokenized


In [32]:
test_df[test_df['tokenized'].isnull()]

Unnamed: 0_level_0,Unnamed: 0.1,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,text,chunks,tokenized
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [33]:
val_df[val_df['tokenized'].isnull()]

Unnamed: 0_level_0,Unnamed: 0.1,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,text,chunks,tokenized
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [None]:
train_df['author'].unique()

In [36]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV                

In [38]:
def lemmatize_text(tokenized_text):
    # Declaring Empty List to store the words that follow the rules for this step
    final_words = []
    # Initializing WordNetLemmatizer()
    word_lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(tokenized_text):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lemmatized.lemmatize(word,tag_map[tag[0]])
            final_words.append(word_final)
    return str(final_words)
    # The final processed set of words for each iteration will be stored in 'text_final'

In [None]:
start = time.time()
with mp.Pool(11) as pool:
    lemmatized = pool.map(lemmatize_text, train_df['tokenized'])
end = time.time()
print(f'Took {end-start} seconds')
train_df['lemmatized'] = lemmatized

In [None]:
import multiprocessing as mp
start = time.time()
with mp.Pool(11) as pool:
    lemmatized = pool.map(lemmatize_text, val_df['tokenized'])
end = time.time()
print(f'Took {end-start} seconds')
val_df['lemmatized'] = lemmatized

In [None]:
import multiprocessing as mp
start = time.time()
with mp.Pool(11) as pool:
    lemmatized = pool.map(lemmatize_text, test_df['tokenized'])
end = time.time()
print(f'Took {end-start} seconds')
test_df['lemmatized'] = lemmatized

In [None]:
train_df.drop('text', axis=1, inplace=True)


val_df.drop('text', axis=1, inplace=True)


test_df.drop('text', axis=1, inplace=True)



In [67]:
outfile=os.path.join(gutenberg_corpus_analysis_repo, 'tokenized', 'train_df_10chunks1000_Lemmatized.pkl')
train_df=pd.read_pickle(outfile)
#train_df.to_pickle(outfile)

outfile=os.path.join(gutenberg_corpus_analysis_repo, 'tokenized', 'val_df_10chunks1000_Lemmatized.pkl')
val_df= pd.read_pickle(outfile)
#val_df.to_pickle(outfile)

outfile=os.path.join(gutenberg_corpus_analysis_repo, 'tokenized', 'test_df_10chunks1000_Lemmatized.pkl')
test_df  = pd.read_pickle(outfile)
#test_df.to_pickle(outfile)



In [68]:
Train_X, Train_Y = train_df['lemmatized'], train_df['author']
Test_X, Test_Y = test_df['lemmatized'], test_df['author']
Val_X, Val_Y = val_df['lemmatized'], val_df['author']

In [49]:
# Encoder = LabelEncoder()
# Train_Y_e = Encoder.fit_transform(Train_Y)
# Test_Y_e = Encoder.fit_transform(Test_Y)
# Val_Y_e = Encoder.fit_transform(Val_Y)


In [71]:
Tfidf_vect = TfidfVectorizer(max_features=15000)
Tfidf_vect.fit(train_df['lemmatized'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
Val_X_Tfidf = Tfidf_vect.transform(Val_X)


In [72]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, res_file, model_description, preproc_desc):
    # Train and predict
    start = time.time()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    end = time.time()

    print(f'Training and predicting took {end-start} seconds = {(end-start)/60} minutes')

    results={}
    for label, y_truth, y_pred in [('train', y_train, y_train_pred), 
                            ('validation', y_val, y_val_pred),
                            ('test', y_test, y_test_pred)]:
        # Metrics (set zero_division=0 to silence warnings)
        acc = accuracy_score(y_truth, y_pred)
        f1 = f1_score(y_truth, y_pred, average='weighted', zero_division=0)
        precision = precision_score(y_truth, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_truth, y_pred, average='weighted', zero_division=0)

        result_dict = {'accuracy': acc,
                       'precision': precision,
                       'recall' : recall,
                       'f1' : f1}
        results[label] = result_dict
        
        
    # Print performance
    print(f"Model: {model.__class__.__name__}")
    label_str=f'|{'':<15} ||  {'Accuracy':>15} | {'Precision':>15} | {'Recall':>15} | {'F1-Score':>15} |'
    print("-" * len(label_str))

    print(label_str)
    print("-" * len(label_str))

    for result_label, sub_res_dict in results.items():
        output_str = f'|{result_label:<15} || '
        
        for key, val in sub_res_dict.items():
            output_str += f' {val:15.4f} |'
        print(output_str)

    print("-" * len(label_str))

    new_res_df = results_to_df(model.__class__.__name__, model_description, preproc_desc, results)
    
    if os.path.exists(res_file):
        old_res_df = pd.read_csv(res_file)
        old_res_df.set_index(['model_type', 'description', 'preprocessing description', 'metric'], inplace=True)
    
        res_df = pd.concat([old_res_df, new_res_df])
        res_df.to_csv(res_file)
    else:
        new_res_df.to_csv(res_file)
        

    return model, results

In [73]:
def results_to_df(model_type, model_desc, preproc_desc, result_dict):
    res_df = pd.DataFrame.from_dict(result_dict)
    res_df['model_type'] = model_type
    res_df['description'] = model_desc
    res_df['preprocessing description'] = preproc_desc
    res_df.reset_index(inplace=True)
    res_df.rename({'index':'metric'}, axis=1, inplace=True)
    res_df.set_index(['model_type', 'description', 'preprocessing description', 'metric'], inplace=True)

    return res_df
    

In [79]:
outfile=os.path.join(gutenberg_corpus_analysis_repo,'results', 'MLP.csv')
preprocessing_description = '10 chunks, 1000 long, tf-idf 15000 max feat'

# Neural Networks

## Multi Layer Perceptron

## 1 Hidden Layer, sizes 100, 200, 500

In [83]:
model = neural_network.MLPClassifier()
model_desc='default'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 75.8631980419159 seconds = 1.2643866340319316 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9000 |          0.9156 |          0.9000 |          0.8987 |
|test            ||           0.9042 |          0.9194 |          0.9042 |          0.8982 |
--------------------------------------------------------------------------------------------


In [100]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,))
model_desc='hidden_layer_size 200'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 125.62376499176025 seconds = 2.0937294165293374 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9125 |          0.9277 |          0.9125 |          0.9102 |
|test            ||           0.9125 |          0.9237 |          0.9125 |          0.9071 |
--------------------------------------------------------------------------------------------


In [109]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,))
model_desc='hidden_layer_size 500'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 296.95596957206726 seconds = 4.949266159534455 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9042 |          0.9231 |          0.9042 |          0.9013 |
|test            ||           0.9125 |          0.9129 |          0.9125 |          0.9036 |
--------------------------------------------------------------------------------------------


## 2 Hidden Layers, sizes 100, 200, 500

In [95]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(100,100,))
model_desc='2 hidden_layer_size 100'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 50.64549684524536 seconds = 0.8440916140874227 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8583 |          0.8925 |          0.8583 |          0.8575 |
|test            ||           0.8708 |          0.8837 |          0.8708 |          0.8608 |
--------------------------------------------------------------------------------------------


In [101]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,200,))
model_desc='2 hidden_layer_size 200'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 89.51257276535034 seconds = 1.491876212755839 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8917 |          0.9131 |          0.8917 |          0.8904 |
|test            ||           0.8833 |          0.9029 |          0.8833 |          0.8785 |
--------------------------------------------------------------------------------------------


In [110]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,500,))
model_desc='2 hidden_layer_size 500'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 202.14327549934387 seconds = 3.3690545916557313 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8750 |          0.8992 |          0.8750 |          0.8726 |
|test            ||           0.9000 |          0.9131 |          0.9000 |          0.8921 |
--------------------------------------------------------------------------------------------


## 3 Hidden Layers, sizes 100, 200, 500

In [96]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(100,100,100,))
model_desc='3 hidden_layer_size 100'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 40.618685722351074 seconds = 0.6769780953725179 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.7792 |          0.8392 |          0.7792 |          0.7762 |
|test            ||           0.7917 |          0.8172 |          0.7917 |          0.7796 |
--------------------------------------------------------------------------------------------


In [102]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,200,200,))
model_desc='3 hidden_layer_size 200'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 63.55175423622131 seconds = 1.0591959039370218 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.7958 |          0.8395 |          0.7958 |          0.7894 |
|test            ||           0.8042 |          0.8554 |          0.8042 |          0.7998 |
--------------------------------------------------------------------------------------------


In [111]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,500,500,))
model_desc='3 hidden_layer_size 500'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 154.4944007396698 seconds = 2.5749066789944965 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8583 |          0.8971 |          0.8583 |          0.8599 |
|test            ||           0.8417 |          0.8660 |          0.8417 |          0.8305 |
--------------------------------------------------------------------------------------------


## Miscellaneous networks

In [565]:
model = neural_network.MLPClassifier(activation='logistic', max_iter=500)
model_desc='logistic act, 500 iter'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 99.63221025466919 seconds = 1.6605368375778198 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9167 |          0.9344 |          0.9167 |          0.9151 |
|test            ||           0.9042 |          0.9162 |          0.9042 |          0.8997 |
--------------------------------------------------------------------------------------------


In [596]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,), activation='logistic', max_iter=500)
model_desc='hidden layer size 200, logistic act, 500 iter'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 273.6840138435364 seconds = 4.5614002307256065 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9083 |          0.9356 |          0.9083 |          0.9081 |
|test            ||           0.9250 |          0.9350 |          0.9250 |          0.9187 |
--------------------------------------------------------------------------------------------


In [612]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,), activation='logistic', max_iter=500, learning_rate='invscaling', solver='sgd')
model_desc='hidden layer size 200, logistic act, 500 iter, sgd-invscaling'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 8.986948013305664 seconds = 0.14978246688842772 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           0.0125 |          0.0002 |          0.0125 |          0.0003 |
|validation      ||           0.0125 |          0.0002 |          0.0125 |          0.0003 |
|test            ||           0.0125 |          0.0002 |          0.0125 |          0.0003 |
--------------------------------------------------------------------------------------------


In [604]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,), activation='relu', max_iter=500, learning_rate='invscaling', solver='sgd')
model_desc='hidden layer size 500, relu act, 500 iter, sgd-invscaling'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 21.707059860229492 seconds = 0.36178433100382484 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           0.0109 |          0.0005 |          0.0109 |          0.0009 |
|validation      ||           0.0125 |          0.0005 |          0.0125 |          0.0010 |
|test            ||           0.0083 |          0.0002 |          0.0083 |          0.0004 |
--------------------------------------------------------------------------------------------


In [617]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,), activation='relu', max_iter=500, learning_rate='constant', solver='sgd')
model_desc='hidden layer size 500, relu act, 500 iter, sgd-constant'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)



Training and predicting took 851.5928704738617 seconds = 14.193214507897695 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           0.4568 |          0.7121 |          0.4568 |          0.4622 |
|validation      ||           0.3250 |          0.3742 |          0.3250 |          0.3133 |
|test            ||           0.3375 |          0.3766 |          0.3375 |          0.3095 |
--------------------------------------------------------------------------------------------


In [614]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,), activation='relu', max_iter=500, solver='adam')
model_desc='hidden layer size 500, relu act, 500 iter, adam'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 185.4526436328888 seconds = 3.0908773938814798 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9292 |          0.9440 |          0.9292 |          0.9265 |
|test            ||           0.9167 |          0.9225 |          0.9167 |          0.9103 |
--------------------------------------------------------------------------------------------


In [609]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,200, 100, ), activation='relu', max_iter=500, learning_rate='invscaling', solver='sgd')
model_desc='hidden layer size 500-200-100, relu act, 500 iter, sgd-invscaling'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 23.782085180282593 seconds = 0.3963680863380432 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           0.0146 |          0.0004 |          0.0146 |          0.0007 |
|validation      ||           0.0083 |          0.0002 |          0.0083 |          0.0004 |
|test            ||           0.0083 |          0.0002 |          0.0083 |          0.0005 |
--------------------------------------------------------------------------------------------


In [618]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,200, 100, ), activation='relu', max_iter=500, learning_rate='constant', solver='sgd')
model_desc='hidden layer size 500-200-100, relu act, 500 iter, sgd-constant'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 138.3572220802307 seconds = 2.3059537013371787 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           0.0198 |          0.0008 |          0.0198 |          0.0015 |
|validation      ||           0.0083 |          0.0004 |          0.0083 |          0.0007 |
|test            ||           0.0292 |          0.0013 |          0.0292 |          0.0024 |
--------------------------------------------------------------------------------------------


In [615]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(500,200, 100, ), activation='relu', max_iter=500, solver='adam')
model_desc='hidden layer size 500-200-100, relu act, 500 iter, adam'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 127.03732872009277 seconds = 2.1172888120015463 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8833 |          0.9142 |          0.8833 |          0.8776 |
|test            ||           0.8375 |          0.8545 |          0.8375 |          0.8272 |
--------------------------------------------------------------------------------------------


In [606]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,100, ), activation='relu', max_iter=500, learning_rate='invscaling', solver='sgd')
model_desc='hidden layer size 200-100, relu act, 500 iter, sgd-invscaling'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 8.672466516494751 seconds = 0.14454110860824584 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           0.0135 |          0.0008 |          0.0135 |          0.0013 |
|validation      ||           0.0167 |          0.0010 |          0.0167 |          0.0017 |
|test            ||           0.0125 |          0.0002 |          0.0125 |          0.0003 |
--------------------------------------------------------------------------------------------


In [616]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,100, ), activation='relu', max_iter=500, solver='adam')
model_desc='hidden layer size 200-100, relu act, 500 iter, adam'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 67.76165580749512 seconds = 1.1293609301249186 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8958 |          0.9117 |          0.8958 |          0.8930 |
|test            ||           0.9000 |          0.9237 |          0.9000 |          0.8950 |
--------------------------------------------------------------------------------------------


In [607]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,200, ), activation='relu', max_iter=500, learning_rate='invscaling', solver='sgd')
model_desc='hidden layer size 200-200, relu act, 500 iter, sgd-invscaling'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 8.011568307876587 seconds = 0.13352613846460978 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           0.0125 |          0.0002 |          0.0125 |          0.0003 |
|validation      ||           0.0125 |          0.0002 |          0.0125 |          0.0003 |
|test            ||           0.0125 |          0.0002 |          0.0125 |          0.0003 |
--------------------------------------------------------------------------------------------


In [619]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,200, ), activation='relu', max_iter=500, learning_rate='constant', solver='sgd')
model_desc='hidden layer size 200-200, relu act, 500 iter, sgd-constant'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 27.243923902511597 seconds = 0.4540653983751933 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           0.0135 |          0.0004 |          0.0135 |          0.0007 |
|validation      ||           0.0125 |          0.0003 |          0.0125 |          0.0006 |
|test            ||           0.0125 |          0.0003 |          0.0125 |          0.0006 |
--------------------------------------------------------------------------------------------


In [620]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(200,200, ), activation='relu', max_iter=500, solver='adam')
model_desc='hidden layer size 200-200, relu act, 500 iter, adam'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 58.8208589553833 seconds = 0.9803476492563884 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8792 |          0.8996 |          0.8792 |          0.8734 |
|test            ||           0.8750 |          0.8904 |          0.8750 |          0.8653 |
--------------------------------------------------------------------------------------------


In [636]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(100,50, ), activation='relu', max_iter=500, solver='adam')
model_desc='hidden layer size 100-50, relu act, 500 iter, adam'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 28.916454076766968 seconds = 0.48194090127944944 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8833 |          0.9079 |          0.8833 |          0.8819 |
|test            ||           0.8792 |          0.8940 |          0.8792 |          0.8689 |
--------------------------------------------------------------------------------------------


In [637]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(100,100, ), activation='relu', max_iter=500, solver='adam')
model_desc='hidden layer size 100-100, relu act, 500 iter, adam'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 33.85341835021973 seconds = 0.5642236391703288 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8292 |          0.8498 |          0.8292 |          0.8212 |
|test            ||           0.8875 |          0.9099 |          0.8875 |          0.8815 |
--------------------------------------------------------------------------------------------


In [638]:
model = neural_network.MLPClassifier(hidden_layer_sizes=(100,100, 100,), activation='relu', max_iter=500, solver='adam')
model_desc='hidden layer size 100-100-100, relu act, 500 iter, adam'
model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, model_desc, preprocessing_description)

Training and predicting took 25.500365018844604 seconds = 0.42500608364741005 minutes
Model: MLPClassifier
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.8292 |          0.8617 |          0.8292 |          0.8239 |
|test            ||           0.8208 |          0.8352 |          0.8208 |          0.8115 |
--------------------------------------------------------------------------------------------
