# Use this script to quickly test many model types against a single set of parameters

Currently supports only lemmatized text.
Planned for later: stemming, and no stemming/lemmatizing.

In [None]:
# ----------------------------------------------------------------------
# Run configuration -- edit the values below before running the notebook
# ----------------------------------------------------------------------

# When CHUNKING is False the full text is used.
CHUNKING = True

# The next three values only matter when CHUNKING is True.
NUM_CHUNKS = 10
CHUNK_SIZE = 100
CHUNK_OVERLAP = False

# TF-IDF vectoriser settings.
TF_IDF_MAX_FEATURES = 15000
TF_IDF_N_GRAM = (1, 3)

LEMMATIZATION = True

In [None]:
import os
import time
import tqdm
import random
import itertools

from tqdm.contrib.concurrent import process_map


import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, neural_network
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, make_scorer)
import multiprocessing as mp

from src import data_loader

%load_ext autoreload
%autoreload 2

In [None]:
# Absolute path of the directory that contains the current working directory.
_parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
repos_path = os.path.abspath(_parent_of_cwd)

In [None]:
# `sys` is required for the sys.path manipulation below and is not imported
# by the notebook's import cell, so bring it into scope here.
import sys

# Locations of the two sibling repositories next to this one.
gutenberg_repo_path = os.path.join(repos_path, 'gutenberg')
gutenberg_analysis_repo = os.path.join(repos_path, 'gutenberg-analysis')

# Make the gutenberg-analysis sources importable (provides data_io.get_book).
src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(src_dir)
from data_io import get_book


# Make the gutenberg sources importable (provides metaquery.meta_query).
gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

from metaquery import meta_query

# The current working directory is used as the root for misc_utils and for
# all output files written later in the notebook.
gca_path = os.path.abspath(os.getcwd())
sys.path.append(gca_path)
import misc_utils.dataset_filtering as dataset_filtering

In [None]:
# Name of the dataset handed to data_loader.load_dfs.
dataset = 'nikita_dataset'

In [None]:
# Load the three data frames (train / validation / test) for the selected dataset.
# To specify the file locations of the data frames explicitly, load_dfs also
# takes the arguments dataset_folder, train_csv, val_csv and test_csv.
train_df, val_df, test_df = data_loader.load_dfs(dataset)

In [None]:
def _subjects_to_doc(raw):
    """Flatten one stringified subject set into a space-separated string.

    Values without a ``strip`` method (e.g. NaN) are returned unchanged.
    """
    try:
        # Remove the surrounding braces, then the first and last character.
        body = raw.strip("{}")[1:-1]
    except AttributeError:
        return raw
    # Collapse sub-headings, then split on the separator between set items.
    pieces = body.replace(' -- ', '-').replace("', '", "_").split('_')
    # Inside one subject: remove spaces, then turn commas into spaces.
    return ' '.join(piece.replace(' ', '').replace(',', ' ') for piece in pieces)


# The literal string 'set()' is treated as a missing value.
subj = train_df['subjects'].replace('set()', np.nan)
subj_docs = [_subjects_to_doc(entry) for entry in subj]

In [None]:
# Store the flattened subject strings as a new column of the training frame.
train_df['subj_str']=subj_docs

In [None]:
def _load_text(book_id):
    """Fetch one book at level 'text' from the gutenberg repository via get_book."""
    return get_book(book_id, path_gutenberg=gutenberg_repo_path,level='text')


# Same order as before: train, test, validation.
for _frame in (train_df, test_df, val_df):
    _frame['text'] = _frame['id'].apply(_load_text)

In [None]:
def enrich_dataframe(df):
    """Add word, unique-word, line and token count columns to ``df``.

    Each count is looked up per value of ``df['id']`` through the matching
    ``dataset_filtering`` helper, which is given the relevant sub-directory
    of ``<gutenberg_repo_path>/data``. ``df`` is modified in place and also
    returned.
    """
    data_root = os.path.join(gutenberg_repo_path, 'data')
    count_path = os.path.join(data_root, 'counts')
    text_path = os.path.join(data_root, 'text')
    token_path = os.path.join(data_root, 'tokens')

    # (new column, lookup function, directory handed to the lookup function)
    column_sources = (
        ('word_count', dataset_filtering.get_word_count, count_path),
        ('unique_word_count', dataset_filtering.get_unique_word_count, count_path),
        ('line_count', dataset_filtering.get_line_count, text_path),
        ('token_count', dataset_filtering.get_token_count, token_path),
    )
    for column, lookup, directory in column_sources:
        # Bind the loop variables as defaults so each lambda keeps its own pair.
        df[column] = df['id'].apply(lambda pid, fn=lookup, where=directory: fn(pid, where))

    return df

# Only the training frame is passed through enrich_dataframe here.
train_df = enrich_dataframe(train_df)

In [None]:
# Summary statistics of the word_count column of the training frame.
train_df['word_count'].describe()

In [None]:
def skip_start_and_end(text, num_chars=100):
    """Drop the first and last ``num_chars`` space-separated words of ``text``.

    Despite its name, ``num_chars`` counts words (pieces produced by
    ``text.split(' ')``), not characters.

    Args:
        text: The string to trim.
        num_chars: Number of words removed from each end. Values <= 0 leave
            the text unchanged.

    Returns:
        The remaining words joined by single spaces. This is an empty string
        when the text has no more than ``2 * num_chars`` words.
    """
    # A slice ending at -0 is empty, so "trim nothing" needs its own branch;
    # otherwise num_chars=0 would discard the whole text.
    if num_chars <= 0:
        return text
    words = text.split(' ')
    return ' '.join(words[num_chars:-num_chars])

# Trim the start and end of every text; order is train, test, validation.
for _frame in (train_df, test_df, val_df):
    _frame['text'] = _frame['text'].apply(skip_start_and_end)


In [None]:
def make_random_chunks(text, num_chunks=10, chunk_size=1000, overlap=False):
    """Sample ``num_chunks`` random runs of ``chunk_size`` words from ``text``.

    Words are the pieces produced by ``text.split(' ')``. The sampled runs are
    concatenated in the order they were drawn.

    Args:
        text: Source string.
        num_chunks: Number of runs to draw.
        chunk_size: Number of words per run.
        overlap: When False (default), the words of each drawn run are removed
            from the pool before the next draw, so no word position is used
            twice (a later run may still span the seam left by a removed run).
            When True, every run is drawn from the full text and runs may
            overlap.

    Returns:
        ``text`` unchanged if it has fewer than ``num_chunks * chunk_size``
        words; otherwise the sampled words joined by single spaces.
    """
    words = text.split(' ')

    # Not enough material for the requested sample: keep the text as it is.
    if num_chunks * chunk_size > len(words):
        return text

    sampled = []
    for _ in range(num_chunks):
        # random.randint includes both end points, so the upper bound must be
        # the last index that still leaves room for a complete chunk. The
        # size check above guarantees this bound never drops below zero.
        last_start = len(words) - chunk_size
        start = random.randint(0, last_start)
        sampled.extend(words[start:start + chunk_size])
        if not overlap:
            # Remove the used words so later chunks cannot pick them again.
            del words[start:start + chunk_size]
    return ' '.join(sampled)

In [None]:
if CHUNKING is True:

    def _sample_chunks(text):
        """Run make_random_chunks with the notebook-level chunking settings."""
        return make_random_chunks(text, num_chunks=NUM_CHUNKS, chunk_size = CHUNK_SIZE, overlap=CHUNK_OVERLAP)

    # Order is train, test, validation.
    for _frame in (train_df, test_df, val_df):
        _frame['text'] = _frame['text'].apply(_sample_chunks)

In [None]:

# Input column for tokenisation and the column that receives its output.
x_col, tokenized_col = 'text', 'tokenized'

In [None]:

# Tokenise every text in worker processes; order is train, test, validation.
for _frame in (train_df, test_df, val_df):
    _frame[tokenized_col] = process_map(word_tokenize, _frame[x_col], max_workers=11, chunksize=5)


In [None]:
# Quick check: True if any training row has a null 'tokenized' value.
train_df['tokenized'].isnull().any()

In [None]:
# Warn for every frame that contains null entries in its 'tokenized' column.
for _frame, _warning in (
    (train_df, 'Warning: There are null elements in train_df'),
    (val_df, 'Warning: There are null elements in val_df'),
    (test_df, 'Warning: There are null elements in test_df'),
):
    if _frame['tokenized'].isnull().any():
        print(_warning)




In [None]:
# Optionally persist the tokenized frames so that the tokenisation step
# does not have to be repeated in a later session.
_tokenized_dir = os.path.join(gca_path, 'tokenized')
for _frame, _filename in (
    (train_df, 'train_df_full_text_tokenized.pkl'),
    (test_df, 'test_df_full_text_tokenized.pkl'),
    (val_df, 'val_df_full_text_tokenized.pkl'),
):
    outfile = os.path.join(_tokenized_dir, _filename)
    _frame.to_pickle(outfile)

In [None]:
def stem_that_thang(tokenized_text):
    """Remove English stop words from a token list and Porter-stem the rest.

    Args:
        tokenized_text: Iterable of word tokens.

    Returns:
        ``str()`` of the list of stemmed tokens (the textual representation
        of the list, not the list itself).
    """
    # Build the stop-word set once per call: stopwords.words() would
    # otherwise be invoked for every token, and set membership avoids a
    # linear scan of the stop-word list.
    english_stopwords = set(stopwords.words('english'))
    ps = PorterStemmer()
    final_words = [ps.stem(word) for word in tokenized_text
                   if word not in english_stopwords]

    return str(final_words)

In [None]:
def lemmatize_that_thang(tokenized_text):
    """POS-tag a token list, drop stop words / non-alphabetic tokens, lemmatize.

    Args:
        tokenized_text: List of word tokens, passed to ``pos_tag``.

    Returns:
        ``str()`` of the list of lemmatized tokens (the textual representation
        of the list, not the list itself).
    """
    # Map the first letter of the POS tag to a WordNet part of speech;
    # any other tag is treated as a noun.
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Build the stop-word set once per call: stopwords.words() would
    # otherwise be invoked for every token, and set membership avoids a
    # linear scan of the stop-word list.
    english_stopwords = set(stopwords.words('english'))
    word_lemmatized = WordNetLemmatizer()

    final_words = []
    # pos_tag yields (word, tag) pairs; the tag picks the lemmatizer's POS.
    for word, tag in pos_tag(tokenized_text):
        # Keep only alphabetic tokens that are not stop words.
        if word not in english_stopwords and word.isalpha():
            final_words.append(word_lemmatized.lemmatize(word, tag_map[tag[0]]))
    return str(final_words)

## Lemmatize the text and save out the DFs

In [None]:
# Lemmatize every token list in worker processes; order is train, validation, test.
for _frame in (train_df, val_df, test_df):
    _frame['lemmatized'] = process_map(lemmatize_that_thang, _frame['tokenized'], max_workers=10, chunksize=5)



In [None]:
# Persist the frames including the new 'lemmatized' column.
_lemmatized_dir = os.path.join(gca_path, 'nikita_dataset')
for _frame, _filename in (
    (train_df, 'train_df_full_text_lemmatized.pkl'),
    (val_df, 'val_df_full_text_lemmatized.pkl'),
    (test_df, 'test_df_full_text_lemmatized.pkl'),
):
    outfile = os.path.join(_lemmatized_dir, _filename)
    _frame.to_pickle(outfile)

# Read in the lemmatized texts, if you have already saved them out

In [None]:
# Paths of the previously saved lemmatized frames.
_lemmatized_dir = os.path.join(gca_path, 'nikita_dataset')
train_pkl = os.path.join(_lemmatized_dir, 'train_df_full_text_lemmatized.pkl')
val_pkl = os.path.join(_lemmatized_dir, 'val_df_full_text_lemmatized.pkl')
test_pkl = os.path.join(_lemmatized_dir, 'test_df_full_text_lemmatized.pkl')

# Load them back in the order train, validation, test.
train_df, val_df, test_df = (
    pd.read_pickle(_pkl) for _pkl in (train_pkl, val_pkl, test_pkl)
)

## Save a version of the dataframe without the text column, for memory reasons

In [None]:
# Drop the raw text column in place, then save each slimmed-down frame.
_dataset_dir = os.path.join(gca_path, 'nikita_dataset')
for _frame, _filename in (
    (train_df, 'train_df_full_text_lemmatized_noRawText.pkl'),
    (val_df, 'val_df_full_text_lemmatized_noRawText.pkl'),
    (test_df, 'test_df_full_text_lemmatized_noRawText.pkl'),
):
    _frame.drop('text', axis=1, inplace=True)
    outfile = os.path.join(_dataset_dir, _filename)
    _frame.to_pickle(outfile)


## Drop both the tokenization and text columns, for memory reasons

In [None]:
# Drop the token and raw-text columns in place. errors='ignore' keeps this
# cell runnable when 'text' has already been removed by the preceding cell
# (a plain drop raises KeyError for a missing label).
for _frame in (train_df, val_df, test_df):
    _frame.drop(['tokenized', 'text'], axis=1, inplace=True, errors='ignore')

In [None]:
# Save the lemmatized-only frames. The training frame gets its own
# 'train_df_...' file; it must not share the test frame's filename, or the
# later test_df write would overwrite it.
outfile=os.path.join(gca_path, 'nikita_dataset', 'train_df_full_text_lemmatizedOnly.pkl')
train_df.to_pickle(outfile)

outfile=os.path.join(gca_path, 'nikita_dataset', 'val_df_full_text_lemmatizedOnly.pkl')
val_df.to_pickle(outfile)

outfile=os.path.join(gca_path, 'nikita_dataset', 'test_df_full_text_lemmatizedOnly.pkl')
test_df.to_pickle(outfile)

# Lemmatization, English stop words

In [None]:
# Column used as model input below; swap in the commented line to use the
# 'tokenized' column instead.
#x_col = 'tokenized'
x_col='lemmatized'

In [None]:
# Model input (the selected x_col) and target (the 'author' column) per frame.
Train_X = train_df[x_col]
Train_Y = train_df['author']

Test_X = test_df[x_col]
Test_Y = test_df['author']

Val_X = val_df[x_col]
Val_Y = val_df['author']

In [None]:
# Fit the encoder ONCE, on the labels of all three splits, and then only
# transform. Calling fit_transform per split re-learns the mapping each time,
# so the same author could receive a different integer in each split.
Encoder = LabelEncoder()
Encoder.fit(pd.concat([Train_Y, Val_Y, Test_Y]))
Train_Y_e = Encoder.transform(Train_Y)
Test_Y_e = Encoder.transform(Test_Y)
Val_Y_e = Encoder.transform(Val_Y)


In [None]:

Tfidf_vect = TfidfVectorizer(
    stop_words='english', # Removes common English words (it, and, that, is, ...) using scikit-learn's predefined list.
    sublinear_tf=True, # Logarithmic term-frequency weighting: reduces the weight of extremely frequent terms.
    max_features=TF_IDF_MAX_FEATURES, # Consideration for both overfitting and computational requirements.
    ngram_range=TF_IDF_N_GRAM
)

# Learn the vocabulary on the training texts and vectorise them in the same
# pass, instead of discarding the fit_transform result and transforming the
# training data a second time.
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
Val_X_Tfidf = Tfidf_vect.transform(Val_X)


In [None]:
#print(Tfidf_vect.vocabulary_)

In [None]:
def _print_results_table(model, model_description, preproc_desc, results):
    """Print the model header followed by one table row of metrics per split."""
    print(f"Model: {model.__class__.__name__}")
    print(f'Description: {model_description}')
    print(f'Pre-processing: {preproc_desc}')
    # The outer quotes differ from the quotes inside the replacement fields:
    # reusing the same quote character is a SyntaxError before Python 3.12.
    label_str = f"|{'':<15} ||  {'Accuracy':>15} | {'Precision':>15} | {'Recall':>15} | {'F1-Score':>15} |"
    rule = "-" * len(label_str)

    print(rule)
    print(label_str)
    print(rule)
    for result_label, sub_res_dict in results.items():
        cells = ''.join(f' {val:15.4f} |' for val in sub_res_dict.values())
        print(f'|{result_label:<15} || {cells}')
    print(rule)


def _append_results(res_file, new_res_df):
    """Write ``new_res_df`` to ``res_file``, after any rows already stored there."""
    if os.path.exists(res_file):
        old_res_df = pd.read_csv(res_file)
        # Restore the same index as new_res_df so both frames line up.
        old_res_df.set_index(['model_type', 'description', 'preprocessing description', 'metric'], inplace=True)
        pd.concat([old_res_df, new_res_df]).to_csv(res_file)
    else:
        new_res_df.to_csv(res_file)


def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, res_file, model_description, preproc_desc):
    """Fit ``model``, score it on all three splits, print a table and log to CSV.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Data the model is fitted on (and also scored on).
        X_val, y_val: Validation data.
        X_test, y_test: Test data.
        res_file: Path of the results CSV; created if missing, otherwise the
            new rows are appended to its existing content.
        model_description: Free-text description stored alongside the results.
        preproc_desc: Free-text description of the pre-processing.

    Returns:
        ``(model, results)`` where ``results`` maps ``'train'``,
        ``'validation'`` and ``'test'`` to a dict with the keys ``'accuracy'``,
        ``'precision'``, ``'recall'`` and ``'f1'``. Precision, recall and F1
        are weighted averages.
    """
    # Train and predict; the reported time covers fitting plus all predictions.
    start = time.time()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    end = time.time()

    print(f'Training and predicting took {end-start} seconds = {(end-start)/60} minutes')

    results = {}
    for label, y_truth, y_pred in [('train', y_train, y_train_pred),
                                   ('validation', y_val, y_val_pred),
                                   ('test', y_test, y_test_pred)]:
        # zero_division=0 silences the warning for labels that are never predicted.
        results[label] = {
            'accuracy': accuracy_score(y_truth, y_pred),
            'precision': precision_score(y_truth, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_truth, y_pred, average='weighted', zero_division=0),
            'f1': f1_score(y_truth, y_pred, average='weighted', zero_division=0),
        }

    _print_results_table(model, model_description, preproc_desc, results)

    new_res_df = results_to_df(model.__class__.__name__, model_description, preproc_desc, results)
    _append_results(res_file, new_res_df)

    return model, results

In [None]:
def results_to_df(model_type, model_desc, preproc_desc, result_dict):
    """Turn a nested results dict into a multi-indexed data frame.

    ``result_dict`` maps a split name to a ``{metric: value}`` dict. The
    returned frame has one column per split and one row per metric, indexed
    by (model_type, description, preprocessing description, metric).
    """
    index_columns = ['model_type', 'description', 'preprocessing description', 'metric']
    labelled = pd.DataFrame.from_dict(result_dict).assign(**{
        'model_type': model_type,
        'description': model_desc,
        'preprocessing description': preproc_desc,
    })
    # The metric names sit in the unnamed index; expose them as a column
    # called 'metric' before building the final index.
    with_metric = labelled.reset_index().rename(columns={'index': 'metric'})
    return with_metric.set_index(index_columns)
    

In [None]:
def train_those_models(Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y,
                                    outfile, preprocessing_description):
    """Fit and evaluate a fixed line-up of classifiers on one feature set.

    Every (model, description) pair below is passed to ``evaluate_model``,
    which fits it, prints its metrics and appends them to ``outfile``. The
    models are run in the order listed.

    Args:
        Train_X_Tfidf, Train_Y: Training features and labels.
        Val_X_Tfidf, Val_Y: Validation features and labels.
        Test_X_Tfidf, Test_Y: Test features and labels.
        outfile: Path of the results CSV handed to ``evaluate_model``.
        preprocessing_description: Free-text description of the features,
            stored with every result row.

    Returns:
        None.
    """
    # Configurations deliberately left out, per earlier notes in this notebook:
    #   * SVC linear kernel with gamma='scale'       -- no effect
    #   * SVC rbf / poly / sigmoid with gamma='auto' -- usually poor results
    candidates = [
        # --- SVC ---
        (svm.SVC(), 'default_settings'),
        (svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto'), 'C=1, kernel Linear, deg 3, gamma auto'),
        (svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='scale'), 'C=1, kernel rbf, deg 3 gamma scale'),
        (svm.SVC(C=1.0, kernel='poly', degree=4, gamma='scale'), 'C=1, kernel poly, deg 4 gamma scale'),
        (svm.SVC(C=1.0, kernel='poly', degree=2, gamma='scale'), 'C=1, kernel poly, deg 2 gamma scale'),
        (svm.SVC(C=1.0, kernel='poly', degree=3, gamma='scale'), 'C=1, kernel poly, deg 3 gamma scale'),
        (svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma='scale'), 'C=1, kernel sigmoid, deg 3 gamma scale'),
        # --- LinearSVC ---
        (svm.LinearSVC(), 'default'),
        (svm.LinearSVC(C=10), 'C=10'),
        (svm.LinearSVC(C=10, max_iter=5000), 'C=10, max_it 5000'),
        (svm.LinearSVC(C=10, max_iter=50000), 'C=10, max_it 50000'),
        (svm.LinearSVC(C=20, max_iter=5000), 'C=20, max_it 5000'),
        (svm.LinearSVC(C=20, max_iter=50000), 'C=20, max_it 50000'),
        (svm.LinearSVC(C=10, multi_class='crammer_singer', max_iter=5000), 'C=10, crammer-singer, max_it 5000'),
        # The description must say 50000 here; with 'max_it 5000' this row
        # would be indistinguishable from the previous one in the results CSV.
        (svm.LinearSVC(C=10, multi_class='crammer_singer', max_iter=50000), 'C=10, crammer-singer, max_it 50000'),
        (svm.LinearSVC(C=20, multi_class='crammer_singer', max_iter=5000), 'C=20, crammer-singer, max_it 5000'),
        (svm.LinearSVC(C=20, multi_class='crammer_singer', max_iter=50000), 'C=20, crammer-singer, max_it 50000'),
        # --- NuSVC ---
        (svm.NuSVC(), 'default'),
        # --- Naive Bayes ---
        (naive_bayes.MultinomialNB(), 'default'),
        # --- Multi-layer perceptron ---
        (neural_network.MLPClassifier(), 'default'),
        (neural_network.MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000), 'hidden_layer_size 200, max it 1000'),
        (neural_network.MLPClassifier(activation='logistic', max_iter=500), 'logistic act, 500 iter'),
    ]

    for model, model_desc in candidates:
        evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y,
                       outfile, model_desc, preprocessing_description)


In [None]:
# Display which column is currently selected as model input.
x_col

In [None]:
# NOTE(review): this cell vectorises with its own settings and does not use
# TF_IDF_MAX_FEATURES / TF_IDF_N_GRAM from the parameter cell -- confirm that
# this is intentional.
max_features = 10000
ngram_range = (1, 2)

Tfidf_vect = TfidfVectorizer(
    stop_words='english', # Removes common English words (it, and, that, is, ...) using scikit-learn's predefined list.
    sublinear_tf=True, # Logarithmic term-frequency weighting: reduces the weight of extremely frequent terms.
    max_features=max_features, # Consideration for both overfitting and computational requirements.
    ngram_range=ngram_range
)

# Learn the vocabulary on the training texts and vectorise them in the same
# pass, instead of discarding the fit_transform result and transforming the
# training data a second time.
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
Val_X_Tfidf = Tfidf_vect.transform(Val_X)

# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
outfile='/home/dean/Documents/gitRepos/gutenberg_corpus_analysis/SVM/results_full_text_1.csv'
# Built from the values actually used above so the logged description cannot
# drift from the vectoriser settings.
preprocessing_description = (
    f'tf-idf sublinear_tf true, max feat {max_features}, '
    f'ngram ({ngram_range[0]},{ngram_range[1]})'
)
train_those_models(Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y,
                   outfile, preprocessing_description)
