# Compares TF-IDF Vectorization parameters using the Full lemmatized text

Loops through all combinations of:

**TF-IDF Max Features**: 10000, 15000, 20000, 5000, 40000, 80000,

**TF-IDF N-Grams**: (1,1), (1,2), (1,3), (2, 2), (2, 3), (3, 3)]


In [5]:
import os
import time
import tqdm
import random
import itertools

from tqdm.contrib.concurrent import process_map


import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, neural_network
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, make_scorer)
import multiprocessing as mp

In [2]:
np.random.seed(500)

In [9]:
repos_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [11]:
gutenberg_repo_path = os.path.join(repos_path, 'gutenberg')
gutenberg_analysis_repo = os.path.join(repos_path, 'gutenberg-analysis')

src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(src_dir)
from data_io import get_book


gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

from metaquery import meta_query

gca_path = os.path.abspath(os.getcwd())
sys.path.append(gca_path)
import misc_utils.dataset_filtering as dataset_filtering

In [9]:
text_fold=os.path.join(gutenberg_repo_path, 'data', 'text')

In [11]:
dataset='nikita_dataset'

In [13]:
train_csv = os.path.join(gca_path, dataset, 'final_train.csv')
test_csv = os.path.join(gca_path, dataset, 'final_test.csv')
val_csv = os.path.join(gca_path, dataset, 'final_val.csv')

pg_catalog_filepath=os.path.join(gutenberg_repo_path, 'metadata', 'pg_catalog.csv')

In [None]:
train_df = pd.read_csv(train_csv, index_col='Unnamed: 0')
test_df = pd.read_csv(test_csv, index_col='Unnamed: 0')
val_df = pd.read_csv(val_csv, index_col='Unnamed: 0')

train_df.head()

In [None]:
test_df.shape

In [None]:
val_df.shape

In [None]:
val_df['author'].value_counts()

In [None]:
subj = train_df['subjects'].replace('set()',np.nan)
subj_docs = []
for h in subj:
    try:
        h = h.strip("{}")[1:-1]
    except AttributeError:
        subj_docs.append(h)
        continue
    h = h.replace(' -- ', '-')
    h = h.replace("', '","_")
    h = h.split('_')
    h = [item.replace(' ','').replace(',', ' ') for item in h]
    h = ' '.join(h)
    subj_docs.append(h)

In [None]:
train_df['subj_str']=subj_docs

In [None]:
train_df['text'] = train_df['id'].apply(lambda x: get_book(x, path_gutenberg=gutenberg_repo_path,level='text'))
test_df['text'] = test_df['id'].apply(lambda x: get_book(x, path_gutenberg=gutenberg_repo_path,level='text'))
val_df['text'] = val_df['id'].apply(lambda x: get_book(x, path_gutenberg=gutenberg_repo_path,level='text'))

In [None]:
# Define a function to apply the word, line and token counts
def enrich_dataframe(df):
    count_path = os.path.join(gutenberg_repo_path, 'data', 'counts')
    text_path = os.path.join(gutenberg_repo_path, 'data', 'text')
    token_path = os.path.join(gutenberg_repo_path, 'data', 'tokens')

    df['word_count'] = df['id'].apply(lambda pid: dataset_filtering.get_word_count(pid, count_path))
    df['unique_word_count'] = df['id'].apply(lambda pid: dataset_filtering.get_unique_word_count(pid, count_path))
    df['line_count'] = df['id'].apply(lambda pid: dataset_filtering.get_line_count(pid, text_path))
    df['token_count'] = df['id'].apply(lambda pid: dataset_filtering.get_token_count(pid, token_path))

    return df

train_df = enrich_dataframe(train_df)

In [None]:
train_df['word_count'].describe()

In [None]:
def skip_start_and_end(text, num_chars=100):
    text = text.split(' ')
    text = text[num_chars:-num_chars]
    return ' '.join(text)

train_df['text'] = train_df['text'].apply(skip_start_and_end)
test_df['text'] = test_df['text'].apply(skip_start_and_end)
val_df['text'] = val_df['text'].apply(skip_start_and_end)


In [None]:

x_col = 'text'
tokenized_col = 'tokenized'

In [None]:
from tqdm.contrib.concurrent import process_map
start = time.time()
tokenized = process_map(word_tokenize, train_df[x_col], max_workers=11, chunksize=5)

end = time.time()
print(f'Took {end-start} seconds')
train_df[tokenized_col] = tokenized

In [None]:

start = time.time()
tokenized = process_map(word_tokenize, test_df[x_col], max_workers=11, chunksize=5)

end = time.time()
print(f'Took {end-start} seconds')
test_df[tokenized_col] = tokenized

In [None]:

start = time.time()
tokenized = process_map(word_tokenize, val_df[x_col], max_workers=11, chunksize=5)

end = time.time()
print(f'Took {end-start} seconds')
val_df[tokenized_col] = tokenized

In [None]:
# Save out the tokenized full text, so you don't have to run this again later
# if you so desire

outfile=os.path.join(gca_path, 'tokenized', 'train_df_full_text_tokenized.pkl')
train_df.to_pickle(outfile)

outfile=os.path.join(gca_path, 'tokenized', 'test_df_full_text_tokenized.pkl')
test_df.to_pickle(outfile)

outfile=os.path.join(gca_path, 'tokenized', 'val_df_full_text_tokenized.pkl')
val_df.to_pickle(outfile)

In [15]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV                

In [None]:
def stem_that_thang(tokenized_text):
    # Declaring Empty List to store the words that follow the rules for this step
    final_words = []
    # Initializing PorterStemmer()
    ps = PorterStemmer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word in tokenized_text:
        if word not in stopwords.words('english'):
            word_final = ps.stem(word)
            final_words.append(word_final)

    return str(final_words)
    # The final processed set of words for each iteration will be stored in 'text_final'

In [None]:
def lemmatize_that_thang(tokenized_text):
    # Declaring Empty List to store the words that follow the rules for this step
    final_words = []
    # Initializing WordNetLemmatizer()
    word_lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(tokenized_text):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lemmatized.lemmatize(word,tag_map[tag[0]])
            final_words.append(word_final)
    return str(final_words)
    # The final processed set of words for each iteration will be stored in 'text_final'

In [None]:
train_df.shape

In [None]:
train_df[train_df['tokenized'].isnull()]

In [None]:
test_df[test_df['tokenized'].isnull()]

In [None]:
val_df[val_df['tokenized'].isnull()]

## Lemmatize the text and save out the DFs

In [None]:

start = time.time()
lemmatized = process_map(lemmatize_that_thang, train_df['tokenized'], max_workers=10, chunksize=5)

end = time.time()
print(f'Took {end-start} seconds')
train_df['lemmatized'] = lemmatized


In [None]:

start = time.time()
lemmatized = process_map(lemmatize_that_thang, val_df['tokenized'], max_workers=10, chunksize=5)

end = time.time()
print(f'Took {end-start} seconds')
val_df['lemmatized'] = lemmatized


In [None]:

start = time.time()
lemmatized = process_map(lemmatize_that_thang, test_df['tokenized'], max_workers=10, chunksize=5)

end = time.time()
print(f'Took {end-start} seconds')
test_df['lemmatized'] = lemmatized


In [None]:
outfile=os.path.join(gca_path, 'nikita_dataset', 'train_df_full_text_lemmatized.pkl')
train_df.to_pickle(outfile)

outfile=os.path.join(gca_path, 'nikita_dataset', 'val_df_full_text_lemmatized.pkl')
val_df.to_pickle(outfile)

outfile=os.path.join(gca_path, 'nikita_dataset', 'test_df_full_text_lemmatized.pkl')
test_df.to_pickle(outfile)

# Read in the lemmatized texts, if you have already saved them out

In [29]:
train_pkl = os.path.join(gca_path, 'nikita_dataset', 'train_df_full_text_lemmatized.pkl')
val_pkl = os.path.join(gca_path, 'nikita_dataset', 'val_df_full_text_lemmatized.pkl')
test_pkl = os.path.join(gca_path, 'nikita_dataset', 'test_df_full_text_lemmatized.pkl')

train_df = pd.read_pickle(train_pkl)
val_df = pd.read_pickle(val_pkl)
test_df = pd.read_pickle(test_pkl)

## Save a version of the dataframes without the text column, for memory reasons

In [30]:
train_df.drop('text', axis=1, inplace=True)
outfile=os.path.join(gca_path, 'nikita_dataset', 'train_df_full_text_lemmatized_noRawText.pkl')
train_df.to_pickle(outfile)

val_df.drop('text', axis=1, inplace=True)
outfile=os.path.join(gca_path, 'nikita_dataset', 'val_df_full_text_lemmatized_noRawText.pkl')
val_df.to_pickle(outfile)

test_df.drop('text', axis=1, inplace=True)
outfile=os.path.join(gca_path, 'nikita_dataset', 'test_df_full_text_lemmatized_noRawText.pkl')
test_df.to_pickle(outfile)


## Drop both the tokenization and text columns, for memory reasons

In [19]:
train_df.drop(['tokenized', 'text'], axis=1, inplace=True)
val_df.drop(['tokenized', 'text'], axis=1, inplace=True)
test_df.drop(['tokenized', 'text'], axis=1, inplace=True)


In [None]:
outfile=os.path.join(gca_path, 'nikita_dataset', 'test_df_full_text_lemmatizedOnly.pkl')
train_df.to_pickle(outfile)

outfile=os.path.join(gca_path, 'nikita_dataset', 'val_df_full_text_lemmatizedOnly.pkl')
val_df.to_pickle(outfile)

outfile=os.path.join(gca_path, 'nikita_dataset', 'test_df_full_text_lemmatizedOnly.pkl')
test_df.to_pickle(outfile)




# Lemmatization, English stop words

In [24]:
train_df.drop('tokenized', axis=1, inplace=True)
outfile=os.path.join(gca_path, 'nikita_dataset', 'test_df_full_text_lemmatizedOnly.pkl')
train_df.to_pickle(outfile)

In [21]:
set(val_df['author'].unique()) - set(train_df['author'].unique())

set()

In [23]:
#x_col = 'tokenized'
x_col='lemmatized'

In [25]:
Train_X, Train_Y = train_df[x_col], train_df['author']
Test_X, Test_Y = test_df[x_col], test_df['author']
Val_X, Val_Y = val_df[x_col], val_df['author']

In [27]:
Encoder = LabelEncoder()
Train_Y_e = Encoder.fit_transform(Train_Y)
Test_Y_e = Encoder.fit_transform(Test_Y)
Val_Y_e = Encoder.fit_transform(Val_Y)


In [29]:
Train_Y_e

array([30, 30, 30, ..., 70, 70, 70])

In [None]:
#print(Tfidf_vect.vocabulary_)

In [157]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, res_file, model_description, preproc_desc):
    # Train and predict
    start = time.time()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    end = time.time()

    print(f'Training and predicting took {end-start} seconds = {(end-start)/60} minutes')

    results={}
    for label, y_truth, y_pred in [('train', y_train, y_train_pred), 
                            ('validation', y_val, y_val_pred),
                            ('test', y_test, y_test_pred)]:
        # Metrics (set zero_division=0 to silence warnings)
        acc = accuracy_score(y_truth, y_pred)
        f1 = f1_score(y_truth, y_pred, average='weighted', zero_division=0)
        precision = precision_score(y_truth, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_truth, y_pred, average='weighted', zero_division=0)

        result_dict = {'accuracy': acc,
                       'precision': precision,
                       'recall' : recall,
                       'f1' : f1}
        results[label] = result_dict
        
        
    # Print performance
    print(f"Model: {model.__class__.__name__}")
    print(f'Description: {model_description}')
    print(f'Pre-processing: {preproc_desc}')
    label_str=f'|{'':<15} ||  {'Accuracy':>15} | {'Precision':>15} | {'Recall':>15} | {'F1-Score':>15} |'
    print("-" * len(label_str))

    print(label_str)
    print("-" * len(label_str))

    for result_label, sub_res_dict in results.items():
        output_str = f'|{result_label:<15} || '
        
        for key, val in sub_res_dict.items():
            output_str += f' {val:15.4f} |'
        print(output_str)

    print("-" * len(label_str))

    new_res_df = results_to_df(model.__class__.__name__, model_description, preproc_desc, results)
    
    if os.path.exists(res_file):
        old_res_df = pd.read_csv(res_file)
        old_res_df.set_index(['model_type', 'description', 'preprocessing description', 'metric'], inplace=True)
    
        res_df = pd.concat([old_res_df, new_res_df])
        res_df.to_csv(res_file)
    else:
        new_res_df.to_csv(res_file)
        

    return model, results

In [158]:
def results_to_df(model_type, model_desc, preproc_desc, result_dict):
    res_df = pd.DataFrame.from_dict(result_dict)
    res_df['model_type'] = model_type
    res_df['description'] = model_desc
    res_df['preprocessing description'] = preproc_desc
    res_df.reset_index(inplace=True)
    res_df.rename({'index':'metric'}, axis=1, inplace=True)
    res_df.set_index(['model_type', 'description', 'preprocessing description', 'metric'], inplace=True)

    return res_df
    

In [208]:
def train_those_models(Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                    outfile, preprocessing_description):

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.SVC()
    model_desc = 'default_settings'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    model_desc = 'C=1, kernel Linear, deg 3, gamma auto'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # No Effect
    # # Classifier - Algorithm - SVM
    # model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='scale')
    # model_desc='C=1, kernel Linear, deg 3 gamma scale'
    # model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
    #                                     outfile, model_desc, preprocessing_description)


    # Usually poor results
    # Classifier - Algorithm - SVM
    # model = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
    # model_desc='C=1, kernel rbf, deg 3 gamma auto'
    # model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
    #                                     outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    model = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='scale')
    model_desc='C=1, kernel rbf, deg 3 gamma scale'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    model = svm.SVC(C=1.0, kernel='poly', degree=4, gamma='scale')
    model_desc='C=1, kernel poly, deg 4 gamma scale'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    model = svm.SVC(C=1.0, kernel='poly', degree=2, gamma='scale')
    model_desc='C=1, kernel poly, deg 2 gamma scale'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    model = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='scale')
    model_desc='C=1, kernel poly, deg 3 gamma scale'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # usually poor results
    # Classifier - Algorithm - SVM
    # model = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='auto')
    # model_desc='C=1, kernel poly, deg 3 gamma auto'
    # model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
    #                                     outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    model = svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma='scale')
    model_desc='C=1, kernel sigmoid, deg 3 gamma scale'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # Usually poor results
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    # model = svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma='auto')
    # model_desc='C=1, kernel sigmoid, deg 3 gamma auto'
    # model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
    #                                     outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC()
    model_desc='default'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=10)
    model_desc='C=10'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=10, max_iter=5000)
    model_desc='C=10, max_it 5000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=10, max_iter=50000)
    model_desc='C=10, max_it 50000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=20, max_iter=5000)
    model_desc='C=20, max_it 5000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=20, max_iter=50000)
    model_desc='C=20, max_it 50000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=10, multi_class='crammer_singer', max_iter=5000)
    model_desc='C=10, crammer-singer, max_it 5000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=10, multi_class='crammer_singer', max_iter=50000)
    model_desc='C=10, crammer-singer, max_it 5000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=20, multi_class='crammer_singer', max_iter=5000)
    model_desc='C=20, crammer-singer, max_it 5000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.LinearSVC(C=20, multi_class='crammer_singer', max_iter=50000)
    model_desc='C=20, crammer-singer, max_it 50000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)

    
    
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    model = svm.NuSVC()
    model_desc='default'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    # fit the training dataset on the NB classifier
    model = naive_bayes.MultinomialNB()
    model_desc='default'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    model = neural_network.MLPClassifier()
    model_desc='default'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    model = neural_network.MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000)
    model_desc='hidden_layer_size 200, max it 1000'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)
    
    model = neural_network.MLPClassifier(activation='logistic', max_iter=500)
    model_desc='logistic act, 500 iter'
    model, result_dict = evaluate_model(model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, model_desc, preprocessing_description)


In [162]:
x_col

'lemmatized'

In [210]:
# Parameter grid
param_grid = {
    'tfidf__max_features': [10000, 15000, 20000, 5000, 40000, 80000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]
}
iteration_times = []
# Create all combinations of parameters
param_combos = list(itertools.product(param_grid['tfidf__max_features'],
                            param_grid['tfidf__ngram_range']))
overall_start = time.time()
for max_feat, ngram_range in param_combos:
    iter_start = time.time()
    Tfidf_vect = TfidfVectorizer(
    stop_words='english', # Removes a lot of common english words like it, and, that, is etc. Uses predifined scikit list of common english words.
    sublinear_tf=True, # Uses logarithmic word frequency weighting, reducing the weight of extremely frequent terms & helps prevent domination by larger text files
    max_features=max_feat, # Consideration for both overfitting and computational requirements.
    ngram_range=ngram_range
    )
    
    Tfidf_vect.fit_transform(train_df[x_col])
    
    Train_X_Tfidf = Tfidf_vect.transform(Train_X)
    Test_X_Tfidf = Tfidf_vect.transform(Test_X)
    Val_X_Tfidf = Tfidf_vect.transform(Val_X)
    preproc_desc = f'tfidf_feat{max_feat}_ngram{ngram_range[0]}-{ngram_range[1]}'
    outfile=f'/home/dean/Documents/gitRepos/gutenberg_corpus_analysis/SVM/results_full_text_{preproc_desc}.csv'
    train_those_models(Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, preproc_desc)

    iter_end = time.time()

    iteration_times.append((max_feat, ngram_range, iter_end-iter_start))

overall_end=time.time()

total_time = overall_end - overall_start

print(f'This huge fucking run took {total_time/60} minutes')
print(f'That is {total_time/60/60} hours')

print('The iteration times were:')
for max_feat, ngram_range, seconds in iteration_times:
    minutes = seconds/60
    hours = minmutes / 60
    print(f'Features: {max_feat}  ngram: {ngram_range}  minutes: {minutes} = {hours} hours')
      

Training and predicting took 234.60887122154236 seconds = 3.9101478536923726 minutes
Model: SVC
Description: default_settings
Pre-processing: tfidf_feat10000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9583 |          0.9763 |          0.9583 |          0.9587 |
|test            ||           0.9250 |          0.9358 |          0.9250 |          0.9188 |
--------------------------------------------------------------------------------------------
Training and predicting took 200.32057571411133 seconds = 3.3386762619018553 minutes
Model: SVC
Description: C=1, kernel Linear, deg 3, gamma auto
Pre-processing: tfidf_feat10000_ngr



Training and predicting took 166.07028722763062 seconds = 2.7678381204605103 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9875 |          0.9906 |          0.9875 |          0.9871 |
--------------------------------------------------------------------------------------------




Training and predicting took 176.28008913993835 seconds = 2.938001485665639 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9875 |          0.9906 |          0.9875 |          0.9871 |
--------------------------------------------------------------------------------------------




Training and predicting took 210.40352964401245 seconds = 3.5067254940668744 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9875 |          0.9906 |          0.9875 |          0.9871 |
--------------------------------------------------------------------------------------------




Training and predicting took 290.0862715244293 seconds = 4.834771192073822 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat10000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9875 |          0.9906 |          0.9875 |          0.9871 |
--------------------------------------------------------------------------------------------
Training and predicting took 213.53316736221313 seconds = 3.5588861227035524 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat10000_ngram1-1




Training and predicting took 164.69136261940002 seconds = 2.744856043656667 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 169.92488932609558 seconds = 2.8320814887682597 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 166.18897819519043 seconds = 2.7698163032531737 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 155.508216381073 seconds = 2.5918036063512164 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat10000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 231.12466716766357 seconds = 3.8520777861277264 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat10000_ngram1-2




Training and predicting took 178.30440211296082 seconds = 2.9717400352160137 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 141.25507020950317 seconds = 2.354251170158386 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 177.35560178756714 seconds = 2.955926696459452 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat10000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 192.98273086547852 seconds = 3.216378847757975 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat10000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 230.49565148353577 seconds = 3.8415941913922627 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat10000_ngram1-3



Training and predicting took 115.16457295417786 seconds = 1.9194095492362977 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 119.34652972221375 seconds = 1.9891088287035623 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 121.05381393432617 seconds = 2.017563565572103 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 104.63378071784973 seconds = 1.7438963452974956 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat15000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 243.31585788726807 seconds = 4.055264298121134 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat15000_ngram1-1



Training and predicting took 160.82685232162476 seconds = 2.680447538693746 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 183.5023558139801 seconds = 3.0583725968996682 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 139.69884252548218 seconds = 2.3283140420913697 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 188.8621747493744 seconds = 3.147702912489573 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat15000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 278.14679527282715 seconds = 4.635779921213786 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat15000_ngram1-2
-



Training and predicting took 210.84810543060303 seconds = 3.5141350905100506 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 218.9772970676422 seconds = 3.649621617794037 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 170.23246026039124 seconds = 2.8372076710065204 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat15000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 190.18113946914673 seconds = 3.169685657819112 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat15000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------
Training and predicting took 276.18185925483704 seconds = 4.603030987580618 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat15000_ngram1-3




Training and predicting took 209.92146492004395 seconds = 3.4986910820007324 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 187.19214963912964 seconds = 3.1198691606521605 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 226.0203981399536 seconds = 3.7670066356658936 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 238.46292853355408 seconds = 3.974382142225901 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat20000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 259.73858666419983 seconds = 4.32897644440333 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat20000_ngram1-1
-



Training and predicting took 223.1161277294159 seconds = 3.718602128823598 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 250.75555181503296 seconds = 4.179259196917216 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 316.42755937576294 seconds = 5.2737926562627155 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 260.4574625492096 seconds = 4.340957709153494 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat20000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------
Training and predicting took 311.7339015007019 seconds = 5.195565025011699 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat20000_ngram1-2
--



Training and predicting took 288.28871393203735 seconds = 4.804811898867289 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 273.7835040092468 seconds = 4.563058400154114 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 271.4182462692261 seconds = 4.523637437820435 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat20000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 286.05994272232056 seconds = 4.767665712038676 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat20000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------
Training and predicting took 310.58805441856384 seconds = 5.17646757364273 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat20000_ngram1-3
-



Training and predicting took 159.08541083335876 seconds = 2.6514235138893127 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 122.65613961219788 seconds = 2.0442689935366314 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 102.7226836681366 seconds = 1.7120447278022766 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 187.9498119354248 seconds = 3.1324968655904133 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat5000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------
Training and predicting took 149.94047570228577 seconds = 2.4990079283714293 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat5000_ngram1-1
-



Training and predicting took 161.42266988754272 seconds = 2.6903778314590454 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 224.28872323036194 seconds = 3.738145387172699 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 271.8219885826111 seconds = 4.530366476376852 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 214.63925671577454 seconds = 3.577320945262909 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat5000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------
Training and predicting took 152.52415490150452 seconds = 2.5420692483584086 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat5000_ngram1-2
-



Training and predicting took 196.17873239517212 seconds = 3.2696455399195354 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 222.3989293575287 seconds = 3.706648822625478 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 181.18943810462952 seconds = 3.019823968410492 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat5000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 187.56943702697754 seconds = 3.126157283782959 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat5000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------
Training and predicting took 152.1476240158081 seconds = 2.535793733596802 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat5000_ngram1-3
---



Training and predicting took 376.48346757888794 seconds = 6.274724459648132 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 334.135390996933 seconds = 5.568923183282217 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 465.05304479599 seconds = 7.750884079933167 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 425.5346779823303 seconds = 7.092244633038839 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat40000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------
Training and predicting took 294.24203634262085 seconds = 4.904033939043681 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat40000_ngram1-1
-



Training and predicting took 822.3914756774902 seconds = 13.706524594624836 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 818.2039477825165 seconds = 13.636732463041941 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 695.1858596801758 seconds = 11.586430994669596 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 461.2403612136841 seconds = 7.687339353561401 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat40000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 398.961017370224 seconds = 6.649350289503733 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat40000_ngram1-2
---



Training and predicting took 762.8883590698242 seconds = 12.71480598449707 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 800.9847321510315 seconds = 13.349745535850525 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 766.9656949043274 seconds = 12.78276158173879 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat40000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 794.6851341724396 seconds = 13.244752236207326 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat40000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 396.9746437072754 seconds = 6.616244061787923 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat40000_ngram1-3
-



Training and predicting took 393.7536368370056 seconds = 6.562560613950094 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 306.8454191684723 seconds = 5.114090319474538 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 391.0065109729767 seconds = 6.516775182882944 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------




Training and predicting took 405.34099674224854 seconds = 6.755683279037475 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat80000_ngram1-1
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9792 |          0.9844 |          0.9792 |          0.9786 |
--------------------------------------------------------------------------------------------
Training and predicting took 297.686514377594 seconds = 4.961441906293233 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat80000_ngram1-1
--



Training and predicting took 946.0853202342987 seconds = 15.768088670571645 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 847.98859167099 seconds = 14.1331431945165 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 880.1042692661285 seconds = 14.66840448776881 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 775.1935770511627 seconds = 12.919892950852711 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat80000_ngram1-2
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 492.8400356769562 seconds = 8.214000594615936 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat80000_ngram1-2
-



Training and predicting took 942.0208532810211 seconds = 15.700347554683685 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 978.7582821846008 seconds = 16.312638036410014 minutes
Model: LinearSVC
Description: C=10, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 973.1053030490875 seconds = 16.21842171748479 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 5000
Pre-processing: tfidf_feat80000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------




Training and predicting took 1028.3242135047913 seconds = 17.13873689174652 minutes
Model: LinearSVC
Description: C=20, crammer-singer, max_it 50000
Pre-processing: tfidf_feat80000_ngram1-3
--------------------------------------------------------------------------------------------
|                ||         Accuracy |       Precision |          Recall |        F1-Score |
--------------------------------------------------------------------------------------------
|train           ||           1.0000 |          1.0000 |          1.0000 |          1.0000 |
|validation      ||           0.9917 |          0.9938 |          0.9917 |          0.9914 |
|test            ||           0.9833 |          0.9875 |          0.9833 |          0.9829 |
--------------------------------------------------------------------------------------------
Training and predicting took 490.61474418640137 seconds = 8.176912403106689 minutes
Model: NuSVC
Description: default
Pre-processing: tfidf_feat80000_ngram1-3


NameError: name 'minmutes' is not defined

# Run these after, less important

In [None]:
# Parameter grid
param_grid = {
    'tfidf__max_features': [10000, 15000, 20000, 5000, 40000, 80000],
    'tfidf__ngram_range': [(2, 2), (2, 3), (3, 3)]
}
iteration_times = []
# Create all combinations of parameters
param_combos = list(itertools.product(param_grid['tfidf__max_features'],
                            param_grid['tfidf__ngram_range']))
overall_start = time.time()
for max_feat, ngram_range in param_combos:
    iter_start = time.time()
    Tfidf_vect = TfidfVectorizer(
    stop_words='english', # Removes a lot of common english words like it, and, that, is etc. Uses predifined scikit list of common english words.
    sublinear_tf=True, # Uses logarithmic word frequency weighting, reducing the weight of extremely frequent terms & helps prevent domination by larger text files
    max_features=max_feat, # Consideration for both overfitting and computational requirements.
    ngram_range=ngram_range
    )
    
    Tfidf_vect.fit_transform(train_df[x_col])
    
    Train_X_Tfidf = Tfidf_vect.transform(Train_X)
    Test_X_Tfidf = Tfidf_vect.transform(Test_X)
    Val_X_Tfidf = Tfidf_vect.transform(Val_X)
    preproc_desc = f'tfidf_feat{max_feat}_ngram{ngram_range[0]}-{ngram_range[1]}'
    outfile=f'/home/dean/Documents/gitRepos/gutenberg_corpus_analysis/SVM/results_full_text_{preproc_desc}.csv'
    train_those_models(Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y, 
                                        outfile, preproc_desc)

    iter_end = time.time()

    iteration_times.append((max_feat, ngram_range, iter_end-iter_start))

overall_end=time.time()

total_time = overall_end - overall_start

print(f'This huge fucking run took {total_time/60} minutes')
print(f'That is {total_time/60/60} hours')

print('The iteration times were:')
for max_feat, ngram_range, seconds in iteration_times:
    minutes = seconds/60
    hours = minmutes / 60
    print(f'Features: {max_feat}  ngram: {ngram_range}  minutes: {minutes} = {hours} hours')
      