# Compares different chunking variations as well as different TF-IDF Vectorization parameters, using lemmatized text

Loops through all combinations of:

**Num Chunks**: 1, 5, 10, 20, 50, 100

**Chunk Size**: 100, 250, 500, 1000, 2000

**TF-IDF Max Features**: 5000, 10000, 15000, 20000

**TF-IDF N-Grams**: (1,1), (1,2), (1,3)


In [2]:
import os
import time
import tqdm
import random
import itertools

from tqdm.contrib.concurrent import process_map


import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm, neural_network
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, make_scorer)
import multiprocessing as mp

from src.data_loader import GutenbergDataLoader

%load_ext autoreload
%autoreload 2

In [3]:
# Absolute path of the parent of the current working directory; the
# sibling repositories used below are resolved relative to it.
repos_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [6]:
# `sys` is used for sys.path below but is not imported in the visible
# import cell, so import it here to keep this cell runnable on its own.
import sys

# Locations of the two sibling repositories under repos_path.
gutenberg_repo_path = os.path.join(repos_path, 'gutenberg')
gutenberg_analysis_repo = os.path.join(repos_path, 'gutenberg-analysis')

# Put the analysis repo's src/ directory on the import path so that the
# data_io module can be imported by its bare name.
src_dir = os.path.join(gutenberg_analysis_repo, 'src')
sys.path.append(src_dir)
from data_io import get_book


# Likewise expose the gutenberg repo's src/ directory to the importer.
gutenberg_src_dir = os.path.join(gutenberg_repo_path, 'src')
sys.path.append(gutenberg_src_dir)

In [8]:
# Path to the data/text folder inside the gutenberg repository.
text_fold=os.path.join(gutenberg_repo_path, 'data', 'text')

In [10]:
# Name of the dataset handed to GutenbergDataLoader.
dataset='nikita_dataset'

In [14]:
# Build the project data loader for the selected dataset; its train_df,
# val_df and test_df attributes are read further down.
gdl = GutenbergDataLoader(dataset)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects
2439,PG12810,"Uncle Sam's Boys with Pershing's Troops: Or, D...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],78,"{'World War, 1914-1918 -- Juvenile fiction', '..."
2446,PG12819,"Dick Prescott's Second Year at West Point: Or,...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],94,{'United States Military Academy -- Juvenile f...
25920,PG40605,"The Motor Boat Club at Nantucket; or, The Myst...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],189,"{'Motorboats -- Juvenile fiction', 'Nantucket ..."
55435,PG8153,"The Young Engineers in Arizona; or, Laying Tra...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],190,"{'Civil engineers -- Fiction', 'Arizona -- Fic..."
32899,PG48863,"The Motor Boat Club off Long Island; or, A Dar...","Hancock, H. Irving (Harrie Irving)",1868.0,1922.0,['en'],85,"{'Motorboats -- Juvenile fiction', 'Long Islan..."


In [None]:
# Work on copies: test_one_chunk adds 'chunks', tokenized and 'lemmatized'
# columns to these frames, and the loader's own frames should stay untouched.
train_df = gdl.train_df.copy()
val_df = gdl.val_df.copy()
test_df = gdl.test_df.copy()

In [27]:
def lemmatize_that_thang(tokenized_text):
    """Lemmatize a token sequence, dropping stop words and non-alphabetic tokens.

    Args:
        tokenized_text: Sequence of word tokens (for example the output of
            ``word_tokenize``).

    Returns:
        The ``str()`` representation of the list of lemmas, i.e. a single
        string such as ``"['run', 'dog']"``, not a list.
    """
    # WordNet POS lookup keyed by the first letter of the tag returned by
    # pos_tag; any letter without an explicit entry falls back to noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Build the stop-word set once instead of reloading and linearly
    # scanning the stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    final_words = [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokenized_text)
        if word not in stop_words and word.isalpha()
    ]
    return str(final_words)

In [31]:
def test_one_chunk(num_chunks, chunk_size, x_col='text', tokenized_col='tokenized',
                   outfold='/home/dean/Documents/gitRepos/gutenberg_corpus_analysis/SVM/chunk_results'):
    """Chunk, tokenize and lemmatize the datasets, then sweep the TF-IDF grid.

    Operates in place on the module-level ``train_df``, ``val_df`` and
    ``test_df`` frames: adds a ``'chunks'`` column, a ``tokenized_col``
    column and a ``'lemmatized'`` column to each, then calls
    ``test_one_tfidf_combo`` once per (max_features, ngram_range) pair.

    Args:
        num_chunks: Passed to ``make_random_chunks`` as ``num_chunks``.
        chunk_size: Passed to ``make_random_chunks`` as ``chunk_size``.
        x_col: Column holding the source text to be chunked.
        tokenized_col: Column in which the tokenized chunks are stored and
            from which the lemmatizer reads.
        outfold: Directory handed to ``test_one_tfidf_combo`` for result files.
    """
    print(f'Starting {num_chunks} chunks of size {chunk_size}')
    print('making random chunks')
    # Keep the original train -> test -> val order here in case
    # make_random_chunks draws from shared random state.
    for df in (train_df, test_df, val_df):
        df['chunks'] = df[x_col].apply(
            lambda text: make_random_chunks(
                text, num_chunks=num_chunks, chunk_size=chunk_size, overlap=False))
    print('finished making random chunks')

    print('beginning tokenization')
    for df in (train_df, val_df, test_df):
        df[tokenized_col] = process_map(
            word_tokenize, df['chunks'], max_workers=11, chunksize=5)
    print('finished tokenization')

    print('beginning lemmatization')
    for df in (train_df, val_df, test_df):
        # Read from tokenized_col (not a hard-coded name) so a non-default
        # column name works end to end.
        df['lemmatized'] = process_map(
            lemmatize_that_thang, df[tokenized_col], max_workers=11, chunksize=5)
    print('finished lemmatization')

    param_grid = {
        'tfidf__max_features': [5000, 10000, 15000, 20000],
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    }
    run_label = f'numChunk{num_chunks}_chSize{chunk_size}'
    # Every (max_features, ngram_range) pairing, max_features varying slowest.
    for max_feat, ngram_range in itertools.product(
            param_grid['tfidf__max_features'], param_grid['tfidf__ngram_range']):
        test_one_tfidf_combo(max_feat, ngram_range, outfold, run_label)
        




In [33]:
def stem_that_thang(tokenized_text):
    """Stem every token that is not an English stop word.

    Unlike ``lemmatize_that_thang`` no part-of-speech tagging is done and
    non-alphabetic tokens are kept.

    Args:
        tokenized_text: Iterable of word tokens.

    Returns:
        The ``str()`` representation of the list of Porter stems, i.e. a
        single string such as ``"['run', 'dog']"``, not a list.
    """
    # Build the stop-word set once instead of reloading and linearly
    # scanning the stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    final_words = [
        stemmer.stem(word)
        for word in tokenized_text
        if word not in stop_words
    ]
    return str(final_words)
    # The final processed set of words for each iteration will be stored in 'text_final'

In [35]:
def test_one_tfidf_combo(max_feat, ngram_range, outfold, other_desc):
    """Vectorize the datasets with one TF-IDF setting and evaluate all models.

    Fits a ``TfidfVectorizer`` on the module-level ``train_df``, transforms
    ``train_df``, ``val_df`` and ``test_df``, and passes the matrices to
    ``train_those_models``.

    Args:
        max_feat: ``max_features`` for the vectorizer.
        ngram_range: ``(min_n, max_n)`` tuple for the vectorizer.
        outfold: Directory in which the results CSV path is built.
        other_desc: Free-form label inserted into the results file name.
    """
    Tfidf_vect = TfidfVectorizer(
        stop_words='english',  # drop scikit-learn's built-in English stop words
        sublinear_tf=True,     # logarithmic term-frequency scaling
        max_features=max_feat,
        ngram_range=ngram_range,
    )

    # NOTE(review): the raw 'chunks' column is vectorized here, not the
    # 'lemmatized' column that test_one_chunk computes - confirm which one
    # is intended before relying on the "lemmatized" results.
    x_col = 'chunks'

    Train_X, Train_Y = train_df[x_col], train_df['author']
    Test_X, Test_Y = test_df[x_col], test_df['author']
    Val_X, Val_Y = val_df[x_col], val_df['author']

    print('performing tfidf vectorization')
    # Fit and transform the training set in a single pass; fitting first and
    # then transforming the same data again would repeat the work.
    Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
    Test_X_Tfidf = Tfidf_vect.transform(Test_X)
    Val_X_Tfidf = Tfidf_vect.transform(Val_X)

    preproc_desc = f'tfidf_feat{max_feat}_ngram{ngram_range[0]}-{ngram_range[1]}'
    outfile = os.path.join(outfold, f'results_{other_desc}_{preproc_desc}.csv')
    train_those_models(Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y,
                       outfile, preproc_desc)


# Lemmatization, English stop words

In [39]:
#print(Tfidf_vect.vocabulary_)

In [41]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, model_description, preproc_desc, verbose=False):
    """Fit one model and score it on the train, validation and test splits.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and labels; the model is fitted
            on these.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.
        model_description: Free-form description stored with the results.
        preproc_desc: Description of the pre-processing stored with the
            results.
        verbose: When true, print a formatted table of the metrics.

    Returns:
        The frame built by ``results_to_df`` from the per-split accuracy,
        precision, recall and F1 scores. Nothing is written to disk here.
    """
    # Train and predict; the timing covers fitting plus all three predictions.
    start = time.time()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    elapsed = time.time() - start

    print(f'Training and predicting took {elapsed} seconds = {elapsed/60} minutes')

    results = {}
    splits = [('train', y_train, y_train_pred),
              ('validation', y_val, y_val_pred),
              ('test', y_test, y_test_pred)]
    for label, y_truth, y_pred in splits:
        # Weighted averages; zero_division=0 silences warnings for classes
        # that never get predicted.
        results[label] = {
            'accuracy': accuracy_score(y_truth, y_pred),
            'precision': precision_score(y_truth, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_truth, y_pred, average='weighted', zero_division=0),
            'f1': f1_score(y_truth, y_pred, average='weighted', zero_division=0),
        }

    # Print performance
    if verbose:
        print(f"Model: {model.__class__.__name__}")
        print(f'Description: {model_description}')
        print(f'Pre-processing: {preproc_desc}')
        # str.format is used instead of an f-string with nested same-type
        # quotes, which is a syntax error before Python 3.12.
        label_str = '|{:<15} ||  {:>15} | {:>15} | {:>15} | {:>15} |'.format(
            '', 'Accuracy', 'Precision', 'Recall', 'F1-Score')
        rule = "-" * len(label_str)
        print(rule)
        print(label_str)
        print(rule)

        for result_label, sub_res_dict in results.items():
            cells = ''.join(f' {val:15.4f} |' for val in sub_res_dict.values())
            print(f'|{result_label:<15} || ' + cells)

        print(rule)

    return results_to_df(model.__class__.__name__, model_description, preproc_desc, results)

In [43]:
def results_to_df(model_type, model_desc, preproc_desc, result_dict):
    """Turn a ``{split: {metric: value}}`` mapping into an indexed frame.

    Each split becomes a column and each metric a row. The rows are indexed
    by (model_type, description, preprocessing description, metric), where
    the first three levels repeat the given descriptive values.
    """
    index_levels = ['model_type', 'description', 'preprocessing description', 'metric']

    frame = pd.DataFrame.from_dict(result_dict)
    # Broadcast the descriptive values into constant columns, in index order.
    for column, value in zip(index_levels[:3], (model_type, model_desc, preproc_desc)):
        frame[column] = value

    # Move the metric names out of the row index into a 'metric' column.
    frame = frame.reset_index().rename(columns={'index': 'metric'})
    return frame.set_index(index_levels)
    

In [47]:
def train_those_models(Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y,
                                    outfile, preprocessing_description):
    """Evaluate a fixed list of classifiers in parallel and save the scores.

    Every (model, description) pair below is passed to ``evaluate_model``
    through a pool of 11 worker processes; the resulting frames are
    concatenated and written to ``outfile`` as CSV.

    Args:
        Train_X_Tfidf, Train_Y: Training features and labels.
        Val_X_Tfidf, Val_Y: Validation features and labels.
        Test_X_Tfidf, Test_Y: Test features and labels.
        outfile: Path of the CSV file to write.
        preprocessing_description: Label stored alongside every result row.
    """
    models_to_eval = [
        (svm.SVC(), 'default_settings'),
        (svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto'), 'C=1, kernel Linear, deg 3, gamma auto'),
        (svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='scale'), 'C=1, kernel rbf, deg 3 gamma scale'),
        (svm.SVC(C=1.0, kernel='poly', degree=2, gamma='scale'), 'C=1, kernel poly, deg 2 gamma scale'),
        (svm.SVC(C=1.0, kernel='poly', degree=3, gamma='scale'), 'C=1, kernel poly, deg 3 gamma scale'),
        (svm.SVC(C=1.0, kernel='poly', degree=4, gamma='scale'), 'C=1, kernel poly, deg 4 gamma scale'),
        (svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma='scale'), 'C=1, kernel sigmoid, deg 3 gamma scale'),
        (svm.LinearSVC(), 'default'),
        #(svm.LinearSVC(C=10), 'C=10'),
        #(svm.LinearSVC(C=10, max_iter=5000), 'C=10, max_it 5000'),
        (svm.LinearSVC(C=10, max_iter=25000), 'C=10, max_it 25000'),
        #(svm.LinearSVC(C=20, max_iter=5000), 'C=20, max_it 5000'),
        (svm.LinearSVC(C=20, max_iter=25000), 'C=20, max_it 25000'),
        (svm.LinearSVC(C=50, max_iter=25000), 'C=50, max_it 25000'),
        #(svm.LinearSVC(C=10, multi_class='crammer_singer', max_iter=5000), 'C=10, crammer-singer, max_it 5000'),
        (svm.LinearSVC(C=10, multi_class='crammer_singer', max_iter=25000), 'C=10, crammer-singer, max_it 25000'),
        #(svm.LinearSVC(C=20, multi_class='crammer_singer', max_iter=5000), 'C=20, crammer-singer, max_it 5000'),
        (svm.LinearSVC(C=20, multi_class='crammer_singer', max_iter=25000), 'C=20, crammer-singer, max_it 25000'),
        (svm.NuSVC(), 'default'),
        (svm.NuSVC(kernel='linear'), 'nu=0.5, kernel linear'),
        (svm.NuSVC(nu=.75, kernel='linear'), 'nu=0.75, kernel linear'),
        (svm.NuSVC(nu=.25, kernel='linear'), 'nu=0.25, kernel linear'),
        (naive_bayes.MultinomialNB(), 'default'),
        (neural_network.MLPClassifier(), 'default'),
        (neural_network.MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000), 'hidden_layer_size 200, max it 1000'),
        (neural_network.MLPClassifier(activation='logistic', max_iter=500), 'logistic act, 500 iter'),
    ]

    # One positional-argument tuple per model, in evaluate_model's order.
    param_list = [
        (model, Train_X_Tfidf, Train_Y, Val_X_Tfidf, Val_Y, Test_X_Tfidf, Test_Y,
         model_desc, preprocessing_description)
        for model, model_desc in models_to_eval
    ]

    with mp.Pool(11) as p:
        results = p.starmap(evaluate_model, param_list)

    # These two statements must sit at the function's 4-space body indent;
    # a 3-space indent matches no enclosing level and is an IndentationError.
    res_df = pd.concat(results)
    res_df.to_csv(outfile)



In [None]:
# Chunking sweep: every num_chunks value is paired with every chunk_size
# value, with chunk_size varying fastest.
chunk_grid = {
    'num_chunks': [1, 5, 10, 20, 50, 100],
    'chunk_size': [100, 250, 500, 1000, 2000],
}
chunk_list = [
    (n_chunks, size)
    for n_chunks in chunk_grid['num_chunks']
    for size in chunk_grid['chunk_size']
]

# Run the full preprocessing + TF-IDF + model pipeline for each pairing.
for num_chunks, chunk_size in chunk_list:
    print(num_chunks, chunk_size)
    test_one_chunk(num_chunks, chunk_size, x_col='text', tokenized_col='tokenized')

