# 1. Load Data

In [None]:
import pandas as pd
import os
import docx

In [None]:
# Root folder that holds the synthesised training documents.
data_dir = r"C:\Users\tubs\OneDrive\Desktop\Code\Heron Data\join-the-siege\files\synthesised_training_data"

# Flatten the recursive directory walk into a single list of full file paths.
file_list = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(data_dir)
    for name in names
]
file_list

In [None]:
# Number of file paths collected by the directory walk.
len(file_list)

In [None]:
# Read every supported document into memory, keyed by its full path.
# Files whose extension is neither .txt nor .docx are skipped.
file_contents = {}
for file in file_list:
    if file.endswith('.txt'):
        # Decode explicitly so the result does not depend on the platform's
        # default encoding; undecodable bytes are replaced instead of aborting
        # the whole load.
        with open(file, 'r', encoding='utf-8', errors='replace') as f:
            file_contents[file] = f.read()
    elif file.endswith('.docx'):
        # Join the document's paragraphs into one newline-separated string.
        doc = docx.Document(file)
        file_contents[file] = '\n'.join(p.text for p in doc.paragraphs)
len(file_contents)

In [None]:
# Display the path -> document text mapping.
file_contents

In [None]:
# One row per document: the path (former dict key) first, the raw text second.
columns = ['file_path', 'file_contents']
files_df = pd.DataFrame.from_dict(file_contents, orient='index')
files_df = files_df.reset_index()  # move the path out of the index into a column
files_df.columns = columns
files_df

In [None]:
# Let pandas show up to 125 characters per cell before truncating.
pd.set_option('display.max_colwidth', 125)

In [None]:
# A document's category is the name of the folder that directly contains it.
files_df['file_category'] = files_df['file_path'].apply(
    lambda path: os.path.basename(os.path.dirname(path))
)
files_df

In [None]:
# Number of documents per category (parent-folder name).
files_df['file_category'].value_counts()

In [None]:
# Distinct category values present in the data.
files_df['file_category'].unique()

In [None]:
# Folder name -> human-readable class label.
category_to_label_dict = {
    'acad_rep_docx': 'academic_report',
    'bus_rep_docx': 'business_report',
    'eml_txt': 'email',
    'fml_let_docx': 'formal_letter',
}

# Plain indexing (not .get) so an unmapped folder name raises KeyError.
files_df['file_label'] = [
    category_to_label_dict[category] for category in files_df['file_category']
]
files_df

# 2. Prepare Data

In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
import pickle

In [None]:
# Keep only the model input and target, renamed to a generic text/label pair.
columns = ['text', 'label']
contents_labels_df = files_df.loc[:, ['file_contents', 'file_label']]
contents_labels_df.columns = columns
contents_labels_df

In [None]:
# Hold out a validation set (test_size is left at scikit-learn's default).
# random_state pins the shuffle so the reported metrics can be reproduced;
# stratify keeps the label proportions the same in both splits.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    contents_labels_df['text'],
    contents_labels_df['label'],
    random_state=42,
    stratify=contents_labels_df['label'],
)

print(f'''
    Training Set:   {len(train_x)}
    Validation Set: {len(valid_x)}
''')

In [None]:
# Inspect the validation labels (not yet integer-encoded).
valid_y

In [None]:
# Learn the label -> integer mapping from the training labels only, then reuse
# that same mapping for the validation labels. Re-fitting on the validation
# labels could assign different integers if the two splits do not contain
# exactly the same set of classes.
encoder = preprocessing.LabelEncoder()
train_y_encoded = encoder.fit_transform(train_y)
valid_y_encoded = encoder.transform(valid_y)

### Map Labels to Encoded Labels

In [None]:
# Pair each validation row's integer code with its original string label.
encoding_zip = [
    (code, label)
    for code, label in zip(valid_y_encoded.tolist(), list(valid_y))
]
encoding_zip

In [None]:
# Collapse the duplicate (code, label) pairs into an integer code -> label lookup.
encoding_map = {code: label for code, label in set(encoding_zip)}
encoding_map

# 3. Feature Engineering

## 3.1. Count Vectors

In [None]:
# Bag-of-words counts. The vocabulary is learned from the training split only,
# so the validation score is not influenced by words that appear only in
# validation documents.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')

# fit on the training documents, then apply the same vocabulary to validation
train_x_count = count_vect.fit_transform(train_x)
valid_x_count = count_vect.transform(valid_x)

## 3.2. TF-IDF Vectors

In [None]:
# Each vectorizer learns its vocabulary and IDF weights from the training split
# only; the validation split is transformed with the already-fitted vectorizer
# so no validation statistics leak into the features.

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
train_x_tfidf = tfidf_vect.fit_transform(train_x)
valid_x_tfidf = tfidf_vect.transform(valid_x)

# ngram level tf-idf (word bigrams and trigrams)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
train_x_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x)
valid_x_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf (character 2- and 3-grams)
# token_pattern is only consulted by the word analyzer, so it is not passed here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
train_x_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(train_x)
valid_x_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

# 4. Model Training

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, label_valid=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: Training feature matrix.
        label: Training labels aligned with ``feature_vector_train``.
        feature_vector_valid: Validation feature matrix.
        label_valid: True labels for ``feature_vector_valid``. When omitted,
            the notebook-level ``valid_y_encoded`` is used, so existing
            four-argument calls behave as before.

    Returns:
        The accuracy computed by ``metrics.accuracy_score``.
    """
    if label_valid is None:
        label_valid = valid_y_encoded

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    # scikit-learn metrics take (y_true, y_pred), in that order
    return metrics.accuracy_score(label_valid, predictions)

## 4.1. Naive Bayes

In [None]:
# Score a fresh Multinomial Naive Bayes model on each feature representation.
nb_runs = [
    ("NB, Count Vectors: ", train_x_count, valid_x_count),
    ("NB, WordLevel TF-IDF: ", train_x_tfidf, valid_x_tfidf),
    ("NB, N-Gram Vectors: ", train_x_tfidf_ngram, valid_x_tfidf_ngram),
    ("NB, CharLevel Vectors: ", train_x_tfidf_ngram_chars, valid_x_tfidf_ngram_chars),
]
for run_name, run_train, run_valid in nb_runs:
    accuracy = train_model(naive_bayes.MultinomialNB(), run_train, train_y_encoded, run_valid)
    print(run_name, accuracy)

## 4.2. Logistic Regression

In [None]:
# Linear Classifier on Count Vectors
# Score a fresh logistic-regression classifier on each feature representation.
lr_runs = [
    ("LR, Count Vectors: ", train_x_count, valid_x_count),
    ("LR, WordLevel TF-IDF: ", train_x_tfidf, valid_x_tfidf),
    ("LR, N-Gram Vectors: ", train_x_tfidf_ngram, valid_x_tfidf_ngram),
    ("LR, CharLevel Vectors: ", train_x_tfidf_ngram_chars, valid_x_tfidf_ngram_chars),
]
for run_name, run_train, run_valid in lr_runs:
    accuracy = train_model(linear_model.LogisticRegression(), run_train, train_y_encoded, run_valid)
    print(run_name, accuracy)

In [None]:
# Train the model that will be saved: logistic regression on word-level TF-IDF.
lr_classifier = linear_model.LogisticRegression()
lr_classifier.fit(train_x_tfidf, train_y_encoded)
validation_predictions = lr_classifier.predict(valid_x_tfidf)

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall trade places and the 'weighted' average uses the
# predicted class counts instead of the true ones.
print(f'''
Model Performance:
    Accuracy: {metrics.accuracy_score(valid_y_encoded, validation_predictions)}
    Precision: {metrics.precision_score(valid_y_encoded, validation_predictions, average='weighted')}
    Recall: {metrics.recall_score(valid_y_encoded, validation_predictions, average='weighted')}
    F1: {metrics.f1_score(valid_y_encoded, validation_predictions, average='weighted')}
''')

Model Performance:
- Accuracy: 0.9733333333333334
- Precision: 0.9754666666666667
- Recall: 0.9733333333333334
- F1: 0.973210922787194

# 5. Looking at Validation Predictions

In [None]:
import random

In [None]:
# Integer code -> label name lookup, used below to decode a prediction.
encoding_map

In [None]:
# Plain list of the validation documents so they can be indexed by position.
validation_contents = [document for document in valid_x]
validation_contents

In [None]:
# Convert the predicted codes from NumPy scalars to built-in Python values.
validation_predictions_ints = [code.item() for code in validation_predictions]
validation_predictions_ints

In [None]:
# Pick one validation document at random and show what the model predicted for it.
check_prediction = random.choice(range(len(validation_predictions)))
predicted_label = encoding_map[validation_predictions_ints[check_prediction]]
document_text = validation_contents[check_prediction]

print(f'Prediction:\n{predicted_label}')
print(f'\nContents:\n{document_text}')

# 6. Save

In [None]:
# The code -> label mapping that is pickled further down.
encoding_map

In [None]:
# The fitted logistic-regression classifier that is pickled in the next cell.
lr_classifier

In [None]:
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
# NOTE(review): only the classifier is saved here; predicting on new text also
# needs the fitted tfidf_vect - confirm it is persisted or rebuilt elsewhere.
model_file_name = '../../files/trained_models/logistic_regression_acc_97_3.sav'
with open(model_file_name, 'wb') as model_file:
    pickle.dump(lr_classifier, model_file)

In [None]:
# Persist the integer code -> label lookup next to the model. The context
# manager guarantees the file is flushed and closed.
encoding_map_file_name = '../../files/trained_models/encoding_map.sav'
with open(encoding_map_file_name, 'wb') as encoding_map_file:
    pickle.dump(encoding_map, encoding_map_file)