# ML Excursus
## (Documentation of my Dissertation "The Notion of *surb* in Ancient Armenian Texts from the Fifth Century CE")

This notebook contains several performance tests of different ML classifiers on the annotated Twitter data from the dissertation "The Notion of *surb* in the Ancient Armenian Church" (Thomas Jurczyk). The data itself is not part of the GitHub repository; this file serves only as documentation of the code used to create and test the different ML classifiers.

In [None]:
# loading modules
import pandas as pd
import matplotlib.pyplot as plt
# importing classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# functions for the evaluation of the ML models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
# other modules
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import linspace
from matplotlib import cm

## 1) Loading and Preparing Annotated Twitter Data for the ML Models

In [None]:
# Load the semicolon-separated export and keep only the first 3250 rows,
# i.e. the part of the frame that has been annotated.
# .copy() turns the slice into an independent frame, so the column assignment
# below does not write into a slice of the raw frame (this is what triggers
# pandas' SettingWithCopyWarning).
twitter_annotated = pd.read_csv("annotated_data/full_data_revised.csv", sep=";").iloc[:3250].copy()
# the labels serve as classification targets later on, so store them as integers
twitter_annotated["RON_revised"] = twitter_annotated["RON_revised"].astype("int")

In [None]:
# sanity check: dimensions of the annotated frame, followed by its first rows
print(f"Shape: {twitter_annotated.shape!s}\n")
twitter_annotated.head()

Preparing the (vectorized) frames for the ML models.

In [None]:
# Turn the tweet texts into a TF-IDF document-term matrix (features X) and
# take the revised annotation column as the target vector y.
# NOTE(review): X is fit on every annotated row, including the rows that are
# later held out as the test set, so any split sliced from X carries
# vocabulary/IDF statistics learned from the test texts.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(twitter_annotated.loc[:, "Text"])
y = twitter_annotated.loc[:, "RON_revised"]

In [None]:
# storing approximately 25% of the frame in a later test set X_test, y_test; the rest is stored in X_train, y_train and used for the training of the ML models

In [None]:
# Rows before N_TRAIN are used for training; the remaining rows
# (approximately 25% of the annotated frame) form the final test set.
N_TRAIN = 2438
train_texts = twitter_annotated["Text"].iloc[:N_TRAIN]
test_texts = twitter_annotated["Text"].iloc[N_TRAIN:]

# Fit TF-IDF on the training texts only and merely *transform* the test texts.
# Fitting on the full corpus before splitting would let vocabulary and IDF
# weights learned from the test tweets leak into the training features.
# `vectorizer` is rebound here so that it matches the feature space of
# X_train / X_test (and therefore of every model trained on X_train).
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

y_train, y_test = y.iloc[:N_TRAIN], y.iloc[N_TRAIN:]

## 2) Training and Testing of the Selected ML Models
The evaluation only reports recall, precision, and the F<sub>1</sub> score for each model. Neither a PR/TR curve nor a ROC curve is used to optimize the threshold of models such as the SGDClassifier, for two reasons. First, the optimization of thresholds goes beyond the simple use of ML in this thesis. Second, instance-based models such as KNN have no threshold; they would instead require varying the value of k.

In [None]:
def printScores(y_orig, y_predict):
    '''
        Print precision, recall, F1 score and the confusion matrix of a prediction.

        Helper used by evaluateModelsCV() and evaluateModelsFinal().

        y_orig: the true labels
        y_predict: the labels predicted by a model for the same samples
        Returns nothing; the four values are written to stdout, each followed
        by an empty line.
    '''
    # (label, metric function) pairs; each metric is computed right before it
    # is printed, in exactly this order
    report = (
        ("Precision Score: ", precision_score),
        ("Recall: ", recall_score),
        ("F_1 Score: ", f1_score),
        ("Confusion matrix:\n", confusion_matrix),
    )
    for label, metric in report:
        print(f"{label}{metric(y_orig, y_predict)!s}\n")
    

In [None]:
def evaluateModelsCV(model_list, train_data, label_data, cv=3):
    '''
        Compare classifiers via cross-validated predictions on the training data.

        For every model, out-of-fold predictions are obtained with
        cross_val_predict(); precision, recall and F1 are computed against
        label_data and printed together with the confusion matrix.

        model_list: iterable of (unfitted) classifiers
        train_data: feature matrix used for cross-validation
        label_data: true labels belonging to train_data
        cv: cross-validation setting handed to cross_val_predict()
            (default 3, the value that was previously hard-coded)

        Returns a dict with the keys "best_precision", "best_recall" and
        "best_f1"; each value is a list [str(model), score]. A category keeps
        its placeholder ["model", 0] if no model scores above 0 in it.
    '''
    best_model = {"best_precision": ["model", 0], "best_recall": ["model", 0], "best_f1": ["model", 0]}
    for model in model_list:
        y_train_predict = cross_val_predict(model, train_data, label_data, cv=cv)
        # keyed by the best_model category each score competes in
        scores = {
            "best_precision": precision_score(label_data, y_train_predict),
            "best_recall": recall_score(label_data, y_train_predict),
            "best_f1": f1_score(label_data, y_train_predict),
        }
        # strict ">" keeps the first model in case of a tie
        for category, score in scores.items():
            if score > best_model[category][1]:
                best_model[category] = [str(model), score]
        print("Model: " + str(model) + "\n")
        printScores(label_data, y_train_predict)
    return best_model
        

In [None]:
# seed shared by the estimators that are seeded in this cell
RANDOM_STATE = 42

# the five candidate classifiers; apart from the arguments given here they
# are created with the library defaults
sgd_clf = SGDClassifier(random_state=RANDOM_STATE)
knn_clf = KNeighborsClassifier()
forest_clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
svc_clf = SVC()
logReg_clf = LogisticRegression()

# the evaluation functions iterate over this list in the given order
ml_models_list = [sgd_clf, knn_clf, forest_clf, svc_clf, logReg_clf]

In [None]:
# cross-validated comparison of all classifiers on the training data; the
# returned dict holds the best model for precision, recall, and f1_score
dict_best_models = evaluateModelsCV(
    model_list=ml_models_list,
    train_data=X_train,
    label_data=y_train,
)

In [None]:
# show which model scored best per metric in the cross-validation run
dict_best_models

## 3) Final Test with the Test Data
After the cross-validation in the last step, the final evaluation with the help of the test set is conducted in the following part.

In [None]:
def evaluateModelsFinal(model_list, train_data, train_label, test_data, test_label):
    '''
        Train every classifier on the training set and evaluate it on the test set.

        For every model, precision, recall and F1 of its test-set predictions
        are computed and printed together with the confusion matrix.

        model_list: iterable of classifiers; each one is fitted by this function
        train_data, train_label: features and true labels used for fitting
        test_data, test_label: features and true labels used for evaluation

        Returns a tuple (best_model, overall_stats_dict):
          - best_model: dict with the keys "best_precision", "best_recall" and
            "best_f1"; each value is a list [str(model), score]. A category
            keeps its placeholder ["model", 0] if no model scores above 0.
          - overall_stats_dict: maps the classifier's class name to
            [precision, recall, f1]. Two models of the same class share one
            key, so the later one overwrites the earlier one.
    '''
    best_model = {"best_precision": ["model", 0], "best_recall": ["model", 0], "best_f1": ["model", 0]}
    overall_stats_dict = dict()
    for model in model_list:
        # scikit-learn's fit() returns the fitted estimator itself, so the
        # objects in model_list are fitted as a side effect of this call
        model_trained = model.fit(train_data, train_label)
        y_test_predict = model_trained.predict(test_data)
        # keyed by the best_model category each score competes in;
        # insertion order is precision, recall, f1
        scores = {
            "best_precision": precision_score(test_label, y_test_predict),
            "best_recall": recall_score(test_label, y_test_predict),
            "best_f1": f1_score(test_label, y_test_predict),
        }
        # strict ">" keeps the first model in case of a tie
        for category, score in scores.items():
            if score > best_model[category][1]:
                best_model[category] = [str(model), score]
        print("Model: " + str(model) + "\n")
        printScores(test_label, y_test_predict)
        # adding values to the overall dict; the class name is taken from the
        # type instead of being parsed out of the estimator's repr string
        overall_stats_dict[type(model).__name__] = list(scores.values())
    return (best_model, overall_stats_dict)
        

In [None]:
# final run: fit on the training split, score on the held-out test split;
# yields the best model per metric and the per-classifier statistics
final_best_dict, final_classifiers_dict = evaluateModelsFinal(
    model_list=ml_models_list,
    train_data=X_train,
    train_label=y_train,
    test_data=X_test,
    test_label=y_test,
)

## 4) Plotting the results

In [None]:
# Do not truncate cell contents when displaying frames. None is the supported
# "no limit" value; the former -1 sentinel was deprecated in pandas 1.0 in
# favour of None.
pd.set_option('display.max_colwidth', None)
plt.style.use('fivethirtyeight')
# two palettes: 20 evenly spaced samples from the jet colormap, and the
# 20 discrete colours of tab20
colors = [cm.jet(x) for x in linspace(0.0,1.0,20)]
colors2 = cm.tab20.colors

In [None]:
# classifier names and their F1 scores (third entry of each stats list),
# in the insertion order of final_classifiers_dict
classifiers_x = list(final_classifiers_dict)
f1_scores_y = [measures[2] for measures in final_classifiers_dict.values()]
# create df
# dropping "SVC" due to 0 value; why it returns a zero should be looked into later.
# The row is selected by classifier name rather than by its position (3), so a
# change in the order or length of the model list cannot drop the wrong row.
df_classifiers_f1_plot = (
    pd.DataFrame({"Classifier": classifiers_x, "f1_scores": f1_scores_y})
    .loc[lambda frame: frame["Classifier"] != "SVC"]
    .set_index("Classifier", drop=True)
)
df_classifiers_f1_plot

In [None]:
# bar chart of the F1 scores on an explicitly created 8x8 figure
fig, ax = plt.subplots(figsize=(8, 8))
bars = df_classifiers_f1_plot.plot(kind="bar", legend=False, ax=ax)
# give every bar its own colour from the tab20 palette
for position, bar in enumerate(bars.patches):
    bar.set_color(colors2[position])
fig.tight_layout()
# the classifier names on the ticks are self-explanatory, so no axis label
ax.set_xlabel("")
fig.savefig("current_fig.png")