In [1]:
# Library initialization
import sklearn as sk
import pandas as pd
import numpy as np
import math

from prettytable import PrettyTable
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Load the semicolon-separated dataset and hold out a test split
# (train_test_split's default proportions, fixed seed for repeatability).
df = pd.read_csv(
    'Data/FilteredData.csv',
    sep=';',
    index_col=False,
)
training_data, testing_data = train_test_split(df, random_state=2000)

# Models are fitted on the 'Class' column but scored against 'True class'.
# NOTE(review): this asymmetry looks deliberate (the results cell also
# compares 'Class' with 'True class' as a baseline) -- confirm.
Y_train = training_data['Class'].values
Y_test = testing_data['True class'].values

# Preview the first rows of the full frame.
df.head(7)

Unnamed: 0,CleanedText,TokenizedText,Words quantity,Class,True class
0,cut thrust ash cricket beer reminisce cricket ...,"['cut', 'thrust', 'ash', 'cricket', 'beer', 'r...",8,0,0
1,win italy 1 66 switzerland 6 0 draw 3 85,"['win', 'italy', '1', '66', 'switzerland', '6'...",10,0,0
2,laporta becomes member rfef council,"['laporta', 'becomes', 'member', 'rfef', 'coun...",5,0,0
3,done eliud kipchoge achieves impossible run ma...,"['done', 'eliud', 'kipchoge', 'achieves', 'imp...",15,0,0
4,know first time said 24 page single mention wo...,"['know', 'first', 'time', 'said', '24', 'page'...",12,0,0
5,finish pencil work anthony oluwafemi olaseni j...,"['finish', 'pencil', 'work', 'anthony', 'oluwa...",24,0,0
6,greeting sport industry award,"['greeting', 'sport', 'industry', 'award']",4,0,0


In [10]:
# TF-IDF transformation
# The vectorizer is fitted on the training text only, so the vocabulary and
# IDF weights never see the test split. fit_transform() fits and returns the
# training matrix in a single pass (previously the fit_transform() result was
# thrown away and the same text was transformed a second time).
tf_idf = TfidfVectorizer(use_idf=True, max_df=0.95)
X_train = tf_idf.fit_transform(training_data['CleanedText'].values)
# Test text is only projected onto the vocabulary learned above.
X_test = tf_idf.transform(testing_data['CleanedText'].values)

In [4]:
# Calculation of accuracy
def acc_C(eval_items):
    """Score ranked predictions against true labels.

    Parameters
    ----------
    eval_items : sequence
        Each element is indexable with ``item[0]`` holding the collection of
        true labels and ``item[1]`` holding the predicted labels, best first.
        Must support ``len()``.

    Returns
    -------
    tuple of (float, float)
        ``accuracy`` -- ``(2 * hits + 3 * n) / (5 * n)``, where ``hits`` is the
        number of items with at least one predicted label among the true
        labels and ``n`` is ``len(eval_items)``.
        ``mrr`` -- mean over all items of ``1 / rank`` of the first predicted
        label that is a true label (an item with no hit contributes 0).

    Raises
    ------
    ZeroDivisionError
        If ``eval_items`` is empty.
    """
    total = len(eval_items)
    hits = 0
    rr_total = 0.0

    for item in eval_items:
        true_labels, ranked_preds = item[0], item[1]

        # 1-based rank of the first prediction that is a true label, if any.
        first_hit_rank = next(
            (rank for rank, label in enumerate(ranked_preds, start=1)
             if label in true_labels),
            None,
        )
        if first_hit_rank is not None:
            hits += 1
            # Reciprocal rank: rr = 1 / rank of the first correct prediction.
            rr_total += 1.0 / first_hit_rank

    # NOTE(review): this is the plain hit rate rescaled as 0.6 + 0.4 * hits / n,
    # so it can never drop below 60%. Kept as-is because the reported numbers
    # depend on it -- confirm the weighting is intended.
    accuracy = (2 * hits + 3 * total) / (5 * total)

    # Mean reciprocal rank, computed once after all items are accumulated.
    mrr = rr_total / total

    return accuracy, mrr

In [5]:
# Candidate classifiers, evaluated in this order.
models = [
    # Logistic Regression
    LogisticRegression(verbose=1, solver='newton-cg', random_state=0, C=5, penalty='l2', max_iter=1000),
    # KNN
    KNeighborsClassifier(n_neighbors=7),
    # Random forest (seeded like the other stochastic models so that the
    # reported scores are reproducible across runs)
    RandomForestClassifier(n_estimators=200, random_state=0),
    # Naive Bayes
    MultinomialNB(),
    # Boosting
    GradientBoostingClassifier(n_estimators=200, random_state=0, learning_rate=1.0, max_depth=1),
]

# One [fitted_model, accuracy, mrr] entry per classifier.
result = []
for model in models:
    model = model.fit(X_train, Y_train)
    probs = model.predict_proba(X_test)

    # Column indices of the two highest-probability classes per sample
    # (argsort is ascending, so the best class is in the last column).
    best_n = np.argsort(probs, axis=1)[:, -2:]
    # Map indices to class labels and reverse each row so the best is first.
    preds = [[model.classes_[predicted_cat] for predicted_cat in prediction[::-1]]
             for prediction in best_n]

    # Making a comparative list of truth and predictions
    eval_items = [[[Y_test[idx]], pred] for idx, pred in enumerate(preds)]

    # Score once and reuse both metrics.
    accuracy, mrr = acc_C(eval_items)
    result.append([model, accuracy, mrr])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [6]:
# Baseline: treat each row's 'Class' value as the truth and its 'True class'
# value as a single prediction, to measure how well the two columns agree.
# Rows are paired positionally.
eval_items = [
    [[given_label], [true_label]]
    for given_label, true_label in zip(df['Class'], df['True class'])
]
# Score the baseline once and reuse both metrics.
data_accuracy, data_mrr = acc_C(eval_items)

table = PrettyTable()
table.field_names = ["Model", "Accuracy", "MRR"]
table.add_row(['Data', "{0:.3f}%".format(data_accuracy*100), "{0:.3f}%".format(data_mrr*100)])
# One row per evaluated model; the display name is the estimator's class name
# (the text before the first '(' in its repr).
for fitted_model, model_accuracy, model_mrr in result:
    table.add_row([str(fitted_model).split('(')[0], "{0:.3f}%".format(model_accuracy*100), "{0:.3f}%".format(model_mrr*100)])
print(table)

+----------------------------+----------+---------+
|           Model            | Accuracy |   MRR   |
+----------------------------+----------+---------+
|            Data            | 97.485%  | 93.712% |
|     LogisticRegression     | 90.323%  | 65.726% |
|    KNeighborsClassifier    | 89.032%  | 58.871% |
|   RandomForestClassifier   | 87.742%  | 55.645% |
|       MultinomialNB        | 90.645%  | 64.919% |
| GradientBoostingClassifier | 87.742%  | 63.306% |
+----------------------------+----------+---------+
