In [1]:
# Library initialization
import sklearn as sk
import pandas as pd
import numpy as np
import math

from prettytable import PrettyTable
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [2]:
# Load the pre-filtered dataset (semicolon-separated) and split it into
# train/test partitions with a fixed seed so the split is reproducible.
df = pd.read_csv(
    '../Data/FilteredData.csv',
    sep=';',
    index_col=False,
)
training_data, testing_data = train_test_split(df, random_state=2000)

# NOTE(review): training labels are read from 'Class' while test labels are
# read from 'True class' -- confirm that using two different label columns
# is intentional.
Y_train = training_data['Class'].values
Y_test = testing_data['True class'].values

# Preview the first rows of the full frame.
df.head(7)

Unnamed: 0,CleanedText,TokenizedText,TF-IDF,TF-IDF sum,Words quantity,TF-IDF mean,Class,True class
0,runner around world post photo wearing space r...,"['runner', 'runner', 'explorer', 'runner', 'se...",[1.43509015 3.43247292 0.53856938 8.20984681 0...,3.446952,25,2.83091,2,2
1,wfh time pfh read prachar politics home politi...,"['time', 'high', 'time', 'contestant', 'creati...",[0.55672677 0.53856938 0.77486174 0.53856938 0...,2.285503,20,0.589459,1,1
2,bts system last collection accessory product t...,"['product', 'two', 'hat', 'two', 'hat']",[0.53856938 0.88629027 0.95672677 1.96377609 2...,3.107692,14,1.326736,2,2
3,manohar parrikar always idol resigning party t...,"['parrikar', 'party', 'party', 'parrikar', 'ra...",[1.07713877 0.67713877 1.14629228 2.90056981 0...,3.272661,12,1.267942,1,1
4,day 1394 running min 10 km day total 16448 km ...,"['km', 'day', '16448', 'km', '03']",[1.07713877 0.26622382 0.53856938 2.90056981 0...,3.04975,13,1.064214,0,0
5,win italy 1 66 switzerland 6 0 draw 3 85,"['italy', '66', 'switzerland', 'draw', '85']",[0.53856938 0.53856938 0.53856938 0.53856938 0...,2.315468,10,0.538569,0,0
6,cannot reproduced easily economically able bri...,"['easily', 'bring', 'highest', 'shortest', 'po...",[0.53856938 0.53856938 0.53856938 0.53856938 0...,2.374296,11,0.538569,2,2


In [3]:
# TF-IDF Transformation
# The vectorizer is fitted on the training text only, so the vocabulary and
# IDF weights are learned without looking at the test partition.
tf_idf = TfidfVectorizer(use_idf=True, max_df=0.95)

# fit_transform learns the vocabulary and returns the training matrix in a
# single pass; the previous code discarded this result and then transformed
# the same training text a second time.
X_train = tf_idf.fit_transform(training_data['CleanedText'].values)

# The test text is only transformed with the already-fitted vectorizer.
X_test = tf_idf.transform(testing_data['CleanedText'].values)

In [4]:
# Logistic Regression
# Fit an L2-regularised liblinear model on the TF-IDF training matrix.
scikit_log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)
model = scikit_log_reg.fit(X_train, Y_train)

# Per-class probabilities for every test row.
probs = model.predict_proba(X_test)

# argsort is ascending, so the last three columns of each row hold the
# indices of the three most probable classes (least probable of them first).
best_n = np.argsort(probs, axis=1)[:, -3:]

# Map class indices to class labels, walking each row backwards so that the
# most probable label comes first in every prediction list.
preds = [
    [model.classes_[class_idx] for class_idx in top_indices[::-1]]
    for top_indices in best_n
]

[LibLinear]iter  1 act 3.466e+02 pre 3.288e+02 delta 1.259e+00 f 1.279e+03 |g| 5.521e+02 CG   1
cg reaches trust region boundary
iter  2 act 5.509e+01 pre 5.443e+01 delta 3.429e+00 f 9.323e+02 |g| 6.922e+01 CG   2
cg reaches trust region boundary
iter  3 act 1.201e+02 pre 1.220e+02 delta 9.767e+00 f 8.772e+02 |g| 4.821e+01 CG   2
cg reaches trust region boundary
iter  4 act 2.065e+02 pre 2.017e+02 delta 1.321e+01 f 7.571e+02 |g| 6.871e+01 CG   2
iter  5 act 2.886e+01 pre 2.817e+01 delta 1.321e+01 f 5.506e+02 |g| 2.645e+01 CG   2
iter  6 act 4.646e-01 pre 4.630e-01 delta 1.321e+01 f 5.217e+02 |g| 5.867e+00 CG   2
iter  7 act 2.762e-02 pre 2.760e-02 delta 1.321e+01 f 5.213e+02 |g| 3.510e-01 CG   3
iter  8 act 1.575e-04 pre 1.576e-04 delta 1.321e+01 f 5.212e+02 |g| 2.576e-02 CG   3
iter  1 act 3.803e+02 pre 3.592e+02 delta 1.322e+00 f 1.279e+03 |g| 5.771e+02 CG   1
cg reaches trust region boundary
iter  2 act 6.082e+01 pre 5.998e+01 delta 3.513e+00 f 8.986e+02 |g| 7.580e+01 CG   2
cg reac

In [5]:
# Pair every test row's true label with its list of predicted labels.
# Each entry has the shape [[true_label], predicted_labels]; the true label
# is wrapped in a one-element list.
eval_items = []
for row_idx, predicted_labels in enumerate(preds):
    eval_items.append([[Y_test[row_idx]], predicted_labels])

In [6]:
# Calculation of accuracy and MRR
# Every entry of eval_items is a pair [true_labels, predicted_labels].
# Both divisions below raise ZeroDivisionError when eval_items is empty.
n_items = len(eval_items)

# Accuracy: an item counts as correct when at least one of its true labels
# appears anywhere among its predicted labels.
correct = 0
for true_labels, predicted_labels in eval_items:
    predicted_set = set(predicted_labels)
    if any(label in predicted_set for label in true_labels):
        correct += 1
accuracy = correct / n_items

# MRR: for each item take 1 / (1-based position of the first predicted label
# that is a true label); items without any matching prediction contribute 0.
rr_total = 0
for true_labels, predicted_labels in eval_items:
    first_hit_rank = next(
        (
            rank
            for rank, label in enumerate(predicted_labels, start=1)
            if label in true_labels
        ),
        None,
    )
    if first_hit_rank is not None:
        rr_total += 1 / float(first_hit_rank)

# The mean is taken once, after the loop. The previous code recomputed it on
# every iteration using the misleading expression `rr_total / 1/float(n)`.
mrr = rr_total / n_items
    

In [7]:
# Report the fitted model together with the two evaluation metrics.
print(f"Model= {model}  Accuracy= {accuracy}  MRR= {mrr}")

Model= LogisticRegression(C=5, max_iter=1000, random_state=0, solver='liblinear',
                   verbose=1)  Accuracy= 0.8709677419354839  MRR= 0.7580645161290321
