In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Data/lowercase.csv')
df.head(3)

Unnamed: 0,PairID,QueryID,Comment,Query,Score,WordCountComment,WordCountQuery,MutualUnique,MutualWithRepetition,BOW
0,BookStackApp_BookStack_ActivityService_740,0,daj novu instancu aktivnosti za trenutnog kori...,red sa prioritetom,0,7,3,0,0,0.0
1,BookStackApp_BookStack_ActivityService_740,1,daj novu instancu aktivnosti za trenutnog kori...,pretvaranje string u datum,0,7,4,0,0,0.0
2,BookStackApp_BookStack_ActivityService_740,2,daj novu instancu aktivnosti za trenutnog kori...,sortiranje string liste,0,7,3,0,0,0.0


In [33]:
X = df.iloc[:, 5:]
y = df.iloc[:, 4:5]                              # y = df['Score']
print(y.head(3))
print(X.head(3))

   Score
0      0
1      0
2      0
   WordCountComment  WordCountQuery  MutualUnique  MutualWithRepetition  BOW
0                 7               3             0                     0  0.0
1                 7               4             0                     0  0.0
2                 7               3             0                     0  0.0


In [36]:
y.Score.unique()

array([0, 1, 2, 3], dtype=int64)

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_curve, auc

import matplotlib.pyplot as plt

In [48]:
models = {
    'MNB': MultinomialNB(),
    'LR': LogisticRegression(solver='liblinear'), #LogisticRegression(solver='liblinear', random_state=0, class_weight='balanced')
    'SVM': SVC()                                  #SVC(C = 2, gamma='auto', class_weight='balanced')
}

In [49]:
# Train - ovo nece trebati, jer gore u models dictionary ubacis istrenirane modele
for model_name, clf in models.items():

    print(f'Train model: {model_name}')
    
    models[model_name] = clf.fit(X, y)

Train model: MNB
Train model: LR
Train model: SVM


In [50]:
models

{'MNB': MultinomialNB(),
 'LR': LogisticRegression(solver='liblinear'),
 'SVM': SVC()}

In [58]:
def model_report(X, y, models, data_type):

    report_df = pd.DataFrame(columns=['DataType', 'Model Name', 'precision_micro', 'precision_macro', 'precision_weighted', 
                                    'recall_micro', 'recall_macro', 'recall_weighted', 'F1_micro', 'F1_macro', 'F1_weighted',
                                    'Accuracy', 'auc_class_0', 'auc_class_1', 'auc_class_2', 'auc_class_3'])

    n_classes = len(set(y)) # number of classes

    for model_name, clf in models.items():
        
        # Train Model
        # clf = clf.fit(X, y)

        # Predict 
        y_pred = clf.predict(X)


        # Scores
        precision_score_micro = precision_score(y, y_pred, average='micro')
        precision_score_macro = precision_score(y, y_pred, average='macro')
        precision_score_weighted = precision_score(y, y_pred, average='weighted')

        recall_score_micro = recall_score(y, y_pred, average='micro')
        recall_score_macro = recall_score(y, y_pred, average='macro')
        recall_score_weighted = recall_score(y, y_pred, average='weighted')

        f1_score_micro = f1_score(y, y_pred, average='micro')
        f1_score_macro = f1_score(y, y_pred, average='macro')
        f1_score_weighted = f1_score(y, y_pred, average='weighted')

        accuracy = accuracy_score(y, y_pred)

        try:
            y_pred_proba = clf.predict_proba(X)
            fpr = dict()
            tpr = dict()
            roc_auc = dict()

            for i in range(n_classes):
                fpr[i], tpr[i], _ = roc_curve(y, y_pred_proba[:, i], pos_label=i) # y -> [1, 2, 0, 1 ...], y_pred_proba -> [[0.1, 0.0, 0.6, 0.3], [0.2, .0.5, 0.1, 0.2],...]
                roc_auc[i] = auc(fpr[i], tpr[i])

        except Exception as e:
            for i in range(n_classes):
                roc_auc[i] = None
                
        report_df.loc[len(report_df)] = [data_type, model_name, precision_score_micro, precision_score_macro, precision_score_weighted,
                                        recall_score_micro, recall_score_macro, recall_score_weighted,
                                    f1_score_micro, f1_score_macro, f1_score_weighted, accuracy, roc_auc[0], roc_auc[1], roc_auc[2], roc_auc[3]]
       
    # Plotting Roc
    plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
    plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
    plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
    plt.plot(fpr[3], tpr[3], linestyle='--',color='yellow', label='Class 3 vs Rest')
    plt.title(f'Multiclass ROC curve {data_type}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.savefig(f'Multiclass ROC_{data_type}', dpi=300)
    plt.close()

    report_df = report_df.sort_values(by=['auc_class_0', 'auc_class_1', 'auc_class_2', 'auc_class_3', 'Accuracy']).reset_index(drop=True)

    return report_df

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y['Score'], test_size=0.25, random_state=42)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

print(f'Train: {X_train.shape}')
print(f'Test: {X_test.shape}')
print(f'Valid: {X_valid.shape}')

report_train_df = model_report(X=X_train, y=y_train, models=models, data_type='train')
report_test_df = model_report(X=X_test, y=y_test, models=models, data_type='test')
report_valid_df = model_report(X=X_valid, y=y_valid, models=models, data_type='valid')
report_all_df = model_report(X=X, y=y['Score'], models=models, data_type='all_data')


report_df = pd.concat([report_train_df, report_test_df, report_valid_df, report_all_df], axis=0)
report_df

Train: (102312, 5)
Test: (17052, 5)
Valid: (17052, 5)


Unnamed: 0,DataType,Model Name,precision_micro,precision_macro,precision_weighted,recall_micro,recall_macro,recall_weighted,F1_micro,F1_macro,F1_weighted,Accuracy,auc_class_0,auc_class_1,auc_class_2,auc_class_3
0,train,MNB,0.981625,0.25018,0.989623,0.981625,0.26797,0.981625,0.981625,0.250432,0.985576,0.981625,0.641243,0.638006,0.602983,0.688041
1,train,LR,0.994712,0.498678,0.990361,0.994712,0.25266,0.994712,0.994712,0.2546,0.992085,0.994712,0.688009,0.679624,0.673817,0.742045
2,train,SVM,0.994702,0.248676,0.989433,0.994702,0.25,0.994702,0.994702,0.249336,0.992061,0.994702,,,,
0,test,MNB,0.982935,0.251188,0.991115,0.982935,0.2885,0.982935,0.982935,0.252181,0.986958,0.982935,0.676315,0.68162,0.659821,0.698528
1,test,LR,0.995426,0.248856,0.990872,0.995426,0.25,0.995426,0.995426,0.249427,0.993144,0.995426,0.701159,0.703219,0.693767,0.720794
2,test,SVM,0.995426,0.248856,0.990872,0.995426,0.25,0.995426,0.995426,0.249427,0.993144,0.995426,,,,
0,valid,MNB,0.984166,0.248976,0.991759,0.984166,0.24707,0.984166,0.984166,0.24802,0.987948,0.984166,0.65275,0.635264,0.664487,0.63149
1,valid,LR,0.995836,0.248959,0.99169,0.995836,0.25,0.995836,0.995836,0.249478,0.993759,0.995836,0.669584,0.634984,0.730523,0.698391
2,valid,SVM,0.995836,0.248959,0.99169,0.995836,0.25,0.995836,0.995836,0.249478,0.993759,0.995836,,,,
0,all_data,MNB,0.982106,0.250172,0.990077,0.982106,0.267591,0.982106,0.982106,0.250378,0.986045,0.982106,0.646466,0.642614,0.617882,0.682658
