In [68]:
from datetime import datetime

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from utility import classification

In [65]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings
# from the solver runs further down — confirm that is intended.
warnings.filterwarnings('ignore')

In [66]:
# Training sets, one CSV per preprocessing variant.
files = [
    'data/training/no_preprocessing.csv',
    'data/training/freq_filt.csv',
    'data/training/lowercase.csv',
    'data/training/ngrams.csv',
    'data/training/stem.csv',
    'data/training/tf_ponderisanje.csv',
]

# Values of the `C` hyperparameter tried for logistic regression and for the SVM.
C = [0.03, 0.1, 1, 10, 30]
C_SVC = [0.25, 1, 4]

# Dataset key = file name without directory or extension
# ('data/training/stem.csv' -> 'stem').
dataset_keys = [path.split('/')[-1].split('.')[0] for path in files]

# Containers for fitted models: a flat list per dataset for MNB, and one list
# per (dataset, C value) for the two swept model families.
classifiers = {
    'MNB': {key: [] for key in dataset_keys},
    'LogReg': {key: {strength: [] for strength in C} for key in dataset_keys},
    'SVM': {key: {strength: [] for strength in C_SVC} for key in dataset_keys},
}

In [74]:
# Long-format results table: one row per (dataset, classifier, hyperparameter, metric).
Classification_Results = pd.DataFrame(
    columns=['dataset', 'classifier', 'hyperparameter', 'metric', 'value']
)

# Load every training CSV, keyed by its file name without directory or extension.
dataframes = dict(
    (csv_path.split('/')[-1].split('.')[0], pd.read_csv(csv_path))
    for csv_path in files
)


In [None]:
# Metric label -> scoring function; every score below is macro-averaged.
FOLD_METRICS = (
    ('recall', recall_score),
    ('precision', precision_score),
    ('fscore', f1_score),
)

# 10-fold stratified cross-validation of logistic regression and a linear SVM on
# every preprocessing variant. Each fitted model is appended to `classifiers`
# and each fold adds one row per metric to `Classification_Results`.
for data in dataframes:
    y = dataframes[data]['Score'].copy()
    X = dataframes[data].drop(columns=['PairID', 'QueryID', 'Comment', 'Query', 'Score']).copy()
    # NOTE(review): the scaler is fitted on all rows before the folds are cut,
    # so every test fold influences the scaling statistics — consider fitting
    # it inside the fold loop on X_train only.
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    skf = StratifiedKFold(n_splits=10)

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        # The splitter yields row positions, so select by position rather than
        # by index label.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Logistic regression
        for c in C:
            clf = LogisticRegression(
                penalty='l2',
                C=c,
                multi_class='multinomial',
                solver='saga',
                class_weight='balanced'
            ).fit(X_train, y_train)

            classifiers['LogReg'][data][c].append(clf)

            y_pred = clf.predict(X_test)

            for metric_name, metric_fn in FOLD_METRICS:
                Classification_Results.loc[len(Classification_Results)] = [
                    data, 'LogReg', c, metric_name, metric_fn(y_test, y_pred, average='macro')
                ]
        print('Done with LogReg')

        # MNB (disabled; metric labels corrected so it is right if re-enabled)
#         clf = MultinomialNB().fit(X_train, y_train)
#         classifiers['MNB'][data].append(clf)
#         y_pred = clf.predict(X_test)
#         Classification_Results.loc[len(Classification_Results)] = [
#             data, 'MNB', 0, 'recall', recall_score(y_test, y_pred, average='macro')
#         ]
#         Classification_Results.loc[len(Classification_Results)] = [
#             data, 'MNB', 0, 'precision', precision_score(y_test, y_pred, average='macro')
#         ]
#         Classification_Results.loc[len(Classification_Results)] = [
#             data, 'MNB', 0, 'fscore', f1_score(y_test, y_pred, average='macro')
#         ]

        # SVC — sweep C_SVC, the grid `classifiers['SVM']` is keyed by; iterating
        # the logistic-regression grid `C` raised KeyError on its first value.
        for c in C_SVC:
            clf = SVC(
                C=c,
                kernel='linear',
                class_weight='balanced'
            ).fit(X_train, y_train)

            classifiers['SVM'][data][c].append(clf)

            y_pred = clf.predict(X_test)

            # Each row carries its own metric label (previously all three were
            # stored as 'recall').
            for metric_name, metric_fn in FOLD_METRICS:
                Classification_Results.loc[len(Classification_Results)] = [
                    data, 'SVM', c, metric_name, metric_fn(y_test, y_pred, average='macro')
                ]
        print('Done with SVC')

    print('Done with {}'.format(data))
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)
    

Done with LogReg


In [None]:
# SVC-only variant of the cross-validation sweep over every preprocessing variant.
# NOTE(review): the preceding cell also contains an SVC sweep; running both
# appends SVM models and result rows twice.
for data in dataframes:
    y = dataframes[data]['Score'].copy()
    X = dataframes[data].drop(columns=['PairID', 'QueryID', 'Comment', 'Query', 'Score']).copy()
    # NOTE(review): scaling is fitted on all rows before the folds are cut, so
    # test folds influence the scaling statistics.
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    skf = StratifiedKFold(n_splits=10)

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        # The splitter yields row positions, so select by position rather than
        # by index label.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # SVC
        for c in C_SVC:
            clf = SVC(
                C=c,
                kernel='linear',
                class_weight='balanced'
            ).fit(X_train, y_train)

            classifiers['SVM'][data][c].append(clf)

            y_pred = clf.predict(X_test)

            # One row per metric, each stored under its own label (precision
            # and F1 were previously recorded as 'recall').
            fold_scores = (
                ('recall', recall_score(y_test, y_pred, average='macro')),
                ('precision', precision_score(y_test, y_pred, average='macro')),
                ('fscore', f1_score(y_test, y_pred, average='macro')),
            )
            for metric_name, metric_value in fold_scores:
                Classification_Results.loc[len(Classification_Results)] = [
                    data, 'SVM', c, metric_name, metric_value
                ]
        print('Done with SVC')

In [58]:
# Scratch check: the dataset key is the file name without directory or extension.
'data/training/no_preprocessing.csv'.split('/')[-1].split('.')[0]

'no_preprocessing'

## No preprocessing

In [2]:
# Load the unpreprocessed training set on its own for the single-dataset experiments below.
df = pd.read_csv('data/training/no_preprocessing.csv')
# Preview the first three rows.
df.head(3)

Unnamed: 0,PairID,QueryID,Comment,Query,Score,WordCountComment,WordCountQuery,MutualUnique,MutualWithRepetition,BOW
0,BookStackApp_BookStack_ActivityService_740,0,Daj novu instancu aktivnosti za trenutnog kori...,red sa prioritetom,0,7,3,0,0,0.0
1,BookStackApp_BookStack_ActivityService_740,1,Daj novu instancu aktivnosti za trenutnog kori...,pretvaranje string u datum,0,7,4,0,0,0.0
2,BookStackApp_BookStack_ActivityService_740,2,Daj novu instancu aktivnosti za trenutnog kori...,sortiranje string liste,0,7,3,0,0,0.0


In [48]:
from sklearn.preprocessing import StandardScaler
# Numeric features and the relevance score used as the target.
features = df[['WordCountComment', 'WordCountQuery', 'MutualUnique', 'MutualWithRepetition', 'BOW']]
y = df['Score']

# `X` stays a standardised ndarray of all rows because the cross-validation
# cell further down indexes it positionally (X[train_index]). The scaler is
# created here before use; the original called `scaler.fit` before any scaler
# existed.
X = StandardScaler().fit_transform(features)

# The hold-out split is cut from the raw features and scaled with statistics
# from the training rows only, so the test rows never influence the scaling.
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=0)
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


### Logistic regression

In [4]:
from sklearn.linear_model import LogisticRegression

#### Upoređivanje L1 i L2 regularizacije

In [49]:
# Fit three logistic-regression variants on the same training split, all with
# balanced class weights and max_iter=200, and predict the hold-out split.

# L1 penalty, saga solver.
l1 = LogisticRegression(penalty='l1', solver='saga', class_weight='balanced', max_iter=200).fit(X_train, y_train)
y_pred_l1 = l1.predict(X_test)

# L2 penalty, saga solver.
l2 = LogisticRegression(penalty='l2', solver='saga', class_weight='balanced', max_iter=200).fit(X_train, y_train)
y_pred_l2 = l2.predict(X_test)

# L2 penalty, lbfgs solver.
l2_2 = LogisticRegression(penalty='l2', solver='lbfgs', class_weight='balanced', max_iter=200).fit(X_train, y_train)
# Predict with the lbfgs model itself; predicting with `l2` here merely
# duplicated y_pred_l2.
y_pred_v2 = l2_2.predict(X_test)




In [50]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out labels against `y_pred_l2`
# (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_l2)

array([[13830,  8293,  1092,  3914],
       [   28,    34,     4,    25],
       [    9,    19,     2,    11],
       [    7,     9,     1,     6]])

In [51]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out labels against `y_pred_l1`
# (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_l1)

array([[15012,  2175,  4532,  5410],
       [   31,     9,    17,    34],
       [    9,     7,    11,    14],
       [    8,     4,     3,     8]])

In [52]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out labels against `y_pred_v2`
# (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_v2)

array([[13830,  8293,  1092,  3914],
       [   28,    34,     4,    25],
       [    9,    19,     2,    11],
       [    7,     9,     1,     6]])

In [38]:
# Refit the L1 and L2 saga models with a larger iteration budget (max_iter=500);
# everything except the penalty is shared between the two.
saga_settings = dict(solver='saga', class_weight='balanced', max_iter=500)
l1 = LogisticRegression(penalty='l1', **saga_settings).fit(X_train, y_train)
l2 = LogisticRegression(penalty='l2', **saga_settings).fit(X_train, y_train)





In [96]:
# Build a report for both fitted models on the hold-out split via the project
# helper `classification.model_report`.
# NOTE(review): the variable is called `report_train_df` although the test
# split and data_type='test' are passed; the name is left as is because other
# cells may reference it.
report_train_df = classification.model_report(X=X_test, y=y_test, models={'L1': l1, 'L2': l2}, data_type='test')
report_train_df

Unnamed: 0,DataType,Model Name,precision_micro,precision_macro,precision_weighted,recall_micro,recall_macro,recall_weighted,F1_micro,F1_macro,F1_weighted,Accuracy,auc_class_0,auc_class_1,auc_class_2,auc_class_3
0,test,L1,0.547464,0.251412,0.991148,0.547464,0.34244,0.547464,0.547464,0.181339,0.704515,0.547464,0.669989,0.518418,0.634174,0.672911
1,test,L2,0.563627,0.25208,0.991182,0.563627,0.343035,0.563627,0.563627,0.185888,0.71768,0.563627,0.670101,0.616508,0.615056,0.647121


In [7]:
# Number of rows per distinct score value, keyed in order of first appearance.
score_occurancies = {label: int((y == label).sum()) for label in y.unique()}

In [8]:
# Show the per-score row counts.
score_occurancies

{0: 135725, 1: 413, 2: 158, 3: 120}

In [9]:
# Number of rows whose score is greater than zero.
non_null = int((y > 0).sum())
non_null

691

In [10]:
def binary_weights(x):
    """Return the sample weight for a single score value.

    A positive score gets 100 / non_null; any other score gets
    100 / score_occurancies[0]. Both totals are read from module-level names.
    """
    group_size = non_null if x > 0 else score_occurancies[0]
    return 100/group_size

# One weight per row of `y`.
y_weigths_binary = y.apply(binary_weights)

In [11]:
# Per-row weight of 0.25 divided by the number of rows sharing that row's score.
y_weigths = y.map(lambda score: 0.25/score_occurancies[score])

In [103]:


# `y_weigths_binary` holds one weight per row of `y`, whereas `y_test` is only a
# subset of those rows. Select the weights by the hold-out index so that
# `sample_weight` has the same length and order as `y_test`; scikit-learn
# rejects inputs of inconsistent length.
test_weights = y_weigths_binary.loc[y_test.index]

print(f'l2 recall - {recall_score(y_test, y_pred_l2, average="macro", sample_weight=test_weights)}')
print(f'l1 recall - {recall_score(y_test, y_pred_l1, average="macro", sample_weight=test_weights)}')
print(f'l2 precision - {precision_score(y_test, y_pred_l2, average="macro", sample_weight=test_weights)}')
print(f'l1 precision - {precision_score(y_test, y_pred_l1, average="macro", sample_weight=test_weights)}')
# print(recall_score(y_test, y_pred_l2, average='macro', sample_weight=y_weigths))

l2 recall - 0.34303509448298697
l1 recall - 0.3424403948118258
l2 precision - 0.33537629047132866
l1 precision - 0.3518356495523185


In [40]:
# Hold-out predictions from the current `l1` and `l2` models.
l1_pred, l2_pred = (model.predict(X_test) for model in (l1, l2))

In [41]:
# Confusion matrix of the hold-out labels against `l1_pred`.
confusion_matrix(y_test, l1_pred)

array([[13402,  1209,  7810,  4708],
       [   25,    11,    30,    25],
       [    8,     1,    17,    15],
       [    4,     2,     9,     8]])

In [42]:
# Confusion matrix of the hold-out labels against `l2_pred`.
confusion_matrix(y_test, l2_pred)

array([[15002,  1854,  4807,  5466],
       [   31,    13,    18,    29],
       [    9,     3,    13,    16],
       [    8,     1,     5,     9]])

#### Pronalaženje optimalnog hiperparametra C

In [55]:
# Candidate values of the logistic-regression hyperparameter C.
# NOTE(review): this rebinds the global `C` that the `classifiers` dictionary
# was keyed with; re-running the multi-dataset sweep after this cell will look
# up C values that dictionary does not contain.
C = [0.001, 0.01, 0.1, 1, 10, 100]

skf = StratifiedKFold(n_splits=10)

regularization_type = 'l2'

# One row per (fold, C). The second column is 'recall'; the original literal
# 'met''recall' was two adjacent strings that concatenate to 'metrecall'.
LogRegRFP = pd.DataFrame(columns=['C', 'recall', 'precision', 'f-score'])


# Multiclass strategy and solver paired with each penalty type.
if regularization_type == 'l2':
    multiclass_approach = 'multinomial'
    solver = 'saga'
else:
    multiclass_approach = 'ovr'
    solver = 'liblinear'

for train_index, test_index in skf.split(X, y):
    print('Started an itteration')
    X_train, X_test = X[train_index], X[test_index]
    # The splitter yields row positions, so select by position rather than by
    # index label.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    for c in C:
        clf = LogisticRegression(
            penalty=regularization_type,
            C=c,
            multi_class=multiclass_approach,
            solver=solver,
            max_iter=192,
            class_weight='balanced'
        ).fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        # Macro-averaged scores for this fold and this C.
        LogRegRFP.loc[len(LogRegRFP)] = [
            c,
            recall_score(y_test, y_pred, average='macro'),
            precision_score(y_test, y_pred, average='macro'),
            f1_score(y_test, y_pred, average='macro')
        ]
        



Started an itteration
Started an itteration
Started an itteration
Started an itteration
Started an itteration
Started an itteration
Started an itteration
Started an itteration
Started an itteration
Started an itteration


In [15]:
# NOTE(review): `scores` is not assigned anywhere in the visible part of this
# notebook; this cell presumably relies on state from a deleted or earlier
# cell — confirm it survives Restart & Run All.
scores

[array([0.99490544, 0.99494191, 0.99494191, 0.99494191, 0.99494191]),
 array([0.99490544, 0.99494191, 0.99494191, 0.99494191, 0.99494191]),
 array([0.99490544, 0.99468534, 0.99494191, 0.99494191, 0.99494191]),
 array([0.99490544, 0.99494191, 0.99494191, 0.99494191, 0.99494191]),
 array([0.99490544, 0.99494191, 0.99494191, 0.99494191, 0.99494191])]

In [37]:
# For every fold, print how many test rows have a score of 3.
for train_index, test_index in skf.split(X, y):
    is_score_3 = y[test_index] == 3
    print(int(is_score_3.sum()))
    

(136416,)
(136416,)
(136416,)
(136416,)
(136416,)
(136416,)
(136416,)
(136416,)
(136416,)
(136416,)


In [56]:
# Show the per-fold, per-C macro scores collected by the C search loop.
LogRegRFP

Unnamed: 0,C,recall,precision,f-score
0,0.001,0.407131,0.253166,0.183394
1,0.01,0.347145,0.251361,0.183578
2,0.1,0.390198,0.251184,0.183901
3,1.0,0.400297,0.251564,0.179838
4,10.0,0.37093,0.251217,0.153349
5,100.0,0.384927,0.252253,0.179161
6,0.001,0.282037,0.250201,0.181223
7,0.01,0.310253,0.251023,0.176388
8,0.1,0.298129,0.250632,0.151902
9,1.0,0.294432,0.250621,0.187236
