In [1]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

In [2]:
ori_data = pd.read_csv('./data/v2-4.csv', index_col=False)

In [3]:
X = ori_data.drop(['Severity'], axis=1)
y = ori_data['Severity']


In [4]:
def custom_scoring(y_true, y_pred, verbose=False, equal_weighted_f1=False):
    report = classification_report(y_true, y_pred, output_dict=True)
    weighted_beta_f1 = 0
    beta_weights = {
        '1': 0.5,
        '2': 1,
        '3': 1,
        '4': 2,
    }
    total_data_count = report['weighted avg']['support']
    for cl in range(1, 5):
        pr = report[str(cl)]['precision']
        rc = report[str(cl)]['recall']
        beta = beta_weights[str(cl)]
        beta_f1 = ((1+beta**2)*pr*rc)/(pr*(beta**2) + rc)
        if verbose: 
            print(f'beta f1 for level [{cl}]: {beta_f1}, pr: {pr}, rc: {rc}')

        if not equal_weighted_f1:
            support_proportion = report[str(cl)]['support'] / total_data_count
            weighted_beta_f1 += beta_f1 * support_proportion
        else:
            weighted_beta_f1 += beta_f1*0.25

    if verbose and equal_weighted_f1:
        print(f"macro avg for f1: {weighted_beta_f1}")
    return weighted_beta_f1


In [5]:
def cross_valid(X, y, estimator, cv=5, verbose=False, balance_cls=False, equal_weighted_f1=False):
    total_f1 = 0
    X.reset_index()
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    print('Validation data')
    for i, (train_index, valid_index) in enumerate(kf.split(X)):
        x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        if balance_cls:
            x_train, y_train = rus.fit_resample(x_train, y_train)
            if verbose:
                print('After under sampling:')
                print(f'Length of training data: {len(x_train)}, and its distribution among each severity {Counter(y_train)}')

        estimator.fit(x_train, y_train)
        y_valid_pred = estimator.predict(x_valid)
        beta_f1 = custom_scoring(y_valid, y_valid_pred, verbose=False, equal_weighted_f1=equal_weighted_f1)
        print(f'Round {i} beta_f1: {beta_f1}')
        total_f1 += beta_f1
        
    avg_betaf1 = total_f1 / cv
    print(f'average beta f1-score after kfold: {avg_betaf1}')


def test(estimator, x_test, y_test):
    print('Testing data:')
    y_test_pred = estimator.predict(x_test)
    print(classification_report(y_test, y_test_pred))
    beta_f1 = custom_scoring(y_test, y_test_pred, verbose=True)
    print(f'beta f1-score: {beta_f1}')
        
def auc_pr(estimator, x_test, y_test):
    y_scores = estimator.predict_proba(x_test)[:, 1]
    precision = dict()
    recall = dict()
    n_classes = 4
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_scores[:, i])
        plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))
        
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend(loc="best")
    plt.title("precision vs. recall curve")
    plt.show()
        


In [6]:
x_train_valid, x_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [7]:
X = ori_data.drop(['Severity'], axis=1)
y = ori_data['Severity']

In [8]:
lin_clf = svm.LinearSVC()
lin_clf.fit(x_train_valid, y_train_valid)
test(lin_clf, x_test, y_test)



Testing data:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.00      0.00      0.00     13140
           2       0.81      0.15      0.25   1191687
           3       0.17      0.86      0.29    252647
           4       0.12      0.00      0.00     38727

    accuracy                           0.27   1496201
   macro avg       0.28      0.25      0.13   1496201
weighted avg       0.68      0.27      0.25   1496201



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ZeroDivisionError: float division by zero