In [None]:
from sklearn.metrics import classification_report
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
# Shared random under-sampler, used by cross_valid() when balance_cls=True.
# The fixed random_state keeps the resampling reproducible between runs.
rus = RandomUnderSampler(random_state=42)


In [None]:
# Load the dataset from a path relative to the notebook's working directory.
# index_col=False tells pandas not to use the first column as the row index.
ori_data = pd.read_csv('./data/v2-4.csv', index_col=False)


In [None]:
# The 'Severity' column is the target; every remaining column is a feature.
y = ori_data['Severity']
X = ori_data.drop(columns=['Severity'])


### Self-defined scoring function

In [None]:
def _fbeta(precision, recall, beta):
    """Return the F-beta score of one class, or 0.0 when precision and recall are both 0."""
    denominator = precision * (beta ** 2) + recall
    if denominator == 0:
        # F-beta is undefined here (the class was never predicted correctly);
        # score it as 0 instead of raising ZeroDivisionError.
        return 0.0
    return ((1 + beta ** 2) * precision * recall) / denominator


def custom_scoring(y_true, y_pred, verbose=False, equal_weighted_f1=False):
    """Average per-class F-beta scores, using a different beta for each severity level.

    Per-class precision, recall and support are read from
    ``classification_report(..., output_dict=True)``. Severity 1 uses
    beta=0.5 (precision counts more), severities 2 and 3 use beta=1, and
    severity 4 uses beta=2 (recall counts more).

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels, passed straight to ``classification_report``.
        The report is looked up with the keys '1'..'4', so the labels are
        expected to stringify to those values.
    verbose : bool, default False
        Print the F-beta, precision and recall of every class (and the final
        average when ``equal_weighted_f1`` is True).
    equal_weighted_f1 : bool, default False
        If True every class gets the same weight (macro average); otherwise
        each class is weighted by its share of the total support.

    Returns
    -------
    float
        The averaged F-beta score.

    Raises
    ------
    KeyError
        If one of the classes '1'..'4' is missing from the report.
    """
    beta_weights = {
        '1': 0.5,
        '2': 1,
        '3': 1,
        '4': 2,
    }
    report = classification_report(y_true, y_pred, output_dict=True)
    total_data_count = report['weighted avg']['support']

    weighted_beta_f1 = 0
    for cl, beta in beta_weights.items():
        class_report = report[cl]
        pr = class_report['precision']
        rc = class_report['recall']
        beta_f1 = _fbeta(pr, rc, beta)
        if verbose:
            print(f'beta f1 for level [{cl}]: {beta_f1}, pr: {pr}, rc: {rc}')

        if not equal_weighted_f1:
            support_proportion = class_report['support'] / total_data_count
            weighted_beta_f1 += beta_f1 * support_proportion
        else:
            # Equal share per class, derived from the number of classes.
            weighted_beta_f1 += beta_f1 / len(beta_weights)

    if verbose and equal_weighted_f1:
        print(f"macro avg for f1: {weighted_beta_f1}")
    return weighted_beta_f1

### K-fold cross-validation and evaluation functions


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_curve
from collections import Counter
from matplotlib import pyplot as plt

def cross_valid(X, y, estimator, cv=5, verbose=False, balance_cls=False, equal_weighted_f1=False):
    """Run shuffled K-fold cross-validation and report the custom beta-F1 score.

    Parameters
    ----------
    X, y
        Features and labels; both are sliced with ``.iloc``, so they must be
        pandas objects.
    estimator
        Object with ``fit`` and ``predict``. It is fitted in place on every
        fold, so after the call it holds the model of the LAST fold only.
    cv : int, default 5
        Number of folds.
    verbose : bool, default False
        Print the size and class distribution of the training fold after
        under-sampling (only relevant when ``balance_cls`` is True).
    balance_cls : bool, default False
        If True, under-sample each training fold with the module-level ``rus``
        sampler; validation folds are never resampled.
    equal_weighted_f1 : bool, default False
        Forwarded to ``custom_scoring``.

    Returns
    -------
    float
        Mean beta-F1 over the ``cv`` folds (also printed).
    """
    # Rows are selected positionally with .iloc below, so the index of X/y
    # does not need to be reset first.
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    fold_scores = []
    print('Validation data')
    for i, (train_index, valid_index) in enumerate(kf.split(X)):
        x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if balance_cls:
            x_train, y_train = rus.fit_resample(x_train, y_train)
            if verbose:
                print('After under sampling:')
                print(f'Length of training data: {len(x_train)}, and its distribution among each severity {Counter(y_train)}')

        estimator.fit(x_train, y_train)
        y_valid_pred = estimator.predict(x_valid)
        beta_f1 = custom_scoring(y_valid, y_valid_pred, verbose=False, equal_weighted_f1=equal_weighted_f1)
        print(f'Round {i} beta_f1: {beta_f1}')
        fold_scores.append(beta_f1)

    avg_betaf1 = sum(fold_scores) / len(fold_scores)
    print(f'average beta f1-score after kfold: {avg_betaf1}')
    return avg_betaf1


def test(estimator, x_test, y_test):
    """Print the evaluation of an already fitted estimator on held-out data.

    Prints the scikit-learn classification report followed by the custom
    beta-F1 score (including its verbose per-class breakdown). Nothing is
    returned.
    """
    print('Testing data:')
    predictions = estimator.predict(x_test)
    full_report = classification_report(y_test, predictions)
    print(full_report)
    # verbose=True makes custom_scoring print the per-class breakdown first.
    score = custom_scoring(y_test, predictions, verbose=True)
    print(f'beta f1-score: {score}')
        
def auc_pr(estimator, x_test, y_test):
    """Plot one-vs-rest precision/recall curves, one per class.

    Parameters
    ----------
    estimator
        Fitted classifier exposing ``predict_proba``. Its ``classes_``
        attribute, when present, supplies the class labels; otherwise the
        column indices of the probability matrix are used.
    x_test
        Features passed to ``predict_proba``.
    y_test
        Either a 1-D sequence of class labels (binarised here, one class at
        a time) or a 2-D indicator matrix with one column per class.
    """
    import numpy as np  # local import: the notebook has no top-level numpy import

    # Keep the full (n_samples, n_classes) probability matrix: each class
    # needs its own score column.
    y_scores = np.asarray(estimator.predict_proba(x_test))
    y_true = np.asarray(y_test)

    classes = getattr(estimator, 'classes_', None)
    if classes is None:
        classes = range(y_scores.shape[1])

    precision = dict()
    recall = dict()
    for col, cls in enumerate(classes):
        if y_true.ndim == 2:
            # Already one-hot / indicator encoded.
            y_binary = y_true[:, col]
        else:
            # precision_recall_curve is binary-only: current class vs. the rest.
            y_binary = (y_true == cls).astype(int)
        precision[cls], recall[cls], _ = precision_recall_curve(y_binary, y_scores[:, col])
        plt.plot(recall[cls], precision[cls], lw=2, label='class {}'.format(cls))

    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend(loc="best")
    plt.title("precision vs. recall curve")
    plt.show()
        


## Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the data as a stratified test set. The fixed random_state keeps
# the split, and therefore every reported test score, reproducible across kernel
# restarts (the K-fold splitter and the under-sampler are seeded the same way).
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


## Direct Testing

### AdaBoost


In [None]:
# Baseline: fit AdaBoost on the whole training split and score it on the held-out
# test set. The estimator is seeded so that repeated runs give the same model.
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_clf.fit(x_train_valid, y_train_valid)
test(ada_clf, x_test, y_test)

## KFold cross validation


### AdaBoost

AdaBoost without class balancing (no under-sampling)

In [None]:
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
cross_valid(x_train_valid, y_train_valid, ada_clf, verbose=True, balance_cls=False, equal_weighted_f1=True)
# cross_valid fits ada_clf in place, so at this point it only holds the model of
# the last fold (trained on a subset of the training split). Refit on the full
# training split before scoring the held-out test set.
ada_clf.fit(x_train_valid, y_train_valid)
test(ada_clf, x_test, y_test)

AdaBoost with class balancing (random under-sampling of the training data)

In [None]:
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
cross_valid(x_train_valid, y_train_valid, ada_clf, verbose=True, balance_cls=True, equal_weighted_f1=True)
# cross_valid fits ada_clf in place, so at this point it only holds the model of
# the last fold. Refit on the full training split, under-sampled the same way as
# the training folds were, before scoring the untouched held-out test set.
ada_clf.fit(*rus.fit_resample(x_train_valid, y_train_valid))
test(ada_clf, x_test, y_test)
