In [6]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier
rus = RandomUnderSampler(random_state=42)


In [7]:
def custom_scoring(y_true, y_pred, verbose=False):
    report = classification_report(y_true, y_pred, output_dict=True)
    total_f1 = 0
    beta_weights = {
        '1': 0.5,
        '2': 1,
        '3': 1,
        '4': 2,
    }
    for cl in range(1, 5):
        pr = report[str(cl)]['precision']
        rc = report[str(cl)]['recall']
        beta = beta_weights[str(cl)]
        beta_f1 = ((1+beta**2)*pr*rc)/(pr*(beta**2) + rc)
        if verbose: 
            print(f'beta f1 for level [{cl}]: {beta_f1}, pr: {pr}, rc: {rc}')
        total_f1 += beta_f1

    avg_f1 = total_f1/4
    if verbose:
        print(f"macro avg for f1: {avg_f1}")
    return avg_f1

def cross_valid(X, y, estimator, cv=5, verbose=False):
    round = 1
    total_f1 = 0
    x_train_valid, x_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    x_train_valid.reset_index()
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    print('Validation data')
    for i, (train_index, valid_index) in enumerate(kf.split(x_train_valid)):
        x_train, x_valid = x_train_valid.iloc[train_index], x_train_valid.iloc[valid_index]
        y_train, y_valid = y_train_valid.iloc[train_index], y_train_valid.iloc[valid_index]
        
        x_train_balan, y_train_balan = rus.fit_resample(x_train, y_train)
        # if verbose:
        #     print('After under sampling:')
        #     print(f'Length of training data: {len(x_train_balan)}, and its distribution among each severity {Counter(y_train_balan)}')

        estimator.fit(x_train_balan, y_train_balan)
        y_valid_pred = estimator.predict(x_valid)
        beta_f1 = custom_scoring(y_valid, y_valid_pred, verbose=False)
        print(f'Round {round} beta_f1: {beta_f1}')
        total_f1 += beta_f1
        round += 1
        
    avg_betaf1 = total_f1 / cv
    print(f'average beta f1-score: {avg_betaf1}')

    print('Testing data')
    y_test_pred = estimator.predict(x_test)
    print(classification_report(y_test, y_test_pred))
    beta_f1 = custom_scoring(y_test, y_test_pred, verbose=True)
    print(f'beta f1-score: {beta_f1}')

In [8]:
ori_data = pd.read_csv('./data/v2-4.csv', index_col=False)


In [9]:
X = ori_data.drop(['Severity'], axis=1)
y = ori_data['Severity']


In [10]:
clf = AdaBoostClassifier(n_estimators=100)
cross_valid(X, y, clf, verbose=True)

Validation data
Round 1 beta_f1: 0.36220969886891213
Round 2 beta_f1: 0.3665334695041274
Round 3 beta_f1: 0.36813004699879154
Round 4 beta_f1: 0.3656167302070351
Round 5 beta_f1: 0.37081120791729877
average beta f1-score: 0.3666602306992329
Testing data
              precision    recall  f1-score   support

           1       0.06      0.87      0.11     13140
           2       0.93      0.45      0.61   1191687
           3       0.47      0.64      0.54    252647
           4       0.07      0.71      0.13     38727

    accuracy                           0.50   1496201
   macro avg       0.38      0.67      0.35   1496201
weighted avg       0.83      0.50      0.58   1496201

beta f1 for level [1]: 0.07524203825023926, pr: 0.061257654516299144, rc: 0.8663622526636225
beta f1 for level [2]: 0.610540735579274, pr: 0.9332782065734345, rc: 0.4536602312519982
beta f1 for level [3]: 0.5442799786675253, pr: 0.4722271532795735, rc: 0.642279544186157
beta f1 for level [4]: 0.254640256797443