In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

In [2]:
ori_data = pd.read_csv('./data/v2-3.csv', index_col=False)
numerical_features = [
    'Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
    'Wind_Speed(mph)', 'elapsed_time'
]

info_cols = ['Severity', 'Start_Lat', 'Start_Lng', 'Weather_Timestamp', 'Hour', 'Day', 'Minute']
categorical_features = [f for f in list(ori_data.columns) if (f not in numerical_features and f not in info_cols)]

In [7]:
# X = ori_data.drop(info_cols, axis=1)
X = ori_data.drop(['Severity'], axis=1)
y = ori_data['Severity']

In [5]:
def custom_scoring(y_true, y_pred, verbose=False):
    report = classification_report(y_true, y_pred, output_dict=True)
    weighted_beta_f1 = 0
    beta_weights = {
        '1': 0.5,
        '2': 1,
        '3': 1,
        '4': 2,
    }
    print(report)
    total_data_count = report['weighted avg']['support']
    for cl in range(1, 5):
        pr = report[str(cl)]['precision']
        rc = report[str(cl)]['recall']
        beta = beta_weights[str(cl)]
        beta_f1 = ((1+beta**2)*pr*rc)/(pr*(beta**2) + rc)
        if verbose: 
            print(f'beta f1 for level [{cl}]: {beta_f1}, pr: {pr}, rc: {rc}')

        support_proportion = report[str(cl)]['support'] / total_data_count
        weighted_beta_f1 += beta_f1 * support_proportion

    if verbose:
        print(f"macro avg for f1: {weighted_beta_f1}")
    return weighted_beta_f1

In [6]:
from sklearn.model_selection import KFold

def cross_valid(X, y, estimator, cv=5, verbose=False):
    round = 1
    total_f1 = 0
    x_train_valid, x_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    x_train_valid.reset_index()
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    print('Validation data')
    for i, (train_index, valid_index) in enumerate(kf.split(x_train_valid)):
        x_train, x_valid = x_train_valid.iloc[train_index], x_train_valid.iloc[valid_index]
        y_train, y_valid = y_train_valid.iloc[train_index], y_train_valid.iloc[valid_index]
        
        x_train_balan, y_train_balan = rus.fit_resample(x_train, y_train)
        if verbose:
            print('After under sampling:')
            print(f'Length of training data: {len(x_train_balan)}, and its distribution among each severity {Counter(y_train_balan)}')

        estimator.fit(x_train_balan, y_train_balan)
        y_valid_pred = estimator.predict(x_valid)
        beta_f1 = custom_scoring(y_valid, y_valid_pred, verbose=False)
        print(f'Round {round} beta_f1: {beta_f1}')
        total_f1 += beta_f1
        round += 1
        
    avg_betaf1 = total_f1 / cv
    print(f'average beta f1-score: {avg_betaf1}')

    print('Testing data')
    y_test_pred = estimator.predict(x_test)
    print(classification_report(y_test, y_test_pred))
    beta_f1 = custom_scoring(y_test, y_test_pred, verbose=True)
    print(f'beta f1-score: {beta_f1}')
        

        

## Imbalance testing data

In [9]:
dt = DecisionTreeClassifier()
cross_valid(X, y, dt, verbose=True)

Validation data
Round 1 beta_f1: 0.4271604031648384
Round 2 beta_f1: 0.427099841295834
Round 3 beta_f1: 0.42763935014333465
Round 4 beta_f1: 0.4265807389799974
Round 5 beta_f1: 0.4289833962211148
average beta f1-score: 0.4274927459610239
Testing data
              precision    recall  f1-score   support

           1       0.09      0.84      0.17     13140
           2       0.93      0.59      0.72   1191687
           3       0.45      0.67      0.54    252647
           4       0.11      0.71      0.19     38727

    accuracy                           0.61   1496201
   macro avg       0.40      0.70      0.40   1496201
weighted avg       0.82      0.61      0.67   1496201

beta f1 for level [1]: 0.11258944452185603, pr: 0.09256389090112704, rc: 0.8363013698630137
beta f1 for level [2]: 0.723507933455832, pr: 0.9339817114367898, rc: 0.5904495056168272
beta f1 for level [3]: 0.5392407985048071, pr: 0.4531206646211535, rc: 0.6657787347563993
beta f1 for level [4]: 0.33637400228050174,

In [40]:
abc = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=15), n_estimators=100)
abc.fit(x_res, y_res)
y_pred = abc.predict(x_res_test)

print(classification_report(y_res_test, y_pred))
custom_scoring(y_res_test, y_pred, verbose=True)

              precision    recall  f1-score   support

           1       0.85      0.89      0.87     13140
           2       0.69      0.53      0.60     13140
           3       0.72      0.74      0.73     13140
           4       0.71      0.82      0.76     13140

    accuracy                           0.75     52560
   macro avg       0.74      0.75      0.74     52560
weighted avg       0.74      0.75      0.74     52560

beta f1 for level [1]: 0.8610980068404293, pr: 0.8543845534995977, rc: 0.8890410958904109
beta f1 for level [2]: 0.6013835775362008, pr: 0.690614822856015, rc: 0.532572298325723
beta f1 for level [3]: 0.7303509362036102, pr: 0.7175589336858339, rc: 0.743607305936073
beta f1 for level [4]: 0.7945699218576894, pr: 0.7107088590870053, rc: 0.8187214611872146
macro avg for f1: 0.7468506106094824


0.7468506106094824