In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
# Shared random under-sampler with a fixed seed; cross_valid() calls
# rus.fit_resample() on the training folds when balance_cls=True.
rus = RandomUnderSampler(random_state=42)

In [2]:
# Load the dataset from CSV. index_col=False tells pandas not to use the
# first column as the index.
ori_data = pd.read_csv('./data/v2-4.csv', index_col=False)

In [3]:
# Columns handled as numerical; every other column of the frame is
# classified as categorical below.
numerical_features = [
    'Distance(mi)',
    'Temperature(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'elapsed_time',
    'Start_Lat',
    'Start_Lng',
]

# NOTE(review): this runs before 'Severity' is dropped from the frame, so the
# target column ends up in this list as well -- confirm that is intended.
categorical_features = [
    column for column in ori_data.columns if column not in numerical_features
]

In [4]:
# Separate the target label ('Severity') from the predictor columns.
y = ori_data['Severity']
X = ori_data.drop(columns=['Severity'])

### Self-defined scoring function

In [5]:
def custom_scoring(y_true, y_pred, verbose=False, equal_weighted_f1=False):
    """Average a per-class F-beta score over severity levels 1 to 4.

    Each level uses its own beta: 0.5 for level 1 (precision counts more),
    1 for levels 2 and 3 (plain F1) and 2 for level 4 (recall counts more).

    Parameters
    ----------
    y_true : array-like
        Ground-truth severity labels.
    y_pred : array-like
        Predicted severity labels.
    verbose : bool, default False
        Print the F-beta, precision and recall of every level.
    equal_weighted_f1 : bool, default False
        If False, each level's F-beta is weighted by its share of the
        support (weighted average). If True, every level gets weight 0.25
        (macro average).

    Returns
    -------
    float
        The averaged F-beta score.

    Raises
    ------
    KeyError
        If one of the levels '1'..'4' is missing from the classification
        report.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    weighted_beta_f1 = 0
    beta_weights = {
        '1': 0.5,
        '2': 1,
        '3': 1,
        '4': 2,
    }
    total_data_count = report['weighted avg']['support']
    for cl in range(1, 5):
        pr = report[str(cl)]['precision']
        rc = report[str(cl)]['recall']
        beta = beta_weights[str(cl)]
        denominator = pr * (beta ** 2) + rc
        # A level that is never predicted correctly has precision == recall == 0,
        # which makes the denominator 0; score it as 0 instead of dividing by zero.
        if denominator > 0:
            beta_f1 = ((1 + beta ** 2) * pr * rc) / denominator
        else:
            beta_f1 = 0.0
        if verbose:
            print(f'beta f1 for level [{cl}]: {beta_f1}, pr: {pr}, rc: {rc}')

        if not equal_weighted_f1:
            support_proportion = report[str(cl)]['support'] / total_data_count
            weighted_beta_f1 += beta_f1 * support_proportion
        else:
            weighted_beta_f1 += beta_f1 * 0.25

    if verbose and equal_weighted_f1:
        print(f"macro avg for f1: {weighted_beta_f1}")
    return weighted_beta_f1

### K-fold validation, evaluation function

In [6]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_curve, auc

def cross_valid(X, y, estimator, cv=5, verbose=False, balance_cls=False, equal_weighted_f1=False):
    """Run shuffled K-fold validation and report the custom beta-F1 per fold.

    The estimator is fitted in place on every fold, so after the call it
    holds the model trained on the last fold's training split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels aligned with ``X``.
    estimator : object
        Classifier exposing ``fit`` and ``predict``.
    cv : int, default 5
        Number of folds.
    verbose : bool, default False
        When ``balance_cls`` is set, print the size and class distribution
        of the under-sampled training fold.
    balance_cls : bool, default False
        Under-sample the training fold with the module-level ``rus`` sampler
        before fitting. The validation fold is never resampled.
    equal_weighted_f1 : bool, default False
        Forwarded to ``custom_scoring`` (macro instead of support-weighted
        averaging).

    Returns
    -------
    float
        Mean beta-F1 over the ``cv`` folds (also printed).
    """
    # The original called X.reset_index() here and discarded the result; it was
    # a no-op and is unnecessary because folds are taken positionally via .iloc.
    total_f1 = 0
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    print('Validation data')
    for i, (train_index, valid_index) in enumerate(kf.split(X)):
        x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if balance_cls:
            x_train, y_train = rus.fit_resample(x_train, y_train)
            if verbose:
                print('After under sampling:')
                print(f'Length of training data: {len(x_train)}, and its distribution among each severity {Counter(y_train)}')

        estimator.fit(x_train, y_train)
        y_valid_pred = estimator.predict(x_valid)
        beta_f1 = custom_scoring(y_valid, y_valid_pred, verbose=False, equal_weighted_f1=equal_weighted_f1)
        print(f'Round {i} beta_f1: {beta_f1}')
        total_f1 += beta_f1

    avg_betaf1 = total_f1 / cv
    print(f'average beta f1-score after kfold: {avg_betaf1}')
    # Return the average so callers can compare models programmatically.
    return avg_betaf1


def test(estimator, x_test, y_test):
    """Print the hold-out evaluation of an already fitted estimator.

    Outputs the scikit-learn classification report, the per-level beta-F1
    details (``custom_scoring`` is called with ``verbose=True``) and the
    support-weighted beta-F1 score. Nothing is returned.
    """
    print('Testing data:')
    predictions = estimator.predict(x_test)
    report_text = classification_report(y_test, predictions)
    print(report_text)
    score = custom_scoring(y_test, predictions, verbose=True)
    print(f'beta f1-score: {score}')
        
def auc_pr(estimator, x_test, y_test):
    """Plot one-vs-rest precision-recall curves, one per class.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict_proba`` and ``classes_``.
    x_test : array-like
        Test features.
    y_test : array-like
        Either a 1-D vector of class labels, or a 2-D indicator matrix whose
        columns follow the order of ``estimator.classes_``.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    # Keep the full (n_samples, n_classes) probability matrix. The original
    # sliced out column 1 and then indexed the 1-D result with [:, i].
    y_scores = np.asarray(estimator.predict_proba(x_test))
    classes = np.asarray(estimator.classes_)

    y_true = np.asarray(y_test)
    if y_true.ndim == 1:
        # Turn the label vector into one 0/1 indicator column per class.
        y_true = (y_true[:, None] == classes[None, :]).astype(int)

    precision = dict()
    recall = dict()
    for i, cls in enumerate(classes):
        precision[i], recall[i], _ = precision_recall_curve(y_true[:, i], y_scores[:, i])
        plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(cls))

    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend(loc="best")
    plt.title("precision vs. recall curve")
    plt.show()
        

## Model

In [7]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [8]:
# Hold out 20% of the data as a stratified test set. The seed is fixed so the
# split (and every score computed on it) is reproducible across kernel restarts.
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Direct Testing

### Decision Tree

In [9]:
# Baseline: an unconstrained decision tree fitted on the full train/valid split.
# random_state is fixed so the reported scores are reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train_valid, y_train_valid)
test(dt, x_test, y_test)

Testing data:
              precision    recall  f1-score   support

           1       0.37      0.41      0.39     13140
           2       0.92      0.92      0.92   1191687
           3       0.71      0.72      0.71    252647
           4       0.34      0.36      0.35     38727

    accuracy                           0.86   1496201
   macro avg       0.58      0.60      0.59   1496201
weighted avg       0.87      0.86      0.86   1496201

beta f1 for level [1]: 0.37362990716922045, pr: 0.36617113105432625, rc: 0.4067732115677321
beta f1 for level [2]: 0.9191545171232114, pr: 0.9214750780129881, rc: 0.9168456146622393
beta f1 for level [3]: 0.7128401850929695, pr: 0.7089679040959667, rc: 0.7167549980803255
beta f1 for level [4]: 0.3518347919545524, pr: 0.33963242922780496, rc: 0.35502362692695016
beta f1-score: 0.8648413055855593


### Random Forest

In [10]:
# Random forest with 50 trees of depth at most 15; seeded for reproducibility.
rfc = RandomForestClassifier(n_estimators=50, max_depth=15, random_state=42)
rfc.fit(x_train_valid, y_train_valid)
test(rfc, x_test, y_test)

Testing data:
              precision    recall  f1-score   support

           1       0.90      0.10      0.18     13140
           2       0.87      0.98      0.92   1191687
           3       0.82      0.48      0.61    252647
           4       0.91      0.01      0.03     38727

    accuracy                           0.86   1496201
   macro avg       0.88      0.39      0.43   1496201
weighted avg       0.86      0.86      0.84   1496201

beta f1 for level [1]: 0.33936170212765954, pr: 0.9017667844522969, rc: 0.09710806697108067
beta f1 for level [2]: 0.919490733990866, pr: 0.8667578447985749, rc: 0.9790557419859409
beta f1 for level [3]: 0.6069696931887656, pr: 0.8212481767597645, rc: 0.4813712412971458
beta f1 for level [4]: 0.018035917517023204, pr: 0.9121951219512195, rc: 0.014486017507165544
beta f1-score: 0.8382910559049715


### OneVsRestClassifier with decision tree

In [11]:
# One-vs-rest wrapper around shallow (depth <= 5) decision trees; the base
# estimator is seeded so the result is reproducible.
clf = OneVsRestClassifier(DecisionTreeClassifier(max_depth=5, random_state=42)).fit(x_train_valid, y_train_valid)
test(clf, x_test, y_test)

Testing data:
              precision    recall  f1-score   support

           1       0.73      0.03      0.06     13140
           2       0.85      0.97      0.91   1191687
           3       0.74      0.40      0.52    252647
           4       0.74      0.01      0.01     38727

    accuracy                           0.84   1496201
   macro avg       0.77      0.35      0.38   1496201
weighted avg       0.83      0.84      0.81   1496201

beta f1 for level [1]: 0.1342412451361868, pr: 0.7263157894736842, rc: 0.031506849315068496
beta f1 for level [2]: 0.9087880746137068, pr: 0.8534444640962151, rc: 0.9718071943387819
beta f1 for level [3]: 0.520455376713993, pr: 0.7355233289475587, rc: 0.4027041682663954
beta f1 for level [4]: 0.008276812686389313, pr: 0.744927536231884, rc: 0.0066361969685232525
beta f1-score: 0.8131039144599154


### OneVsRestClassifier with RandomForest

In [12]:
# One-vs-rest wrapper around random forests (depth <= 15); the base estimator
# is seeded so the result is reproducible.
clf = OneVsRestClassifier(RandomForestClassifier(max_depth=15, random_state=42)).fit(x_train_valid, y_train_valid)
test(clf, x_test, y_test)

Testing data:
              precision    recall  f1-score   support

           1       0.84      0.15      0.25     13140
           2       0.87      0.98      0.92   1191687
           3       0.82      0.49      0.61    252647
           4       0.88      0.03      0.06     38727

    accuracy                           0.86   1496201
   macro avg       0.85      0.41      0.46   1496201
weighted avg       0.86      0.86      0.84   1496201

beta f1 for level [1]: 0.43670156139105754, pr: 0.837515950659294, rc: 0.14984779299847792
beta f1 for level [2]: 0.9205968645781485, pr: 0.86911590219355, rc: 0.9785606455386355
beta f1 for level [3]: 0.6142178027392642, pr: 0.8215675743772336, rc: 0.490439229438703
beta f1 for level [4]: 0.03617458559291106, pr: 0.8835027365129007, rc: 0.0291786092390322
beta f1-score: 0.8417203254510202


## **KFold cross validation**

### Decision Tree

Decision tree without class balancing

In [13]:
# Seeded so the fold scores and the test report are reproducible.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True, balance_cls=False, equal_weighted_f1=True)
# NOTE(review): cross_valid fits `dt` in place, so this evaluates the model
# trained on the last fold's training split, not on all of x_train_valid.
test(dt, x_test, y_test)

Validation data
Round 0 beta_f1: 0.580193543669992
Round 1 beta_f1: 0.5772234555145531
Round 2 beta_f1: 0.5754072620427306
Round 3 beta_f1: 0.5810758784312391
Round 4 beta_f1: 0.5783983145866527
average beta f1-score after kfold: 0.5784596908490335
Testing data:
              precision    recall  f1-score   support

           1       0.35      0.39      0.37     13140
           2       0.92      0.91      0.92   1191687
           3       0.70      0.70      0.70    252647
           4       0.33      0.34      0.33     38727

    accuracy                           0.86   1496201
   macro avg       0.57      0.59      0.58   1496201
weighted avg       0.86      0.86      0.86   1496201

beta f1 for level [1]: 0.3597785977859779, pr: 0.3525785905074995, rc: 0.3917808219178082
beta f1 for level [2]: 0.9163375402627796, pr: 0.9186047786382239, rc: 0.9140814660225378
beta f1 for level [3]: 0.7008308667285041, pr: 0.6974123604665747, rc: 0.7042830510554252
beta f1 for level [4]: 0.3382203

Decision tree with class balancing (random under-sampling)

In [14]:
# Seeded so the fold scores and the test report are reproducible.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True, balance_cls=True, equal_weighted_f1=True)
# NOTE(review): cross_valid fits `dt` in place, so this evaluates the model
# trained on the last fold's under-sampled training split.
test(dt, x_test, y_test)

Validation data
After under sampling:
Length of training data: 167888, and its distribution among each severity Counter({1: 41972, 2: 41972, 3: 41972, 4: 41972})
Round 0 beta_f1: 0.4256565240979158
After under sampling:
Length of training data: 168796, and its distribution among each severity Counter({1: 42199, 2: 42199, 3: 42199, 4: 42199})
Round 1 beta_f1: 0.4284265706449773
After under sampling:
Length of training data: 168524, and its distribution among each severity Counter({1: 42131, 2: 42131, 3: 42131, 4: 42131})
Round 2 beta_f1: 0.4270808646509177
After under sampling:
Length of training data: 168052, and its distribution among each severity Counter({1: 42013, 2: 42013, 3: 42013, 4: 42013})
Round 3 beta_f1: 0.42556061318497174
After under sampling:
Length of training data: 167684, and its distribution among each severity Counter({1: 41921, 2: 41921, 3: 41921, 4: 41921})
Round 4 beta_f1: 0.42801343045078577
average beta f1-score after kfold: 0.42694760060591364
Testing data:
   

### OneVsRestClassifier with decision tree as base model

Without class balancing

In [15]:
# Base estimator is seeded so the fold scores and the test report are reproducible.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=False, equal_weighted_f1=True)
# NOTE(review): cross_valid fits `clf` in place, so this evaluates the model
# trained on the last fold's training split, not on all of x_train_valid.
test(clf, x_test, y_test)

Validation data
Round 0 beta_f1: 0.6176041931450036
Round 1 beta_f1: 0.6169444525288569
Round 2 beta_f1: 0.6157151080500922
Round 3 beta_f1: 0.6205029830756664
Round 4 beta_f1: 0.6157815213023662
average beta f1-score after kfold: 0.6173096516203971
Testing data:
              precision    recall  f1-score   support

           1       0.68      0.29      0.41     13140
           2       0.95      0.87      0.90   1191687
           3       0.70      0.70      0.70    252647
           4       0.13      0.49      0.20     38727

    accuracy                           0.82   1496201
   macro avg       0.62      0.59      0.56   1496201
weighted avg       0.88      0.82      0.85   1496201

beta f1 for level [1]: 0.5400716364450413, pr: 0.683306779961055, rc: 0.2937595129375951
beta f1 for level [2]: 0.9045621401744798, pr: 0.9451181486839145, rc: 0.8673435222503896
beta f1 for level [3]: 0.7017144036908515, pr: 0.7038970866439652, rc: 0.6995452152608185
beta f1 for level [4]: 0.3143277

With class balancing (random under-sampling)

In [16]:
# Base estimator is seeded so the fold scores and the test report are reproducible.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=True, equal_weighted_f1=True)
# NOTE(review): cross_valid fits `clf` in place, so this evaluates the model
# trained on the last fold's under-sampled training split.
test(clf, x_test, y_test)

Validation data
After under sampling:
Length of training data: 167888, and its distribution among each severity Counter({1: 41972, 2: 41972, 3: 41972, 4: 41972})
Round 0 beta_f1: 0.39169999023593943
After under sampling:
Length of training data: 168796, and its distribution among each severity Counter({1: 42199, 2: 42199, 3: 42199, 4: 42199})
Round 1 beta_f1: 0.3923815858009088
After under sampling:
Length of training data: 168524, and its distribution among each severity Counter({1: 42131, 2: 42131, 3: 42131, 4: 42131})
Round 2 beta_f1: 0.3929587762643762
After under sampling:
Length of training data: 168052, and its distribution among each severity Counter({1: 42013, 2: 42013, 3: 42013, 4: 42013})
Round 3 beta_f1: 0.39395352954115154
After under sampling:
Length of training data: 167684, and its distribution among each severity Counter({1: 41921, 2: 41921, 3: 41921, 4: 41921})
Round 4 beta_f1: 0.39329184233732095
average beta f1-score after kfold: 0.39285714483593936
Testing data:
  

### OneVsRestClassifier with Random Forest as base model

Without class balancing

In [17]:
# Base estimator is seeded so the fold scores and the test report are reproducible.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=False, equal_weighted_f1=True)
# NOTE(review): cross_valid fits `clf` in place, so this evaluates the model
# trained on the last fold's training split, not on all of x_train_valid.
test(clf, x_test, y_test)

Validation data
Round 0 beta_f1: 0.6433005435587963
Round 1 beta_f1: 0.6412018500555894
Round 2 beta_f1: 0.6431255012023854
Round 3 beta_f1: 0.6447565281213
Round 4 beta_f1: 0.6448057622574268
average beta f1-score after kfold: 0.6434380370390996
Testing data:
              precision    recall  f1-score   support

           1       0.87      0.26      0.40     13140
           2       0.91      0.97      0.94   1191687
           3       0.84      0.69      0.76    252647
           4       0.55      0.25      0.35     38727

    accuracy                           0.90   1496201
   macro avg       0.79      0.54      0.61   1496201
weighted avg       0.89      0.90      0.89   1496201

beta f1 for level [1]: 0.5939472961844633, pr: 0.86528367908023, rc: 0.2634703196347032
beta f1 for level [2]: 0.9382141328529187, pr: 0.9106120808188983, rc: 0.967541812573268
beta f1 for level [3]: 0.7579423373613712, pr: 0.8388326945429047, rc: 0.6912807197393992
beta f1 for level [4]: 0.282578613074

With class balancing (random under-sampling)

In [18]:
# Base estimator is seeded so the fold scores and the test report are reproducible.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=True, equal_weighted_f1=True)
# NOTE(review): cross_valid fits `clf` in place, so this evaluates the model
# trained on the last fold's under-sampled training split.
test(clf, x_test, y_test)

Validation data
After under sampling:
Length of training data: 167888, and its distribution among each severity Counter({1: 41972, 2: 41972, 3: 41972, 4: 41972})
Round 0 beta_f1: 0.46595173248051747
After under sampling:
Length of training data: 168796, and its distribution among each severity Counter({1: 42199, 2: 42199, 3: 42199, 4: 42199})
Round 1 beta_f1: 0.4665954173794879
After under sampling:
Length of training data: 168524, and its distribution among each severity Counter({1: 42131, 2: 42131, 3: 42131, 4: 42131})
Round 2 beta_f1: 0.4655583321854403
After under sampling:
Length of training data: 168052, and its distribution among each severity Counter({1: 42013, 2: 42013, 3: 42013, 4: 42013})
Round 3 beta_f1: 0.46462977550085216
After under sampling:
Length of training data: 167684, and its distribution among each severity Counter({1: 41921, 2: 41921, 3: 41921, 4: 41921})
Round 4 beta_f1: 0.4637156881971976
average beta f1-score after kfold: 0.4652901891486991
Testing data:
    