In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
# Shared undersampler used to balance the training folds; the seed is fixed
# so the resampling is repeatable between runs.
RANDOM_SEED = 42
rus = RandomUnderSampler(random_state=RANDOM_SEED)

In [7]:
# Read the full dataset. index_col=False tells pandas not to use the first
# column as the row index.
DATA_PATH = './data/v2-4.csv'
ori_data = pd.read_csv(DATA_PATH, index_col=False)

In [8]:
# Columns handled as numeric inputs.
numerical_features = [
    'Distance(mi)',
    'Temperature(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'elapsed_time',
    'Start_Lat',
    'Start_Lng',
]

# Every other column of ori_data, kept in the frame's column order.
# NOTE(review): this also picks up the 'Severity' target column -- confirm
# that is intended before using the list to select model inputs.
categorical_features = [
    col for col in ori_data.columns if col not in numerical_features
]

In [9]:
# Split the frame into the target (Severity) and the predictor columns.
y = ori_data['Severity']
X = ori_data.drop(columns=['Severity'])

In [16]:
def custom_scoring(y_true, y_pred, verbose=False):
    """Support-weighted average of per-class F-beta scores.

    Each severity level gets its own beta: level 1 uses 0.5 (weights
    precision more), levels 2 and 3 use 1 (plain F1) and level 4 uses 2
    (weights recall more). The per-class scores are then averaged using each
    class's share of the true samples as its weight.

    Parameters
    ----------
    y_true : array-like
        Ground-truth severity labels.
    y_pred : array-like
        Predicted severity labels.
    verbose : bool, default False
        When True, print the per-class score and the final average.

    Returns
    -------
    float
        The support-weighted F-beta score. A class whose precision and recall
        are both 0 contributes 0, and a class missing from the report is
        skipped.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    beta_weights = {
        '1': 0.5,
        '2': 1,
        '3': 1,
        '4': 2,
    }
    total_data_count = report['weighted avg']['support']
    if not total_data_count:
        # Nothing to weight by; avoid dividing by zero below.
        return 0.0

    weighted_beta_f1 = 0
    for cl, beta in beta_weights.items():
        stats = report.get(cl)
        if stats is None:
            # Level absent from both y_true and y_pred: no support, no term.
            continue

        pr = stats['precision']
        rc = stats['recall']
        denominator = pr * (beta**2) + rc
        # Precision and recall are both 0 when the class is never predicted
        # correctly; the F-beta score is defined as 0 in that case.
        beta_f1 = ((1 + beta**2) * pr * rc) / denominator if denominator > 0 else 0.0
        if verbose:
            print(f'beta f1 for level [{cl}]: {beta_f1}, pr: {pr}, rc: {rc}')

        support_proportion = stats['support'] / total_data_count
        weighted_beta_f1 += beta_f1 * support_proportion

    if verbose:
        # The aggregate is weighted by class support, not a macro average.
        print(f"support-weighted avg for beta f1: {weighted_beta_f1}")
    return weighted_beta_f1

In [26]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_curve, auc

def cross_valid(X, y, estimator, cv=5, verbose=False):
    """Run shuffled K-fold validation with undersampled training folds.

    For every fold the training part is balanced with the module-level
    ``rus`` undersampler, ``estimator`` is refitted on it and scored with
    ``custom_scoring`` on the untouched validation part. Only the training
    data is resampled, so validation keeps its original class distribution.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels aligned with ``X``.
    estimator : object
        Classifier exposing ``fit`` and ``predict``. It is refitted in place,
        so on return it holds the fit from the last fold.
    cv : int, default 5
        Number of folds.
    verbose : bool, default False
        When True, print the size and class counts of each resampled fold.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    total_f1 = 0
    # Rows are picked by position below, so the index of X never needs
    # resetting (the previous bare X.reset_index() call had no effect).
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    print('Validation data')
    for i, (train_index, valid_index) in enumerate(kf.split(X)):
        x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        x_train_balan, y_train_balan = rus.fit_resample(x_train, y_train)
        if verbose:
            print('After under sampling:')
            print(f'Length of training data: {len(x_train_balan)}, and its distribution among each severity {Counter(y_train_balan)}')

        estimator.fit(x_train_balan, y_train_balan)
        y_valid_pred = estimator.predict(x_valid)
        beta_f1 = custom_scoring(y_valid, y_valid_pred, verbose=False)
        print(f'Round {i} beta_f1: {beta_f1}')
        total_f1 += beta_f1

    avg_betaf1 = total_f1 / cv
    print(f'average beta f1-score after kfold: {avg_betaf1}')
    # Hand the score back so callers can compare models programmatically.
    return avg_betaf1


def test(estimator, x_test, y_test):
    """Print the hold-out evaluation of an already fitted estimator.

    Outputs the scikit-learn classification report, then the per-class
    breakdown printed by ``custom_scoring`` in verbose mode, and finally the
    aggregated beta f1-score. Nothing is returned.
    """
    print('Testing data:')
    predictions = estimator.predict(x_test)
    report_text = classification_report(y_test, predictions)
    print(report_text)
    score = custom_scoring(y_test, predictions, verbose=True)
    print(f'beta f1-score: {score}')
        
def auc_pr(estimator, x_test, y_test):
    """Plot one-vs-rest precision-recall curves and return their areas.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict_proba`` and ``classes_``.
    x_test : array-like
        Features to score.
    y_test : array-like
        True class labels (not one-hot encoded) for ``x_test``.

    Returns
    -------
    dict
        Maps each class label in ``estimator.classes_`` to the area under its
        precision-recall curve.
    """
    # Keep every probability column; column k belongs to estimator.classes_[k].
    y_scores = estimator.predict_proba(x_test)
    y_true = pd.Series(y_test).to_numpy()

    pr_auc = {}
    for col, cls in enumerate(estimator.classes_):
        # One-vs-rest target: 1 where the sample belongs to this class.
        is_cls = (y_true == cls).astype(int)
        precision, recall, _ = precision_recall_curve(is_cls, y_scores[:, col])
        pr_auc[cls] = auc(recall, precision)
        plt.plot(recall, precision, lw=2,
                 label='class {} (AUC={:.3f})'.format(cls, pr_auc[cls]))

    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend(loc="best")
    plt.title("precision vs. recall curve")
    plt.show()
    return pr_auc
        

## Model

In [None]:
# Hold out a stratified 20% test set; the seed is fixed so the same rows land
# in the test set on every run.
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [23]:
# Stratified 80/20 split with a fixed seed so the results are repeatable.
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# A decision tree permutes features at each split, so it needs a seed to be
# deterministic.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True)
# NOTE(review): cross_valid refits `dt` in every fold, so the model tested
# here is the one trained on the last fold's undersampled training part.
test(dt, x_test, y_test)

Validation data
After under sampling:
Length of training data: 168316, and its distribution among each severity Counter({1: 42079, 2: 42079, 3: 42079, 4: 42079})
Round 0 beta_f1: 0.6743087649416701
After under sampling:
Length of training data: 167940, and its distribution among each severity Counter({1: 41985, 2: 41985, 3: 41985, 4: 41985})
Round 1 beta_f1: 0.6756818602948657
After under sampling:
Length of training data: 167924, and its distribution among each severity Counter({1: 41981, 2: 41981, 3: 41981, 4: 41981})
Round 2 beta_f1: 0.6766462053608783
After under sampling:
Length of training data: 168480, and its distribution among each severity Counter({1: 42120, 2: 42120, 3: 42120, 4: 42120})
Round 3 beta_f1: 0.6718258381337472
After under sampling:
Length of training data: 168284, and its distribution among each severity Counter({1: 42071, 2: 42071, 3: 42071, 4: 42071})
Round 4 beta_f1: 0.6744575353594671
average beta f1-score after kfold: 0.6745840408181257
Testing data:
      

In [24]:
from sklearn.multiclass import OneVsRestClassifier
# One-vs-rest wrapper around a seeded decision tree, so repeated runs build
# the same trees.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))

cross_valid(x_train_valid, y_train_valid, clf, verbose=True)
# NOTE(review): `clf` is left fitted on the last fold's undersampled training
# part by cross_valid; that is the model evaluated on the test set here.
test(clf, x_test, y_test)

Validation data
After under sampling:
Length of training data: 168316, and its distribution among each severity Counter({1: 42079, 2: 42079, 3: 42079, 4: 42079})
Round 0 beta_f1: 0.5823601850660766
After under sampling:
Length of training data: 167940, and its distribution among each severity Counter({1: 41985, 2: 41985, 3: 41985, 4: 41985})
Round 1 beta_f1: 0.5850641210123708
After under sampling:
Length of training data: 167924, and its distribution among each severity Counter({1: 41981, 2: 41981, 3: 41981, 4: 41981})
Round 2 beta_f1: 0.5863385157724416
After under sampling:
Length of training data: 168480, and its distribution among each severity Counter({1: 42120, 2: 42120, 3: 42120, 4: 42120})
Round 3 beta_f1: 0.5808432141821133
After under sampling:
Length of training data: 168284, and its distribution among each severity Counter({1: 42071, 2: 42071, 3: 42071, 4: 42071})
Round 4 beta_f1: 0.5844132016657172
average beta f1-score after kfold: 0.5838038475397439
Testing data:
      

In [25]:
# Fit on the whole training/validation set without undersampling, using a
# seeded tree so the reported test metrics are repeatable.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42)).fit(x_train_valid, y_train_valid)
test(clf, x_test, y_test)

Testing data:
              precision    recall  f1-score   support

           1       0.70      0.31      0.43     13140
           2       0.95      0.87      0.91   1191687
           3       0.71      0.71      0.71    252647
           4       0.13      0.49      0.21     38727

    accuracy                           0.83   1496201
   macro avg       0.62      0.59      0.56   1496201
weighted avg       0.88      0.83      0.85   1496201

beta f1 for level [1]: 0.5571057575422699, pr: 0.6996877168632893, rc: 0.3069254185692542
beta f1 for level [2]: 0.9073552951967069, pr: 0.9461156329243056, rc: 0.8716458264628212
beta f1 for level [3]: 0.712954568448868, pr: 0.714680030227101, rc: 0.7112374182159298
beta f1 for level [4]: 0.31863647572687626, pr: 0.13368150284511654, rc: 0.4871278436233119
macro avg for f1: 0.8562148715122896
beta f1-score: 0.8562148715122896
