In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

ori_data = pd.read_csv('./data/v2-1.csv', index_col=False)


In [4]:
numerical_features = [
    'Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
    'Wind_Speed(mph)', 'elapsed_time'
]

info_cols = ['Severity', 'Start_Lat', 'Start_Lng', 'Weather_Timestamp', 'Hour', 'Day', 'Minute']
categorical_features = [f for f in list(ori_data.columns) if (f not in numerical_features and f not in info_cols)]



In [None]:
label_encoder = preprocessing.LabelEncoder()
for column in categorical_features:
    ori_data[column] = label_encoder.fit_transform(ori_data[column])
    ori_data[column] = ori_data[column].astype(np.int32)

# process continous value to float32
for column in numerical_features:
    ori_data[column] = ori_data[column].astype(np.float32)

In [None]:
ori_data.loc[0]

In [5]:
ori_data.to_csv('./data/v3.csv', index = False)

# Predict

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

In [2]:
ori_data = pd.read_csv('./data/v3.csv', index_col=False)
numerical_features = [
    'Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
    'Wind_Speed(mph)', 'elapsed_time'
]

info_cols = ['Severity', 'Start_Lat', 'Start_Lng', 'Weather_Timestamp', 'Hour', 'Day', 'Minute']
categorical_features = [f for f in list(ori_data.columns) if (f not in numerical_features and f not in info_cols)]

In [5]:
X = ori_data.drop(info_cols, axis=1)
y = ori_data['Severity']

In [6]:
def custom_scoring(y_true, y_pred, verbose=False):
    report = classification_report(y_true, y_pred, output_dict=True)
    total_f1 = 0
    beta_weights = {
        '1': 0.5,
        '2': 1,
        '3': 1,
        '4': 2,
    }
    for cl in range(1, 5):
        pr = report[str(cl)]['precision']
        rc = report[str(cl)]['recall']
        beta = beta_weights[str(cl)]
        beta_f1 = ((1+beta**2)*pr*rc)/(pr*(beta**2) + rc)
        if verbose: 
            print(f'beta f1 for level [{cl}]: {beta_f1}, pr: {pr}, rc: {rc}')
        total_f1 += beta_f1

    avg_f1 = total_f1/4
    if verbose:
        print(f"macro avg for f1: {avg_f1}")
    return avg_f1

In [33]:
from sklearn.model_selection import KFold

def cross_valid(X, y, estimator, cv=5, verbose=False):
    round = 1
    total_f1 = 0
    x_train_valid, x_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    x_train_valid.reset_index()
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    print('Validation data')
    for i, (train_index, valid_index) in enumerate(kf.split(x_train_valid)):
        x_train, x_valid = x_train_valid.iloc[train_index], x_train_valid.iloc[valid_index]
        y_train, y_valid = y_train_valid.iloc[train_index], y_train_valid.iloc[valid_index]
        
        x_train_balan, y_train_balan = rus.fit_resample(x_train, y_train)
        # if verbose:
        #     print('After under sampling:')
        #     print(f'Length of training data: {len(x_train_balan)}, and its distribution among each severity {Counter(y_train_balan)}')

        estimator.fit(x_train_balan, y_train_balan)
        y_valid_pred = estimator.predict(x_valid)
        beta_f1 = custom_scoring(y_valid, y_valid_pred, verbose=False)
        print(f'Round {round} beta_f1: {beta_f1}')
        total_f1 += beta_f1
        round += 1
        
    avg_betaf1 = total_f1 / cv
    print(f'average beta f1-score: {avg_betaf1}')

    print('Testing data')
    y_test_pred = estimator.predict(x_test)
    print(classification_report(y_test, y_test_pred))
    beta_f1 = custom_scoring(y_test, y_test_pred, verbose=True)
    print(f'beta f1-score: {beta_f1}')
        

        

## Imbalance testing data

In [34]:
dt = DecisionTreeClassifier()
cross_valid(X, y, dt, verbose=True)

Validation data
Round 1 beta_f1: 0.4120755017415445
Round 2 beta_f1: 0.4132266914387351
Round 3 beta_f1: 0.41248355656196034
Round 4 beta_f1: 0.41139465321208984
Round 5 beta_f1: 0.41079151409006953
average beta f1-score: 0.4119943834088799
Testing data
              precision    recall  f1-score   support

           1       0.08      0.81      0.14     13140
           2       0.93      0.57      0.70   1191687
           3       0.43      0.64      0.52    252647
           4       0.11      0.71      0.18     38727

    accuracy                           0.58   1496201
   macro avg       0.39      0.68      0.39   1496201
weighted avg       0.81      0.58      0.65   1496201

beta f1 for level [1]: 0.09687839305103149, pr: 0.07939050161272383, rc: 0.8148401826484019
beta f1 for level [2]: 0.7031377639078272, pr: 0.9262645882372307, rc: 0.5666404013805638
beta f1 for level [3]: 0.5165989732725422, pr: 0.43269490297943075, rc: 0.6408704635321218
beta f1 for level [4]: 0.3312326133747

In [40]:
abc = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=15), n_estimators=100)
abc.fit(x_res, y_res)
y_pred = abc.predict(x_res_test)

print(classification_report(y_res_test, y_pred))
custom_scoring(y_res_test, y_pred, verbose=True)

              precision    recall  f1-score   support

           1       0.85      0.89      0.87     13140
           2       0.69      0.53      0.60     13140
           3       0.72      0.74      0.73     13140
           4       0.71      0.82      0.76     13140

    accuracy                           0.75     52560
   macro avg       0.74      0.75      0.74     52560
weighted avg       0.74      0.75      0.74     52560

beta f1 for level [1]: 0.8610980068404293, pr: 0.8543845534995977, rc: 0.8890410958904109
beta f1 for level [2]: 0.6013835775362008, pr: 0.690614822856015, rc: 0.532572298325723
beta f1 for level [3]: 0.7303509362036102, pr: 0.7175589336858339, rc: 0.743607305936073
beta f1 for level [4]: 0.7945699218576894, pr: 0.7107088590870053, rc: 0.8187214611872146
macro avg for f1: 0.7468506106094824


0.7468506106094824