In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from metrics import custom_scoring, cross_valid, test
# Random under-sampler with a fixed seed (42) so that any resampling done
# through this object is repeatable between runs.
# NOTE(review): `rus` is not referenced by any cell visible in this part of the
# notebook — presumably used further down, or made redundant by
# cross_valid(..., balance_cls=True); confirm before removing.
rus = RandomUnderSampler(random_state=42)

In [2]:
# Load the baseline dataset. index_col=False tells pandas not to promote the
# first CSV column to the DataFrame index.
ori_data = pd.read_csv(
    filepath_or_buffer='./data/baseline.csv',
    index_col=False,
)

In [3]:
# Columns treated as continuous/numeric inputs.
numerical_features = [
    'Distance(mi)',
    'Temperature(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'elapsed_time',
    'Start_Lat',
    'Start_Lng',
]

# Every remaining column of the loaded frame is classified as categorical.
# NOTE(review): ori_data still holds the 'Severity' target at this point (it is
# only dropped when X is built), so 'Severity' ends up in this list as well —
# confirm that is intended before using the list to index X.
categorical_features = [
    column for column in ori_data.columns if column not in numerical_features
]

In [4]:
# The 'Severity' column is the prediction target; all other columns form the
# feature matrix.
y = ori_data['Severity']
X = ori_data.drop(columns=['Severity'])

## Model

In [5]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [6]:
# Hold out 20% of the rows as the final test set, stratified on the label so
# both splits keep the same class proportions.
# random_state is pinned (same seed as the under-sampler in the import cell) so
# that Restart & Run All reproduces the exact same split; without it every run
# evaluates on a different test set and the recorded scores are not comparable.
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Direct Testing

### Decision Tree

In [7]:
# dt = DecisionTreeClassifier()
# dt.fit(x_train_valid, y_train_valid)
# test(dt, x_test, y_test)

### Random Forest

In [8]:
# rfc = RandomForestClassifier(n_estimators=50, max_depth=15)
# rfc.fit(x_train_valid, y_train_valid)
# test(rfc, x_test, y_test)

### OneVsRestClassifier with decision tree

In [9]:
# clf = OneVsRestClassifier(DecisionTreeClassifier(max_depth=5)).fit(x_train_valid, y_train_valid)
# test(clf, x_test, y_test)

### OneVsRestClassifier with RandomForest

In [10]:
# clf = OneVsRestClassifier(RandomForestClassifier(max_depth=15)).fit(x_train_valid, y_train_valid)
# test(clf, x_test, y_test)

## K-fold cross-validation

### Decision Tree

Decision tree without balancing the dataset

In [11]:
# Single decision tree scored with the cross_valid helper on the
# train/validation split, without class balancing (balance_cls=False).
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ from run to run.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True, balance_cls=False)
# NOTE(review): `dt` is never fitted in this cell, so test() relies on
# cross_valid having fitted it in place — presumably on the last fold's
# training portion only; confirm in metrics.py.
test(dt, x_test, y_test)

Validation data
macro beta f1: 0.6086062673044836
weighted beta f1: 0.9346550649206863
Round 0 macro beta f1-score: 0.6086062673044836
Round 0 weighted beta f1-score: 0.9346550649206863
macro beta f1: 0.6143317753714886
weighted beta f1: 0.934601062796632
Round 1 macro beta f1-score: 0.6143317753714886
Round 1 weighted beta f1-score: 0.934601062796632
macro beta f1: 0.61300616665544
weighted beta f1: 0.9345490461352982
Round 2 macro beta f1-score: 0.61300616665544
Round 2 weighted beta f1-score: 0.9345490461352982
macro beta f1: 0.6145712414771322
weighted beta f1: 0.9346577980040108
Round 3 macro beta f1-score: 0.6145712414771322
Round 3 weighted beta f1-score: 0.9346577980040108
macro beta f1: 0.6133052855329997
weighted beta f1: 0.9346016540657781
Round 4 macro beta f1-score: 0.6133052855329997
Round 4 weighted beta f1-score: 0.9346016540657781
average macro beta f1-score after kfold: 0.6127641472683087
average weighted beta f1-score after kfold: 0.9346129251844811
Testing data:
   

Decision tree with a balanced (under-sampled) dataset

In [12]:
# Single decision tree scored with the cross_valid helper, this time with
# balance_cls=True (the recorded output shows each training fold being
# under-sampled to equal class counts).
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ from run to run.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True, balance_cls=True)
# NOTE(review): `dt` is never fitted in this cell, so test() relies on
# cross_valid having fitted it in place — presumably on the last (under-sampled)
# fold only; confirm in metrics.py.
test(dt, x_test, y_test)

Validation data
After under sampling:
Length of training data: 65536, and its distribution among each severity Counter({1: 16384, 2: 16384, 3: 16384, 4: 16384})
macro beta f1: 0.4429413214141771
weighted beta f1: 0.7993421491865639
Round 0 macro beta f1-score: 0.4429413214141771
Round 0 weighted beta f1-score: 0.7993421491865639
After under sampling:
Length of training data: 65460, and its distribution among each severity Counter({1: 16365, 2: 16365, 3: 16365, 4: 16365})
macro beta f1: 0.4443633617561563
weighted beta f1: 0.8018803961707125
Round 1 macro beta f1-score: 0.4443633617561563
Round 1 weighted beta f1-score: 0.8018803961707125
After under sampling:
Length of training data: 65300, and its distribution among each severity Counter({1: 16325, 2: 16325, 3: 16325, 4: 16325})
macro beta f1: 0.4444187339946868
weighted beta f1: 0.8014897033366749
Round 2 macro beta f1-score: 0.4444187339946868
Round 2 weighted beta f1-score: 0.8014897033366749
After under sampling:
Length of trainin

### OneVsRestClassifier with decision tree as base model

Without balancing the dataset

In [13]:
# One-vs-rest wrapper around a decision tree, scored with the cross_valid
# helper without class balancing (balance_cls=False).
# The base tree is seeded so every per-class tree that OneVsRestClassifier
# builds from it is reproducible across runs.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=False)
# NOTE(review): `clf` is never fitted in this cell, so test() relies on
# cross_valid having fitted it in place — presumably on the last fold's
# training portion only; confirm in metrics.py.
test(clf, x_test, y_test)

Validation data
macro beta f1: 0.6392673641654405
weighted beta f1: 0.9307670931434202
Round 0 macro beta f1-score: 0.6392673641654405
Round 0 weighted beta f1-score: 0.9307670931434202
macro beta f1: 0.6403381192861543
weighted beta f1: 0.930338337148393
Round 1 macro beta f1-score: 0.6403381192861543
Round 1 weighted beta f1-score: 0.930338337148393
macro beta f1: 0.639237350914595
weighted beta f1: 0.9301678926078267
Round 2 macro beta f1-score: 0.639237350914595
Round 2 weighted beta f1-score: 0.9301678926078267
macro beta f1: 0.6382444403559981
weighted beta f1: 0.9301988147944072
Round 3 macro beta f1-score: 0.6382444403559981
Round 3 weighted beta f1-score: 0.9301988147944072
macro beta f1: 0.6394434227163995
weighted beta f1: 0.9305266610757716
Round 4 macro beta f1-score: 0.6394434227163995
Round 4 weighted beta f1-score: 0.9305266610757716
average macro beta f1-score after kfold: 0.6393061394877175
average weighted beta f1-score after kfold: 0.9303997597539638
Testing data:
 

With a balanced (under-sampled) dataset

In [14]:
# One-vs-rest wrapper around a decision tree, scored with the cross_valid
# helper with balance_cls=True (the recorded output shows each training fold
# being under-sampled to equal class counts).
# The base tree is seeded so every per-class tree that OneVsRestClassifier
# builds from it is reproducible across runs.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=True)
# NOTE(review): `clf` is never fitted in this cell, so test() relies on
# cross_valid having fitted it in place — presumably on the last (under-sampled)
# fold only; confirm in metrics.py.
test(clf, x_test, y_test)

Validation data
After under sampling:
Length of training data: 65536, and its distribution among each severity Counter({1: 16384, 2: 16384, 3: 16384, 4: 16384})
macro beta f1: 0.441149950806238
weighted beta f1: 0.7119110899924829
Round 0 macro beta f1-score: 0.441149950806238
Round 0 weighted beta f1-score: 0.7119110899924829
After under sampling:
Length of training data: 65460, and its distribution among each severity Counter({1: 16365, 2: 16365, 3: 16365, 4: 16365})
macro beta f1: 0.44445923439244955
weighted beta f1: 0.7164537227551993
Round 1 macro beta f1-score: 0.44445923439244955
Round 1 weighted beta f1-score: 0.7164537227551993
After under sampling:
Length of training data: 65300, and its distribution among each severity Counter({1: 16325, 2: 16325, 3: 16325, 4: 16325})
macro beta f1: 0.45047386944582773
weighted beta f1: 0.7191107651074691
Round 2 macro beta f1-score: 0.45047386944582773
Round 2 weighted beta f1-score: 0.7191107651074691
After under sampling:
Length of train

### OneVsRestClassifier with Random Forest as base model

Without balancing the dataset

In [15]:
# One-vs-rest wrapper around a random forest, scored with the cross_valid
# helper without class balancing (balance_cls=False).
# A random forest draws bootstrap samples and feature subsets at random, so the
# base estimator is seeded to make the recorded scores reproducible.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=False)
# NOTE(review): `clf` is never fitted in this cell, so test() relies on
# cross_valid having fitted it in place — presumably on the last fold's
# training portion only; confirm in metrics.py.
test(clf, x_test, y_test)

Validation data
macro beta f1: 0.6488772486366947
weighted beta f1: 0.9429423027034506
Round 0 macro beta f1-score: 0.6488772486366947
Round 0 weighted beta f1-score: 0.9429423027034506
macro beta f1: 0.6501802664754587
weighted beta f1: 0.9423878111127767
Round 1 macro beta f1-score: 0.6501802664754587
Round 1 weighted beta f1-score: 0.9423878111127767
macro beta f1: 0.6534128975905689
weighted beta f1: 0.9428832733208781
Round 2 macro beta f1-score: 0.6534128975905689
Round 2 weighted beta f1-score: 0.9428832733208781
macro beta f1: 0.651374438292921
weighted beta f1: 0.9425046273947294
Round 3 macro beta f1-score: 0.651374438292921
Round 3 weighted beta f1-score: 0.9425046273947294
macro beta f1: 0.6550878663748447
weighted beta f1: 0.9432110289396924
Round 4 macro beta f1-score: 0.6550878663748447
Round 4 weighted beta f1-score: 0.9432110289396924
average macro beta f1-score after kfold: 0.6517865434740976
average weighted beta f1-score after kfold: 0.9427858086943054
Testing data:

With a balanced (under-sampled) dataset

In [16]:
# One-vs-rest wrapper around a random forest, scored with the cross_valid
# helper with balance_cls=True (the recorded output shows each training fold
# being under-sampled to equal class counts).
# A random forest draws bootstrap samples and feature subsets at random, so the
# base estimator is seeded to make the recorded scores reproducible.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=True)
# NOTE(review): `clf` is never fitted in this cell, so test() relies on
# cross_valid having fitted it in place — presumably on the last (under-sampled)
# fold only; confirm in metrics.py.
test(clf, x_test, y_test)

Validation data
After under sampling:
Length of training data: 65536, and its distribution among each severity Counter({1: 16384, 2: 16384, 3: 16384, 4: 16384})
macro beta f1: 0.47728331997897794
weighted beta f1: 0.8325959975988974
Round 0 macro beta f1-score: 0.47728331997897794
Round 0 weighted beta f1-score: 0.8325959975988974
After under sampling:
Length of training data: 65460, and its distribution among each severity Counter({1: 16365, 2: 16365, 3: 16365, 4: 16365})
macro beta f1: 0.4811369641284083
weighted beta f1: 0.8337410815192192
Round 1 macro beta f1-score: 0.4811369641284083
Round 1 weighted beta f1-score: 0.8337410815192192
After under sampling:
Length of training data: 65300, and its distribution among each severity Counter({1: 16325, 2: 16325, 3: 16325, 4: 16325})
macro beta f1: 0.4805144255395421
weighted beta f1: 0.8316958832099013
Round 2 macro beta f1-score: 0.4805144255395421
Round 2 weighted beta f1-score: 0.8316958832099013
After under sampling:
Length of train