In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from metrics import custom_scoring, cross_valid, test
# Fixed seed so the random under-sampler draws the same rows on every run.
UNDER_SAMPLING_SEED = 42
# NOTE(review): `rus` is not referenced by any cell visible in this part of the
# notebook; the under-sampling reported in the outputs appears to happen inside
# cross_valid() -- confirm whether this object is still needed.
rus = RandomUnderSampler(random_state=UNDER_SAMPLING_SEED)

In [2]:
# Load the dataset. index_col=False tells pandas not to use the first column
# as the row index.
ori_data = pd.read_csv(
    filepath_or_buffer='./data/v2-4.csv',
    index_col=False,
)

In [3]:
# Columns treated as continuous (numeric) inputs.
numerical_features = [
    'Distance(mi)',
    'Temperature(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'elapsed_time',
    'Start_Lat',
    'Start_Lng',
]

# Every remaining column is treated as categorical, in the original column
# order. The label column 'Severity' is excluded explicitly: it is the
# prediction target and is dropped from X, so keeping it in this list would
# either leak the label or raise a KeyError when the list is used to index X.
categorical_features = [
    col
    for col in ori_data.columns
    if col not in numerical_features and col != 'Severity'
]

In [4]:
# Target vector: the severity label.
y = ori_data['Severity']
# Feature matrix: every column except the label.
X = ori_data.drop(columns='Severity')

## Model

In [5]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [6]:
# Hold out 20% of the rows as the final test set, stratified on the label so
# each severity class keeps its proportion in both splits. random_state is
# fixed so that re-running the notebook evaluates on the same held-out rows.
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Direct Testing

### Decision Tree

In [7]:
# dt = DecisionTreeClassifier()
# dt.fit(x_train_valid, y_train_valid)
# test(dt, x_test, y_test)

### Random Forest

In [8]:
# rfc = RandomForestClassifier(n_estimators=50, max_depth=15)
# rfc.fit(x_train_valid, y_train_valid)
# test(rfc, x_test, y_test)

### OneVsRestClassifier with decision tree

In [9]:
# clf = OneVsRestClassifier(DecisionTreeClassifier(max_depth=5)).fit(x_train_valid, y_train_valid)
# test(clf, x_test, y_test)

### OneVsRestClassifier with RandomForest

In [10]:
# clf = OneVsRestClassifier(RandomForestClassifier(max_depth=15)).fit(x_train_valid, y_train_valid)
# test(clf, x_test, y_test)

## **KFold cross validation**

### Decision Tree

Decision tree trained on the original (unbalanced) dataset

In [11]:
# Single decision tree, cross-validated without class balancing and then
# scored on the held-out test split. random_state is fixed so the tree (and
# therefore the reported scores) is repeatable between runs.
# NOTE(review): test() relies on cross_valid() leaving `dt` fitted -- confirm
# in metrics.py which fold/data the final fit uses.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True, balance_cls=False)
test(dt, x_test, y_test)

Validation data
macro beta f1: 0.5767200341950788
weighted beta f1: 0.8603618097399257
Round 0 macro beta f1-score: 0.5767200341950788
Round 0 weighted beta f1-score: 0.8603618097399257
macro beta f1: 0.5816996222316548
weighted beta f1: 0.8608728014833144
Round 1 macro beta f1-score: 0.5816996222316548
Round 1 weighted beta f1-score: 0.8608728014833144
macro beta f1: 0.5790598641559622
weighted beta f1: 0.8609877676545479
Round 2 macro beta f1-score: 0.5790598641559622
Round 2 weighted beta f1-score: 0.8609877676545479
macro beta f1: 0.5781349810257188
weighted beta f1: 0.8606886054403797
Round 3 macro beta f1-score: 0.5781349810257188
Round 3 weighted beta f1-score: 0.8606886054403797
macro beta f1: 0.5795187831305665
weighted beta f1: 0.8609567724736372
Round 4 macro beta f1-score: 0.5795187831305665
Round 4 weighted beta f1-score: 0.8609567724736372
average macro beta f1-score after kfold: 0.5790266569477962
average weighted beta f1-score after kfold: 0.860773551358361
Testing data

Decision tree trained on a class-balanced dataset (random under-sampling)

In [12]:
# Same decision tree, but with balance_cls=True so that cross_valid()
# under-samples the training folds (see the "After under sampling" output).
# random_state is fixed so the tree is repeatable between runs.
# NOTE(review): test() relies on cross_valid() leaving `dt` fitted -- confirm
# in metrics.py which fold/data the final fit uses.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True, balance_cls=True)
test(dt, x_test, y_test)

Validation data
After under sampling:
Length of training data: 168452, and its distribution among each severity Counter({1: 42113, 2: 42113, 3: 42113, 4: 42113})
macro beta f1: 0.4255558169627195
weighted beta f1: 0.6732030862765519
Round 0 macro beta f1-score: 0.4255558169627195
Round 0 weighted beta f1-score: 0.6732030862765519
After under sampling:
Length of training data: 168160, and its distribution among each severity Counter({1: 42040, 2: 42040, 3: 42040, 4: 42040})
macro beta f1: 0.42651509089473505
weighted beta f1: 0.6731982357939555
Round 1 macro beta f1-score: 0.42651509089473505
Round 1 weighted beta f1-score: 0.6731982357939555
After under sampling:
Length of training data: 168652, and its distribution among each severity Counter({1: 42163, 2: 42163, 3: 42163, 4: 42163})
macro beta f1: 0.42639392314889324
weighted beta f1: 0.6758421890830225
Round 2 macro beta f1-score: 0.42639392314889324
Round 2 weighted beta f1-score: 0.6758421890830225
After under sampling:
Length of 

### OneVsRestClassifier with decision tree as base model

Without class balancing (original class distribution)

In [13]:
# One-vs-rest wrapper around a decision tree, cross-validated without class
# balancing and then scored on the held-out test split. The base tree is
# seeded so every per-class tree is repeatable between runs.
# NOTE(review): test() relies on cross_valid() leaving `clf` fitted -- confirm
# in metrics.py which fold/data the final fit uses.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=False)
test(clf, x_test, y_test)

Validation data
macro beta f1: 0.6177335333829257
weighted beta f1: 0.853150661871581
Round 0 macro beta f1-score: 0.6177335333829257
Round 0 weighted beta f1-score: 0.853150661871581
macro beta f1: 0.6183928954962548
weighted beta f1: 0.853590758535722
Round 1 macro beta f1-score: 0.6183928954962548
Round 1 weighted beta f1-score: 0.853590758535722
macro beta f1: 0.6133590057851268
weighted beta f1: 0.8524980057162639
Round 2 macro beta f1-score: 0.6133590057851268
Round 2 weighted beta f1-score: 0.8524980057162639
macro beta f1: 0.6167409335534524
weighted beta f1: 0.852933813759346
Round 3 macro beta f1-score: 0.6167409335534524
Round 3 weighted beta f1-score: 0.852933813759346
macro beta f1: 0.6197746702876641
weighted beta f1: 0.853298089636416
Round 4 macro beta f1-score: 0.6197746702876641
Round 4 weighted beta f1-score: 0.853298089636416
average macro beta f1-score after kfold: 0.6172002077010847
average weighted beta f1-score after kfold: 0.8530942659038656
Testing data:
     

With class balancing (random under-sampling)

In [14]:
# One-vs-rest wrapper around a decision tree, with balance_cls=True so that
# cross_valid() under-samples the training folds. The base tree is seeded so
# every per-class tree is repeatable between runs.
# NOTE(review): test() relies on cross_valid() leaving `clf` fitted -- confirm
# in metrics.py which fold/data the final fit uses.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=True)
test(clf, x_test, y_test)

Validation data
After under sampling:
Length of training data: 168452, and its distribution among each severity Counter({1: 42113, 2: 42113, 3: 42113, 4: 42113})
macro beta f1: 0.39047739896892447
weighted beta f1: 0.5817509070206165
Round 0 macro beta f1-score: 0.39047739896892447
Round 0 weighted beta f1-score: 0.5817509070206165
After under sampling:
Length of training data: 168160, and its distribution among each severity Counter({1: 42040, 2: 42040, 3: 42040, 4: 42040})
macro beta f1: 0.39131335458123473
weighted beta f1: 0.5818926810190825
Round 1 macro beta f1-score: 0.39131335458123473
Round 1 weighted beta f1-score: 0.5818926810190825
After under sampling:
Length of training data: 168652, and its distribution among each severity Counter({1: 42163, 2: 42163, 3: 42163, 4: 42163})
macro beta f1: 0.39315873116756606
weighted beta f1: 0.588154574035084
Round 2 macro beta f1-score: 0.39315873116756606
Round 2 weighted beta f1-score: 0.588154574035084
After under sampling:
Length of 

### OneVsRestClassifier with Random Forest as base model

Without class balancing (original class distribution)

In [15]:
# One-vs-rest wrapper around a random forest, cross-validated without class
# balancing and then scored on the held-out test split. The forest is seeded
# so its bootstrap samples and feature subsets are repeatable between runs.
# NOTE(review): test() relies on cross_valid() leaving `clf` fitted -- confirm
# in metrics.py which fold/data the final fit uses.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=False)
test(clf, x_test, y_test)

Validation data
macro beta f1: 0.6418780682098323
weighted beta f1: 0.8876684508310535
Round 0 macro beta f1-score: 0.6418780682098323
Round 0 weighted beta f1-score: 0.8876684508310535
macro beta f1: 0.6448523267162342
weighted beta f1: 0.8876109198462475
Round 1 macro beta f1-score: 0.6448523267162342
Round 1 weighted beta f1-score: 0.8876109198462475
macro beta f1: 0.6414362416704645
weighted beta f1: 0.8874386922757742
Round 2 macro beta f1-score: 0.6414362416704645
Round 2 weighted beta f1-score: 0.8874386922757742
macro beta f1: 0.6445348434879534
weighted beta f1: 0.8880354984040277
Round 3 macro beta f1-score: 0.6445348434879534
Round 3 weighted beta f1-score: 0.8880354984040277
macro beta f1: 0.6415594925561677
weighted beta f1: 0.8875217328390691
Round 4 macro beta f1-score: 0.6415594925561677
Round 4 weighted beta f1-score: 0.8875217328390691
average macro beta f1-score after kfold: 0.6428521945281305
average weighted beta f1-score after kfold: 0.8876550588392345
Testing dat

With class balancing (random under-sampling)

In [16]:
# One-vs-rest wrapper around a random forest, with balance_cls=True so that
# cross_valid() under-samples the training folds. The forest is seeded so its
# bootstrap samples and feature subsets are repeatable between runs.
# NOTE(review): test() relies on cross_valid() leaving `clf` fitted -- confirm
# in metrics.py which fold/data the final fit uses.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=True)
test(clf, x_test, y_test)

Validation data
After under sampling:
Length of training data: 168452, and its distribution among each severity Counter({1: 42113, 2: 42113, 3: 42113, 4: 42113})
macro beta f1: 0.46619710336394715
weighted beta f1: 0.7025510142032034
Round 0 macro beta f1-score: 0.46619710336394715
Round 0 weighted beta f1-score: 0.7025510142032034
After under sampling:
Length of training data: 168160, and its distribution among each severity Counter({1: 42040, 2: 42040, 3: 42040, 4: 42040})
macro beta f1: 0.46605442837926514
weighted beta f1: 0.7017701261183652
Round 1 macro beta f1-score: 0.46605442837926514
Round 1 weighted beta f1-score: 0.7017701261183652
After under sampling:
Length of training data: 168652, and its distribution among each severity Counter({1: 42163, 2: 42163, 3: 42163, 4: 42163})
macro beta f1: 0.4662861652531417
weighted beta f1: 0.7029282611502463
Round 2 macro beta f1-score: 0.4662861652531417
Round 2 weighted beta f1-score: 0.7029282611502463
After under sampling:
Length of 