In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from metrics import custom_scoring, cross_valid, test
# Random under-sampler with a fixed seed (random_state=42).
# NOTE(review): `rus` is not referenced in the cells visible here — confirm it is still needed.
rus = RandomUnderSampler(random_state=42)

In [None]:
# Load the dataset from the local CSV file.
# index_col=False tells pandas not to promote the first column to the index,
# so every column in the file stays available as a regular column.
ori_data = pd.read_csv(
    filepath_or_buffer='./data/v2-4.csv',
    index_col=False,
)

In [None]:
# Columns treated as continuous/numeric inputs.
numerical_features = [
    'Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
    'Wind_Speed(mph)', 'elapsed_time', 'Start_Lat', 'Start_Lng'
]

# Every remaining column is treated as categorical, except the prediction
# target 'Severity': it is removed from the feature matrix below, so keeping it
# in this list would either raise a KeyError when indexing X or leak the label
# into the features when indexing ori_data.
categorical_features = [
    f for f in ori_data.columns
    if f not in numerical_features and f != 'Severity'
]

In [None]:
# Split the frame into the target label (y) and the predictors (X).
y = ori_data['Severity']
X = ori_data.drop(columns=['Severity'])

## Model

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [None]:
# Hold out 20% of the rows as the test split; the remaining 80% is used for
# cross-validation. stratify=y keeps the Severity class proportions the same in
# both parts. random_state is fixed so the same rows land in the test split on
# every run, which keeps the scores of the cells below comparable across reruns.
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Direct Testing

### Decision Tree

In [None]:
# dt = DecisionTreeClassifier()
# dt.fit(x_train_valid, y_train_valid)
# test(dt, x_test, y_test)

### Random Forest

In [None]:
# rfc = RandomForestClassifier(n_estimators=50, max_depth=15)
# rfc.fit(x_train_valid, y_train_valid)
# test(rfc, x_test, y_test)

### OneVsRestClassifier with decision tree

In [None]:
# clf = OneVsRestClassifier(DecisionTreeClassifier(max_depth=5)).fit(x_train_valid, y_train_valid)
# test(clf, x_test, y_test)

### OneVsRestClassifier with RandomForest

In [None]:
# clf = OneVsRestClassifier(RandomForestClassifier(max_depth=15)).fit(x_train_valid, y_train_valid)
# test(clf, x_test, y_test)

## **KFold cross validation**

### Decision Tree

Decision tree without class balancing (`balance_cls=False`)

In [None]:
# Decision tree cross-validated on the train/validation split with class
# balancing disabled, then evaluated on the held-out test split.
# random_state pins the tree's internal randomness so that, given the same
# training data, reruns build the same tree.
# NOTE(review): cross_valid and test come from the local `metrics` module; this
# assumes cross_valid leaves `dt` fitted before test() uses it — confirm.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True, balance_cls=False)
test(dt, x_test, y_test)

Decision tree with class balancing (`balance_cls=True`)

In [None]:
# Decision tree cross-validated on the train/validation split with class
# balancing enabled, then evaluated on the held-out test split.
# random_state pins the tree's internal randomness so that, given the same
# training data, reruns build the same tree.
# NOTE(review): how balance_cls=True rebalances the classes is defined in the
# local `metrics` module and is not visible here; this also assumes cross_valid
# leaves `dt` fitted before test() uses it — confirm.
dt = DecisionTreeClassifier(random_state=42)
cross_valid(x_train_valid, y_train_valid, dt, verbose=True, balance_cls=True)
test(dt, x_test, y_test)

### OneVsRestClassifier with decision tree as base model

Without class balancing (`balance_cls=False`)

In [None]:
# One-vs-rest wrapper around a decision tree, cross-validated with class
# balancing disabled and then evaluated on the held-out test split.
# random_state on the base tree pins its internal randomness so that, given the
# same training data, reruns build the same per-class trees.
# NOTE(review): cross_valid and test come from the local `metrics` module; this
# assumes cross_valid leaves `clf` fitted before test() uses it — confirm.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=False)
test(clf, x_test, y_test)

With class balancing (`balance_cls=True`)

In [None]:
# One-vs-rest wrapper around a decision tree, cross-validated with class
# balancing enabled and then evaluated on the held-out test split.
# random_state on the base tree pins its internal randomness so that, given the
# same training data, reruns build the same per-class trees.
# NOTE(review): how balance_cls=True rebalances the classes is defined in the
# local `metrics` module and is not visible here; this also assumes cross_valid
# leaves `clf` fitted before test() uses it — confirm.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=True)
test(clf, x_test, y_test)

### OneVsRestClassifier with Random Forest as base model

Without class balancing (`balance_cls=False`)

In [None]:
# One-vs-rest wrapper around a random forest, cross-validated with class
# balancing disabled and then evaluated on the held-out test split.
# random_state on the base forest pins its bootstrap and feature sampling so
# that, given the same training data, reruns build the same forests.
# NOTE(review): cross_valid and test come from the local `metrics` module; this
# assumes cross_valid leaves `clf` fitted before test() uses it — confirm.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=False)
test(clf, x_test, y_test)

With class balancing (`balance_cls=True`)

In [None]:
# One-vs-rest wrapper around a random forest, cross-validated with class
# balancing enabled and then evaluated on the held-out test split.
# random_state on the base forest pins its bootstrap and feature sampling so
# that, given the same training data, reruns build the same forests.
# NOTE(review): how balance_cls=True rebalances the classes is defined in the
# local `metrics` module and is not visible here; this also assumes cross_valid
# leaves `clf` fitted before test() uses it — confirm.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
cross_valid(x_train_valid, y_train_valid, clf, verbose=True, balance_cls=True)
test(clf, x_test, y_test)