<font size="+0.5">Load libraries and data.</font>

In [None]:
# Silence scikit-learn ConvergenceWarning messages for the rest of the notebook;
# other warning categories are left untouched.
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Filtered sensor recordings for the training split ...
train_acc = pd.read_csv('data/train_filtered_accelerometer.csv')
train_gyr = pd.read_csv('data/train_filtered_gyroscope.csv')
# ... and for the validation split.
val_acc = pd.read_csv('data/val_filtered_accelerometer.csv')
val_gyr = pd.read_csv('data/val_filtered_gyroscope.csv')

# Put accelerometer and gyroscope columns side by side. The gyroscope copy of
# "event" is dropped so the column appears only once in the merged frame.
# NOTE(review): concat(axis=1) aligns rows by index, so this assumes both files
# of a split describe the same rows in the same order - confirm.
train = pd.concat([train_acc, train_gyr.drop(columns=["event"])], axis=1)
val = pd.concat([val_acc, val_gyr.drop(columns=["event"])], axis=1)

# <center> Data preprocessing

<font size="+0.5">Label encoding.</font>

<font size="+0.5">Group the events into 3 categories, then encode them as integers.</font>

In [None]:
# Maps every raw event name to one of three severity categories.
label_dict = {
    **dict.fromkeys(
        ["Aggressive acceleration", "Aggressive breaking"],
        "critical",
    ),
    **dict.fromkeys(
        [
            "Aggressive left lane change",
            "Aggressive left turn",
            "Aggressive right lane change",
            "Aggressive right turn",
        ],
        "significant",
    ),
    "Non-aggressive event": "negligible",
}

In [None]:
def change_label(row):
    """Return the severity category for a row's ``'event'`` value.

    The category is looked up in ``label_dict``; an event name that is not a
    key of that dictionary raises ``KeyError``.
    """
    event_name = row['event']
    return label_dict[event_name]

In [None]:
# Attach the severity category to every row of both splits.
for frame in (train, val):
    frame['event_label'] = frame.apply(change_label, axis=1)

In [None]:
# Learn the category -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
encoder = LabelEncoder().fit(train['event_label'])
train['event_label_num'] = encoder.transform(train['event_label'])
val['event_label_num'] = encoder.transform(val['event_label'])

In [None]:
# Persist the fitted label encoder.
with open("models/encoder.pcl", "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

<font size="+0.5">Look at the histogram of the target class.</font>

In [None]:
# Distribution of the severity categories in the training split.
train.loc[:, 'event_label'].hist();

In [None]:
# Distribution of the severity categories in the validation split.
val.loc[:, 'event_label'].hist();

<font size="+0.5">Data normalisation.</font>

<font size="+0.5">Linear models have to work with normalized data.</font>

In [None]:
# Standardise every accelerometer axis separately. Each axis gets its own
# StandardScaler, fitted on the training split only and then applied to the
# validation split, and the fitted scaler is pickled to models/<column>.pcl.
# After the loop `normalizer` is the scaler fitted on the z axis, as before.
for axis_column in ('x_accelerometer', 'y_accelerometer', 'z_accelerometer'):
    normalizer = StandardScaler()

    # The scaler works on 2-D input, hence reshape(-1, 1); ravel() turns the
    # (n, 1) result back into a flat column.
    train[f'{axis_column}_scaled'] = normalizer.fit_transform(
        train[axis_column].values.reshape(-1, 1)
    ).ravel()
    val[f'{axis_column}_scaled'] = normalizer.transform(
        val[axis_column].values.reshape(-1, 1)
    ).ravel()

    with open(f"models/{axis_column}.pcl", "wb") as file:
        pickle.dump(normalizer, file)

<font size="+0.5">Plots showing the difference between the filtered and the normalized values.</font>

In [None]:
# Compare the first 100 filtered samples of each accelerometer axis with their
# standardised counterpart, one panel per axis.
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

first_samples = train[:100]
for panel, axis_name in zip(ax, ('x', 'y', 'z')):
    panel.plot(first_samples[f'{axis_name}_accelerometer'].values,
               label=f'{axis_name} accelerometer')
    panel.plot(first_samples[f'{axis_name}_accelerometer_scaled'].values,
               label=f'accelerometer {axis_name} normalizing')
    panel.legend()

# plt.title() would only title the last panel; suptitle() titles the whole figure.
fig.suptitle('Filtered accelerometer axis normalizing', fontsize=20)
plt.show()

# <center> Feature engineering

<font size="+0.5">Add acceleration feature</font>

In [None]:
# Magnitude of the 3-axis accelerometer vector, computed once from the filtered
# values and once from the standardised values, for both splits.
magnitude_sources = {
    'acceleration': ('x_accelerometer', 'y_accelerometer', 'z_accelerometer'),
    'acceleration_scaled': ('x_accelerometer_scaled', 'y_accelerometer_scaled', 'z_accelerometer_scaled'),
}

for frame in (train, val):
    for target_column, (x_col, y_col, z_col) in magnitude_sources.items():
        frame[target_column] = np.sqrt(frame[x_col]**2 + frame[y_col]**2 + frame[z_col]**2)

# <center> Training and evaluating

### <center> Linear models

<font size="+0.5">Select the features and the target for the training and validation sets.</font>

In [None]:
# Inputs for the scale-sensitive models: standardised accelerometer columns
# plus the gyroscope columns as loaded.
scaled_feature_columns = [
    'x_accelerometer_scaled',
    'y_accelerometer_scaled',
    'z_accelerometer_scaled',
    'acceleration_scaled',
    'x_gyroscope',
    'y_gyroscope',
    'z_gyroscope',
]

x_train, y_train = train[scaled_feature_columns], train['event_label_num'].values
x_val, y_val = val[scaled_feature_columns], val['event_label_num'].values

<font size="+0.5">The histogram of the target class shows that it is imbalanced. We can't balance the data by cutting or duplicating samples, so we decided to use class weights.</font>

In [None]:
# All encoded labels (training followed by validation) in one flat array.
y = np.concatenate((np.ravel(y_train), np.ravel(y_val)))

In [None]:
# "Balanced" weight per encoded class, computed from the combined labels in `y`.
# `classes` and `y` are passed by keyword: scikit-learn >= 1.0 rejects them as
# positional arguments, while older releases accept the keyword form as well.
unique_labels = np.unique(y)
class_weight = dict(
    zip(unique_labels, compute_class_weight('balanced', classes=unique_labels, y=y))
)

In [None]:
# Class-weighted logistic regression.
clf_lr = LogisticRegression(random_state=42, class_weight=class_weight).fit(x_train, y_train)

lr_predict_train = clf_lr.predict(x_train)
lr_predict_val = clf_lr.predict(x_val)

# accuracy_score is symmetric in its two arguments; (y_true, y_pred) is the documented order.
print(f"Train accuracy: {accuracy_score(y_train, lr_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, lr_predict_val)}")

In [None]:
# Class-weighted passive-aggressive classifier.
clf_pac = PassiveAggressiveClassifier(random_state=42, class_weight=class_weight).fit(x_train, y_train)

pac_predict_train = clf_pac.predict(x_train)
pac_predict_val = clf_pac.predict(x_val)

# accuracy_score is symmetric in its two arguments; (y_true, y_pred) is the documented order.
print(f"Train accuracy: {accuracy_score(y_train, pac_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, pac_predict_val)}")

In [None]:
# Class-weighted support vector classifier with the library's default kernel settings.
clf_svc = SVC(random_state=42, class_weight=class_weight).fit(x_train, y_train)

svc_predict_train = clf_svc.predict(x_train)
svc_predict_val = clf_svc.predict(x_val)

# accuracy_score is symmetric in its two arguments; (y_true, y_pred) is the documented order.
print(f"Train accuracy: {accuracy_score(y_train, svc_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, svc_predict_val)}")

In [None]:
# Multi-layer perceptron with one hidden layer of 50 units, trained with L-BFGS.
# Unlike the other classifiers in this section it is given no class weights.
clf_pcp = MLPClassifier(
    hidden_layer_sizes=50,
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
).fit(x_train, y_train)

pcp_predict_train = clf_pcp.predict(x_train)
pcp_predict_val = clf_pcp.predict(x_val)

# accuracy_score is symmetric in its two arguments; (y_true, y_pred) is the documented order.
print(f"Train accuracy: {accuracy_score(y_train, pcp_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, pcp_predict_val)}")

<font size="+0.5">Use GridSearch to find better parameters for the best linear model.</font>

In [None]:
def get_cv(train, test, n_folds=5, frac=0.8, random_state=None):
    """Build random (train, test) index splits usable as a ``cv`` iterable.

    For every fold, ``frac`` of the rows of ``train`` and ``frac`` of the rows
    of ``test`` are sampled independently (without replacement) and the index
    labels of the sampled rows are recorded.

    Parameters
    ----------
    train, test : pandas.DataFrame or pandas.Series
        Objects to sample from; only their index is used.
    n_folds : int, default 5
        Number of (train, test) pairs to generate.
    frac : float, default 0.8
        Fraction of rows drawn from each object per fold.
    random_state : int or None, default None
        Seed that makes the folds reproducible. With ``None`` the samples are
        drawn from pandas' default random source, as in the original code.

    Returns
    -------
    list of tuple(list, list)
        ``n_folds`` pairs ``(train_index_labels, test_index_labels)``.

    Notes
    -----
    NOTE(review): the lists contain index *labels* of two different objects.
    scikit-learn applies ``cv`` indices positionally to the single ``X`` passed
    to ``fit``, so the caller must make sure both lists address rows of that
    ``X`` - verify at the call sites.
    """
    # One shared generator, so that successive folds differ while the whole
    # sequence stays reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    return [
        (
            train.sample(frac=frac, random_state=rng).index.tolist(),
            test.sample(frac=frac, random_state=rng).index.tolist(),
        )
        for _ in range(n_folds)
    ]

In [None]:
# Candidate values of C and gamma for the class-weighted SVC.
param_svc = dict(
    C=np.arange(8, 12, 0.5),
    gamma=np.arange(1.5, 3.1, 0.1),
)

# NOTE(review): get_cv takes its second index list from x_val, while
# GridSearchCV applies cv indices positionally to the X given to fit()
# (x_train here). The held-out rows of each fold are therefore rows of
# x_train - confirm this is the intended evaluation.
svc_folds = get_cv(x_train, x_val, n_folds=4)

clf_grid = GridSearchCV(
    estimator=SVC(class_weight=class_weight, random_state=42),
    param_grid=param_svc,
    scoring=make_scorer(accuracy_score),
    cv=svc_folds,
    n_jobs=-1,
    verbose=1,
)
clf_grid.fit(x_train, y_train);

In [None]:
# Score the estimator that the grid search refitted with its best parameters.
best_svc_estimator = clf_grid.best_estimator_

grid_predict_train = best_svc_estimator.predict(x_train)
grid_predict_val = best_svc_estimator.predict(x_val)

# accuracy_score is symmetric in its two arguments; (y_true, y_pred) is the documented order.
print(f"Train accuracy: {accuracy_score(y_train, grid_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, grid_predict_val)}")

In [None]:
# Show the parameters of the tuned SVC. get_params must be *called*; without
# the parentheses the cell only displays the bound method object.
best_svc_estimator.get_params()

### <center> Non-linear models

<font size="+0.5">Select the unscaled features and the target for the training and validation sets.</font>

In [None]:
# Inputs for the tree-based models: the filtered (unscaled) accelerometer
# columns plus the gyroscope columns. This rebinds x_train / x_val, so the
# cells below no longer see the scaled feature set.
raw_feature_columns = [
    'x_accelerometer',
    'y_accelerometer',
    'z_accelerometer',
    'acceleration',
    'x_gyroscope',
    'y_gyroscope',
    'z_gyroscope',
]

x_train, y_train = train[raw_feature_columns], train['event_label_num'].values
x_val, y_val = val[raw_feature_columns], val['event_label_num'].values

In [None]:
# Gradient boosting baseline with default hyper-parameters. random_state is
# pinned, like for every other estimator in this notebook, so that a re-run
# reproduces the same model and scores.
clf_gbc = GradientBoostingClassifier(random_state=42)
clf_gbc.fit(x_train, y_train)

gbc_predict_val = clf_gbc.predict(x_val)
gbc_predict_train = clf_gbc.predict(x_train)

print(f"Train accuracy: {accuracy_score(y_train, gbc_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, gbc_predict_val)}")

In [None]:
# Class-weighted random forest with 100 trees.
clf_rfc = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight=class_weight,
).fit(x_train, y_train)

rfc_predict_train = clf_rfc.predict(x_train)
rfc_predict_val = clf_rfc.predict(x_val)

# accuracy_score is symmetric in its two arguments; (y_true, y_pred) is the documented order.
print(f"Train accuracy: {accuracy_score(y_train, rfc_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, rfc_predict_val)}")

In [None]:
# Class-weighted single decision tree.
clf_dtc = DecisionTreeClassifier(class_weight=class_weight, random_state=42).fit(x_train, y_train)

dtc_predict_train = clf_dtc.predict(x_train)
dtc_predict_val = clf_dtc.predict(x_val)

# accuracy_score is symmetric in its two arguments; (y_true, y_pred) is the documented order.
print(f"Train accuracy: {accuracy_score(y_train, dtc_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, dtc_predict_val)}")

<font size="+0.5">Use GridSearch to find better parameters for the best non-linear model.</font>

In [None]:
# Candidate forest sizes and tree depths for the class-weighted random forest.
param_forest = dict(
    n_estimators=np.arange(180, 220, 10),
    max_depth=np.arange(36, 46, 2),
)

# NOTE(review): get_cv takes its second index list from x_val, while
# GridSearchCV applies cv indices positionally to the X given to fit()
# (x_train here). The held-out rows of each fold are therefore rows of
# x_train - confirm this is the intended evaluation.
forest_folds = get_cv(x_train, x_val, n_folds=4)

# Both the forest and the grid search are configured with n_jobs=-1.
clf_grid = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, class_weight=class_weight, random_state=42),
    param_grid=param_forest,
    scoring=make_scorer(accuracy_score),
    cv=forest_folds,
    n_jobs=-1,
    verbose=1,
)
clf_grid.fit(x_train, y_train);

In [None]:
# Score the forest that the grid search refitted with its best parameters.
best_forest_estimator = clf_grid.best_estimator_

grid_predict_train = best_forest_estimator.predict(x_train)
grid_predict_val = best_forest_estimator.predict(x_val)

# accuracy_score is symmetric in its two arguments; (y_true, y_pred) is the documented order.
print(f"Train accuracy: {accuracy_score(y_train, grid_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, grid_predict_val)}")

<font size="+0.5">Finally, we decided to check how boosting performs on this task.</font>

In [None]:
# Settings of the LightGBM multiclass model, collected in one place and then
# unpacked into the constructor unchanged.
lgb_params = dict(
    n_estimators=800,
    seed=42,
    learning_rate=0.015,
    metric='multi_logloss',
    objective='multiclass',
    early_stopping=20,
)
clf_lgb = lgb.LGBMClassifier(**lgb_params)

In [None]:
# Train LightGBM with the validation split as evaluation set, without
# per-iteration logging. LightGBM 4.0 removed the `verbose` argument of fit();
# logging is controlled through the log_evaluation callback instead (period=0
# prints nothing). Releases that predate the callback still get `verbose=0`.
if hasattr(lgb, "log_evaluation"):
    lgb_fit_kwargs = {"callbacks": [lgb.log_evaluation(period=0)]}
else:
    lgb_fit_kwargs = {"verbose": 0}

clf_lgb.fit(X=x_train, y=y_train, eval_set=[(x_val, y_val)], **lgb_fit_kwargs)

lgb_predict_val = clf_lgb.predict(x_val)
lgb_predict_train = clf_lgb.predict(x_train)

print(f"Train accuracy: {accuracy_score(y_train, lgb_predict_train)}")
print(f"Validation accuracy: {accuracy_score(y_val, lgb_predict_val)}")

<font size="+0.5">Save best models.</font>

In [None]:
# Persist the LightGBM classifier as the non-linear model.
with open("models/non-linear-accelerometer-gyroscope.pcl", "wb") as model_file:
    pickle.dump(clf_lgb, model_file)

In [None]:
# Persist the SVC as the linear model.
# NOTE(review): this stores clf_svc (the SVC fitted without tuning), not
# best_svc_estimator from the grid search - confirm which one is meant.
with open("models/linear-accelerometer-gyroscope.pcl", "wb") as model_file:
    pickle.dump(clf_svc, model_file)