In [3]:
import pandas as pd

# Read the raw training table and the held-out test table from disk.
df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# One-hot encode the training frame (dropping the first level of each encoded
# column), then cast every column to float.
df_preprocessed = pd.get_dummies(df, drop_first=True, dtype=int).astype(float)

# Target vector, and the feature matrix without the target and the `id` column.
y = df_preprocessed['accident_risk']
X = df_preprocessed.drop(columns=['accident_risk', 'id'])

# Feature Engineering
I will train three base models on the full data set:
- LightGBM
- XGBoost
- RandomForest
Then I will use these models to check whether the score improves or not.

In [19]:
import pandas as pd
from sklearn.model_selection import KFold

In [16]:
# Training data: load, one-hot encode (first level of each encoded column is
# dropped) and convert the whole frame to float64.
df = pd.read_csv('data/train.csv')
df_preprocessed = pd.get_dummies(data=df, dtype=int, drop_first=True).astype('float64')

# Split into the feature matrix (target and `id` removed) and the target.
X = df_preprocessed.drop(columns=['accident_risk', 'id'])
y = df_preprocessed.loc[:, 'accident_risk']

# Test data is only loaded here; it is not encoded in this cell.
test_df = pd.read_csv('data/test.csv')

In [25]:
# Keyword arguments forwarded to LGBMRegressor.
lightgbm_parameters = dict(
    num_leaves=171,
    max_depth=74,
    learning_rate=0.017777738697293582,
    n_estimators=486,
)

# Keyword arguments forwarded to XGBRegressor ('device' selects CUDA execution).
xgboost_parameters = dict(
    max_leaves=209,
    max_depth=167,
    learning_rate=0.02215636350717474,
    n_estimators=348,
    device='cuda',
)

# Keyword arguments forwarded to RandomForestRegressor.
randomforest_parameters = dict(
    n_estimators=256,
    max_depth=23,
    min_samples_split=6,
    min_samples_leaf=2,
    max_features='log2',
)

In [26]:
def evaluate_model(model_obj, parameters, kf, X, y):
    """Return the mean out-of-fold RMSE of a model over the splits of ``kf``.

    For every ``(train_index, test_index)`` pair produced by ``kf.split(X)``
    a fresh ``model_obj(**parameters)`` is fitted on the training rows and
    scored on the held-out rows.

    Parameters
    ----------
    model_obj : callable
        Estimator class (or factory). The object it returns must provide
        ``fit(X, y)`` and ``predict(X)``.
    parameters : dict
        Keyword arguments passed to ``model_obj``.
    kf : object
        Splitter exposing ``split(X)`` that yields positional
        ``(train_index, test_index)`` pairs.
    X, y : objects supporting ``.iloc`` positional indexing
        Feature table and target values.

    Returns
    -------
    numpy.float64
        Arithmetic mean of the per-fold root-mean-squared errors.
    """
    # Local import: keeps the function usable even if the cell that imports
    # numpy at notebook level has not been executed yet.
    import numpy as np

    rmses = []
    for train_index, test_index in kf.split(X):
        model = model_obj(**parameters)
        model.fit(X.iloc[train_index], y.iloc[train_index])

        y_true = np.asarray(y.iloc[test_index], dtype=float)
        # Reshape so that an (n, 1) prediction cannot silently broadcast
        # against an (n,) target into an (n, n) matrix.
        y_pred = np.asarray(model.predict(X.iloc[test_index]), dtype=float).reshape(y_true.shape)

        # RMSE is computed directly with numpy; the previously used
        # `root_mean_squared_error` is not imported in any visible cell.
        rmses.append(np.sqrt(np.mean((y_true - y_pred) ** 2)))

    return np.mean(rmses)

In [27]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np 

# Fixed seed so the shuffled fold assignment, and therefore the reported
# scores, are the same on every Restart & Run All.
CV_SEED = 42
kf = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

# Estimator class and hyperparameters for each base model.
base_models = {
    'LGBM': (LGBMRegressor, lightgbm_parameters),
    'XGB': (XGBRegressor, xgboost_parameters),
    'RF': (RandomForestRegressor, randomforest_parameters),
}

# Mean 5-fold RMSE per base model. The seed is only a default: a
# `random_state` already present in a parameter dict takes precedence.
base_models_cv = {
    name: evaluate_model(model_cls, {'random_state': CV_SEED, **params}, kf, X, y)
    for name, (model_cls, params) in base_models.items()
}
base_models_cv

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 16
[LightGBM] [Info] Start training from score 0.352414
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006695 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 171
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 16
[LightGBM] [Info] Start training from score 0.352093
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003778 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

{'LGBM': np.float64(0.056037552103227695),
 'XGB': np.float64(0.05604319910312414),
 'RF': np.float64(0.057142978032406665)}

## Check out the data

In [38]:
def compute_base_risk(frame):
    """Return a hand-crafted linear risk score for every row of ``frame``.

    The score is the weighted sum of:
      * 0.3 * ``curvature``
      * 0.2 if ``lighting`` equals "night"
      * 0.1 if ``weather`` is anything other than "clear"
      * 0.2 if ``speed_limit`` is at least 60
      * 0.1 if ``num_reported_accidents`` is greater than 2

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``curvature``, ``lighting``, ``weather``,
        ``speed_limit`` and ``num_reported_accidents``.

    Returns
    -------
    pandas.Series
        One score per row, indexed like ``frame``.
    """
    return (
        0.3 * frame["curvature"]
        + 0.2 * (frame["lighting"] == "night").astype(int)
        + 0.1 * (frame["weather"] != "clear").astype(int)
        + 0.2 * (frame["speed_limit"] >= 60).astype(int)
        + 0.1 * (frame["num_reported_accidents"] > 2).astype(int)
    )


# Training data: add the score as the `Meta` feature, then one-hot encode
# and cast every column to float.
df = pd.read_csv('data/train.csv')
base_risk = compute_base_risk(df)
df['Meta'] = base_risk

df_preprocessed = pd.get_dummies(df, drop_first=True, dtype=int).astype(float)
X = df_preprocessed.drop(['accident_risk', 'id'], axis=1)
y = df_preprocessed['accident_risk']


# Test data: the same helper guarantees the formula matches the training one.
test_df = pd.read_csv('data/test.csv')
base_risk = compute_base_risk(test_df)
test_df['Meta'] = base_risk
# NOTE(review): train and test are one-hot encoded independently, so their
# dummy columns only line up if both files contain the same category levels
# (verify before predicting with a model fitted on X).
test_df_preprocessed = pd.get_dummies(test_df, drop_first=True, dtype=int).astype(float)

In [39]:
# Persist both preprocessed tables as CSV, without the index column.
for _frame, _csv_path in (
    (test_df_preprocessed, 'data/test_pp.csv'),
    (df_preprocessed, 'data/train_pp.csv'),
):
    _frame.to_csv(_csv_path, index=False)