In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    found = [os.path.join(root, name) for name in names]
    if found:
        print('\n'.join(found))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# Let pandas render up to 500 rows and 500 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and the model libraries.
warnings.filterwarnings('ignore')

In [None]:
# Competition data directory; both CSVs are read with the crash timestamp parsed as datetime.
path = "../input/Foml-2021/"
date_columns = ['Crash Date/Time']
train = pd.read_csv(os.path.join(path, "train.csv"), parse_dates=date_columns)
test = pd.read_csv(os.path.join(path, "test.csv"), parse_dates=date_columns)

# EDA

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Per-column dtype and non-null count for the raw training frame.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
train.info(verbose=True, show_counts=True)

In [None]:
# Number of training rows that are exact duplicates of an earlier row.
train.duplicated().sum()

In [None]:
# Percentage of missing values in each training column.
train.isna().mean() * 100

In [None]:
# Remove the same six columns from both frames.
# NOTE(review): presumably chosen from the missing-value percentages above — confirm.
first_drop_cols = [
    'Off-Road Description',
    'Municipality',
    'Related Non-Motorist',
    'Non-Motorist Substance Abuse',
    'Circumstance',
    'Equipment Problems',
]
train = train.drop(columns=first_drop_cols)
test = test.drop(columns=first_drop_cols)

In [None]:
# Preview the training frame after the first group of columns was dropped.
train.head()

In [None]:
# Column names remaining after the first drop.
train.columns

In [None]:
# Remove a second group of columns (report/person/vehicle identifiers, names and
# location fields) from both frames so train and test keep the same layout.
second_drop_cols = [
    'Report Number',
    'Local Case Number',
    'Agency Name',
    'Road Name',
    'Cross-Street Name',
    'Person ID',
    'Vehicle ID',
    'Vehicle Model',
    'Latitude',
    'Longitude',
    'Vehicle Make',
    'Location',
]
train = train.drop(columns=second_drop_cols)
test = test.drop(columns=second_drop_cols)

In [None]:
# Preview the training frame after the second group of columns was dropped.
train.head()

In [None]:
# Column names remaining after the second drop.
train.columns

In [None]:
def add_dateparts(df, col):
    """Add calendar components of a datetime column to ``df`` in place.

    Creates (or overwrites) the columns ``year``, ``month``, ``day``,
    ``weekday`` and ``hour`` from ``df[col]`` using the ``.dt`` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated and nothing is returned.
    col : str
        Name of the column to read; it must support the ``.dt`` accessor.
    """
    for part in ('year', 'month', 'day', 'weekday', 'hour'):
        df[part] = getattr(df[col].dt, part)

In [None]:
# Expand the crash timestamp into year/month/day/weekday/hour columns on both
# frames, then drop the original timestamp column.
crash_time_col = 'Crash Date/Time'
for frame in (train, test):
    add_dateparts(frame, crash_time_col)
train = train.drop(columns=[crash_time_col])
test = test.drop(columns=[crash_time_col])

In [None]:
# Column names after the date-part columns were added and the timestamp dropped.
train.columns

In [None]:
# Preview the training frame with the new date-part columns.
train.head()

In [None]:
# Columns whose value distributions are printed in the next cell.
cat_cols = [
    'ACRS Report Type',
    'Route Type',
    'Cross-Street Type',
    'Collision Type',
    'Weather',
    'Surface Condition',
    'Light',
    'Traffic Control',
    'Driver Substance Abuse',
    'Injury Severity',
    'Drivers License State',
    'Vehicle Damage Extent',
    'Vehicle First Impact Location',
    'Vehicle Second Impact Location',
    'Vehicle Body Type',
    'Vehicle Movement',
    'Vehicle Continuing Dir',
    'Vehicle Going Dir',
    'Driverless Vehicle',
    'Parked Vehicle',
]

In [None]:
# Print the frequency table of every listed column, separated by blank lines.
for column_name in cat_cols:
    frequencies = train[column_name].value_counts()
    print(f"{frequencies} \n")

In [None]:
# Dtypes and non-null counts before imputation.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
train.info(verbose=True, show_counts=True)

In [None]:
# Fill missing numeric values with the per-column median of the TRAINING data.
# - `numeric_only=True` is required: the frames still contain object columns here,
#   and recent pandas raises when asked for the median of a non-numeric column.
# - The test frame is filled with the training medians as well, so both frames are
#   imputed with the same values and no test-set statistic is used.
# Object columns are left untouched; medians for columns absent from a frame are ignored.
train_medians = train.median(numeric_only=True)
train = train.fillna(train_medians)
test = test.fillna(train_medians)

In [None]:
# Dtypes and non-null counts after the median fill.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
train.info(verbose=True, show_counts=True)

In [None]:
# Names of the training columns stored with object dtype.
object_frame = train.select_dtypes(include=['object'])
categorical_columns = object_frame.columns

In [None]:
# Display the object-dtype column names collected above.
categorical_columns

In [None]:
# Convert every object column to a categorical dtype.
#
# The category set of each column is built from train and test TOGETHER. Converting
# the two frames independently gives each its own category list, so the integer
# codes taken from them later could map the same string to different numbers in
# train and test.
train_objects = train.select_dtypes(['object'])
test_objects = test.select_dtypes(['object'])

shared_dtypes = {}
for col in train_objects.columns.intersection(test_objects.columns):
    pooled = pd.concat([train_objects[col], test_objects[col]], ignore_index=True)
    shared_dtypes[col] = pd.CategoricalDtype(pooled.astype('category').cat.categories)

# Column order is kept as before: non-object columns first, then the converted
# object columns. Columns present in only one frame fall back to a plain
# per-frame 'category' conversion.
train = pd.concat(
    [
        train.select_dtypes([], ['object']),
        train_objects.astype({col: shared_dtypes.get(col, 'category') for col in train_objects.columns}),
    ],
    axis=1,
)
test = pd.concat(
    [
        test.select_dtypes([], ['object']),
        test_objects.astype({col: shared_dtypes.get(col, 'category') for col in test_objects.columns}),
    ],
    axis=1,
)

In [None]:
def _category_codes(series):
    """Return the integer codes of a categorical Series."""
    return series.cat.codes


# Replace each categorical column with its integer codes in both frames.
for frame in (train, test):
    frame[categorical_columns] = frame[categorical_columns].apply(_category_codes)


In [None]:
# Dtypes and non-null counts after the categorical columns were replaced by codes.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
train.info(verbose=True, show_counts=True)

In [None]:
# Preview the fully encoded training frame.
train.head()

In [None]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or infinite values.

    The input frame is not modified: rows are filtered on a copy, so calling
    this function never changes the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Every remaining value must be convertible to float64.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with no NaN, ``inf`` or ``-inf`` entry, cast to
        ``numpy.float64``. The original index labels are kept.

    Raises
    ------
    AssertionError
        If ``df`` is not a DataFrame.
    """
    assert isinstance(df, pd.DataFrame)
    # dropna() returns a new frame; the caller's frame is left untouched.
    complete_rows = df.dropna()
    # NaN rows are already gone, so only the infinities remain to be screened.
    # `axis` is passed by keyword: the positional form was removed in pandas 2.0.
    has_infinite = complete_rows.isin([np.inf, -np.inf]).any(axis=1)
    return complete_rows[~has_infinite].astype(np.float64)

In [None]:
# Final check of dtypes and non-null counts before building the model inputs.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
train.info(verbose=True, show_counts=True)

# Final Dataset Prep

In [None]:
# Separate the label column from the feature columns.
target_col = 'Fault'
x_train = train.drop(columns=[target_col])
y_train = train[target_col]

In [None]:
# Column names of the test frame, shown before the 'Id' column is split off.
test.columns

In [None]:
# Keep the row identifiers for the submission files, then remove them from the
# feature frame so they are not passed to the models.
id_col = 'Id'
test_idx = test[id_col].to_numpy().flatten()
test = test.drop(columns=[id_col])

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest on the training features and predict the test labels.
# A fixed `random_state` makes the forest, and therefore the submission file and
# the ensembles built from `rfc_preds`, reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)
rfc_preds = clf.predict(test)

In [None]:
# Pair each test Id with its random-forest prediction and write the submission CSV.
sub = pd.DataFrame(
    np.column_stack((test_idx, rfc_preds)),
    columns=['Id', 'Fault'],
).astype({'Id': 'Int32', 'Fault': 'Int64'})
sub.to_csv('RandomForestClassifier.csv', index=False)

# XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Train an XGBoost classifier (classification error as eval metric) and predict the test labels.
xgb_settings = {'use_label_encoder': False, 'eval_metric': 'error'}
clf = XGBClassifier(**xgb_settings).fit(x_train, y_train)
xgb_preds = clf.predict(test)

In [None]:
# Pair each test Id with its XGBoost prediction and write the submission CSV.
sub = pd.DataFrame(
    np.column_stack((test_idx, xgb_preds)),
    columns=['Id', 'Fault'],
).astype({'Id': 'Int32', 'Fault': 'Int64'})
sub.to_csv('XGB.csv', index=False)

# LGBM

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Train a default-configured LightGBM classifier and predict the test labels.
clf = LGBMClassifier().fit(x_train, y_train)
lgbm_preds = clf.predict(test)

In [None]:
# Pair each test Id with its LightGBM prediction and write the submission CSV.
sub = pd.DataFrame(
    np.column_stack((test_idx, lgbm_preds)),
    columns=['Id', 'Fault'],
).astype({'Id': 'Int32', 'Fault': 'Int64'})
sub.to_csv('LGBM.csv', index=False)

# CatBoost

In [None]:
# Positional indices of the categorical feature columns, as expected by CatBoost.
# They are looked up from `categorical_columns` (the columns that were
# label-encoded earlier) instead of assuming the categorical block starts at a
# fixed offset, so the list stays correct if the numeric column count changes.
cat_features = [
    x_train.columns.get_loc(col)
    for col in categorical_columns
    if col in x_train.columns
]
cat_features

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Default-configured CatBoost classifier; rebinds `clf`, which previously held the LightGBM model.
clf = CatBoostClassifier()

In [None]:
# Train CatBoost, passing the column positions in `cat_features` as categorical features.
clf.fit(x_train, y_train, cat_features=cat_features)

In [None]:
# CatBoost predictions for the test features; used by the CatBoost submission and both ensembles.
cat_preds = clf.predict(test)

In [None]:
from catboost import Pool, cv

In [None]:
# Wrap the training data in a CatBoost Pool for cross-validation.
pool_kwargs = {'data': x_train, 'label': y_train, 'cat_features': cat_features}
cv_dataset = Pool(**pool_kwargs)

In [None]:
# CatBoost settings used for the cross-validation run below.
params = dict(
    iterations=100,
    depth=16,
    loss_function="Logloss",
    verbose=False,
)

In [None]:
# 10-fold cross-validation of the CatBoost settings in `params`.
# `plot` is a boolean flag; the original passed the string "True", which only
# worked because a non-empty string is truthy.
scores = cv(cv_dataset,
            params,
            fold_count=10,
            plot=True)

In [None]:
# Pair each test Id with its CatBoost prediction and write the submission CSV.
sub = pd.DataFrame(
    np.column_stack((test_idx, cat_preds)),
    columns=['Id', 'Fault'],
).astype({'Id': 'Int32', 'Fault': 'Int64'})
sub.to_csv('CATBoost.csv', index=False)

# Ensemble

In [None]:
# Weighted vote of the four models. Each weight is a per-model constant divided by
# `models`, which equals the sum of the four constants, so the weights add up to 1.
# NOTE(review): the constants look like per-model validation/leaderboard scores — confirm.
models = 3.47158
weighted_votes = [
    (0.85744 / models) * rfc_preds,
    (0.86904 / models) * lgbm_preds,
    (0.86896 / models) * xgb_preds,
    (0.87614 / models) * cat_preds,
]
ensembled_prediction = weighted_votes[0] + weighted_votes[1] + weighted_votes[2] + weighted_votes[3]

In [None]:
# Threshold the blended score at 0.5 to get a 0/1 label, then write the submission CSV.
sub = pd.DataFrame(
    np.column_stack((test_idx, ensembled_prediction)),
    columns=['Id', 'Fault'],
)
at_or_above_cut = sub['Fault'] >= 0.5
below_cut = sub['Fault'] < 0.5
sub.loc[at_or_above_cut, 'Fault'] = 1
sub.loc[below_cut, 'Fault'] = 0
sub = sub.astype({'Id': 'Int32', 'Fault': 'Int64'})
sub.to_csv('ensemble_score_weights.csv', index=False)

In [None]:
# Plain average of the four models' predictions (each model gets weight 1/4).
models = 4
weights = 1/models
member_preds = (rfc_preds, lgbm_preds, xgb_preds, cat_preds)
ensembled_prediction = weights * member_preds[0]
for preds in member_preds[1:]:
    ensembled_prediction = ensembled_prediction + weights * preds

In [None]:
# Threshold the averaged score at 0.5 to get a 0/1 label, then write the submission CSV.
sub = pd.DataFrame(
    np.column_stack((test_idx, ensembled_prediction)),
    columns=['Id', 'Fault'],
)
at_or_above_cut = sub['Fault'] >= 0.5
below_cut = sub['Fault'] < 0.5
sub.loc[at_or_above_cut, 'Fault'] = 1
sub.loc[below_cut, 'Fault'] = 0
sub = sub.astype({'Id': 'Int32', 'Fault': 'Int64'})
sub.to_csv('ensemble_equal_weights.csv', index=False)