# Build a random forest model

## Importing and exploration

In [None]:
# pip install -U scikit-learn scipy matplotlib
# pip install lime
# pip install shap
# pip install numpy==1.21.4
# pip install numba==0.53.0
# pip install librosa
# pip install imbalanced-learn


# Feature engineering and build models
# Proportion of each injured age group depending on class
# Get familiar with PSNI dataset

In [None]:
import pandas as pd
import numpy as np
from operator import itemgetter
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score,roc_auc_score, roc_curve
from sklearn import preprocessing
import seaborn as sns
from scipy.stats import chi2_contingency
import lime
from lime import lime_tabular
import shap

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [None]:
# Read the two input tables; in both files the first CSV column is used as the index.
ni_accidents = pd.read_csv(
    'ni_accidents_2021_wgoogle.csv',
    index_col=0,
    low_memory=False,  # parse the whole file in one pass instead of in chunks
)
updated_lats_longs = pd.read_csv(
    "updated_lat_longs.csv",
    index_col=0,
)

In [None]:
# Join the accident records to the updated lat/long lookup table on exact coordinates.
# A left join keeps every accident row even when no lookup entry matches.
updated_ni_accidents = pd.merge(
    ni_accidents,
    updated_lats_longs,
    how="left",
    left_on=["lat", "lon"],
    right_on=["lats", "longs"],
)

# Columns present in both inputs get the suffixes _x (accidents) and _y (lookup).
original_cols = ["address_x", "postcodes_x", "location_infos_x"]
lookup_cols = ["address_y", "postcodes_y", "location_infos_y"]

# Prefer the lookup value where one exists; otherwise keep the original value.
updated_ni_accidents[original_cols] = np.where(
    updated_ni_accidents[lookup_cols].isna(),
    updated_ni_accidents[original_cols],
    updated_ni_accidents[lookup_cols],
)

# Drop the lookup-side helper columns by name rather than by position ("last 5"),
# so the cell does not depend on the column order of the lookup CSV.
# NOTE(review): assumes the lookup file contributes exactly these five columns — confirm.
updated_ni_accidents = updated_ni_accidents.drop(columns=["lats", "longs"] + lookup_cols)

# Restore the un-suffixed column names.
updated_ni_accidents = updated_ni_accidents.rename(
    columns={
        "address_x": "address",
        "postcodes_x": "postcodes",
        "location_infos_x": "location_infos",
    }
)

In [None]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence.
ni_accidents_removed = updated_ni_accidents.drop_duplicates(keep="first")

In [None]:
# Drop columns that have too many missing values (more than 50% of rows are NaN).
nans = pd.DataFrame()
nans["num"] = ni_accidents_removed.isnull().sum()
nans["percent(%)"] = (nans["num"] / ni_accidents_removed.shape[0]) * 100
row_names = nans[nans["percent(%)"] > 50].index  # labels of the sparse columns

# Reassign instead of dropping in place: the frame comes from drop_duplicates(),
# and mutating it with inplace=True raises pandas' SettingWithCopyWarning.
ni_accidents_removed = ni_accidents_removed.drop(columns=row_names)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Show (rows, columns) of the frame after the sparse columns were dropped.
ni_accidents_removed.shape

(71647, 36)

In [None]:
# Preview the first rows of the de-duplicated, column-pruned frame.
ni_accidents_removed.head()

Unnamed: 0,a_year,a_ref,a_District,a_type,a_veh,a_cas,a_wkday,a_day,a_month,a_hour,...,c_sex,c_agegroup,c_sever,c_school,c_vtype,lat,lon,address,postcodes,location_infos
0,2016,1,"Newry, Mourne, Down",Slight injury collision,1,1,FRI,1,1,1,...,Female,17-24,Slightly injured,Other,Car,54.379157,-5.685504,"87 Killyleagh Rd, Downpatrick BT30 9UD, UK",BT30 9UD,"['street_address', 'establishment', 'point_of_..."
1,2016,2,"Derry City, Strabane",Slight injury collision,2,1,FRI,1,1,3,...,,,,,,55.019106,-7.294136,"Foyle Bridge, Londonderry BT48, UK",BT48,"['route', 'plus_code', 'premise', 'street_addr..."
3,2016,2,"Derry City, Strabane",Slight injury collision,2,1,FRI,1,1,3,...,Female,35-44,Slightly injured,Other,Car,55.019106,-7.294136,"Foyle Bridge, Londonderry BT48, UK",BT48,"['route', 'plus_code', 'premise', 'street_addr..."
5,2016,3,"Ards, North Down",Serious injury collision,1,1,FRI,1,1,3,...,Male,35-44,Seriously injured,Other,Car,54.476645,-5.692128,"28 Comber Rd, Killinchy, Newtownards BT23 6PB, UK",BT23 6PB,"['premise', 'street_address', 'establishment',..."
7,2016,4,Belfast City,Slight injury collision,2,2,FRI,1,1,3,...,,,,,,54.593451,-5.887517,"8/367 Beersbridge Rd, Belfast BT5 5EF, UK",BT5 5EF,"['premise', 'establishment', 'health', 'point_..."


## Deal with missing values

In [None]:
# Remove columns that are not carried into the rest of the analysis.
# (Run ni_accidents_removed.isnull().sum() to inspect the remaining missing counts.)
columns_to_discard = [
    "a_gd1", "a_gd2",
    "lat", "lon",
    "v_id", "c_id",
    "a_min",
    "c_school",
    "v_tow",
]
ni_accidents_removed = ni_accidents_removed.drop(columns=columns_to_discard)

In [None]:
# ni_accidents_removed["v_type"]

# ni_accidents_removed[ni_accidents_removed["v_tow"].isna()]

# ni_accidents_removed.isnull().sum()
# df_del_rows = df_del_rows[df_del_rows['maker'].notna()]
# ni_accidents_removed["v_sex"].mode()
# df_replace_mode["maker"].fillna(replace_with,inplace=True)

In [None]:
# Keep only the rows whose "v_type" value is present.
ni_accidents_removed = ni_accidents_removed.dropna(subset=["v_type"])


In [None]:
# Rows with a missing "v_man" are dropped rather than imputed
# (the alternative previously considered was filling with "Going ahead other").
ni_accidents_removed = ni_accidents_removed.dropna(subset=["v_man"])


In [None]:
# Rows with a missing "v_loc" are dropped rather than imputed
# (the alternative previously considered was filling with "On main road").
ni_accidents_removed = ni_accidents_removed.dropna(subset=["v_loc"])


In [None]:
# Rows with a missing "v_impact" are dropped rather than imputed
# (the alternative previously considered was filling with "Front").
ni_accidents_removed = ni_accidents_removed.dropna(subset=["v_impact"])


In [None]:
# Impute missing "v_sex" with a fixed category.
# NOTE(review): "Male" is presumably the most frequent value — confirm.
v_sex_present = ni_accidents_removed["v_sex"].notna()
ni_accidents_removed["v_sex"] = ni_accidents_removed["v_sex"].where(v_sex_present, "Male")

In [None]:
# Rows with a missing "v_agegroup" are dropped rather than imputed
# (the alternative previously considered was filling with "25-34").
ni_accidents_removed = ni_accidents_removed.dropna(subset=["v_agegroup"])


In [None]:
# Rows with a missing "v_hitr" are dropped rather than imputed
# (the alternative previously considered was filling with "Other").
ni_accidents_removed = ni_accidents_removed.dropna(subset=["v_hitr"])


In [None]:
# Impute missing "c_class" with a fixed category.
# NOTE(review): "Driver" is presumably the most frequent value — confirm.
c_class_missing = ni_accidents_removed["c_class"].isna()
ni_accidents_removed["c_class"] = ni_accidents_removed["c_class"].mask(c_class_missing, "Driver")

In [None]:
# Impute missing "c_sex" with a fixed category.
# NOTE(review): "Male" is presumably the most frequent value — confirm.
c_sex_present = ni_accidents_removed["c_sex"].notna()
ni_accidents_removed["c_sex"] = ni_accidents_removed["c_sex"].where(c_sex_present, "Male")

In [None]:
# Impute missing "c_agegroup" with a fixed category.
# NOTE(review): "25-34" is presumably the most frequent value — confirm.
c_agegroup_missing = ni_accidents_removed["c_agegroup"].isna()
ni_accidents_removed["c_agegroup"] = ni_accidents_removed["c_agegroup"].mask(c_agegroup_missing, "25-34")

In [None]:
# Where "c_sever" is missing, take the value of "a_type" from the same row.
# NOTE(review): the preview above shows different label sets for the two columns
# (e.g. "Slightly injured" vs "Slight injury collision"), so the filled rows carry
# a_type-style labels — confirm they are harmonised downstream.
c_sever_missing = ni_accidents_removed["c_sever"].isna()
ni_accidents_removed["c_sever"] = ni_accidents_removed["c_sever"].mask(
    c_sever_missing, ni_accidents_removed["a_type"]
)

In [None]:
# Impute missing "c_vtype" with a fixed category.
# NOTE(review): "Car" is presumably the most frequent value — confirm.
c_vtype_present = ni_accidents_removed["c_vtype"].notna()
ni_accidents_removed["c_vtype"] = ni_accidents_removed["c_vtype"].where(c_vtype_present, "Car")

In [None]:
# ni_accidents_removed["c_sever"].head()


## Anomalies

In [None]:
# Recode the anomalous "4" entries in "v_sex" as "Male".
# Assign the result back to the column: calling replace(..., inplace=True) on a
# column selection is chained in-place mutation, which pandas deprecates and which
# does not update the parent frame under Copy-on-Write.
ni_accidents_removed["v_sex"] = ni_accidents_removed["v_sex"].replace("4", "Male")

In [None]:
# Recode the anomalous "4" entries in "c_sex" as "Male".
# Assign the result back to the column: calling replace(..., inplace=True) on a
# column selection is chained in-place mutation, which pandas deprecates and which
# does not update the parent frame under Copy-on-Write.
ni_accidents_removed["c_sex"] = ni_accidents_removed["c_sex"].replace("4", "Male")

## Chi-Square Test

In [None]:
# Work on an independent deep copy so the chi-square preparation leaves the cleaned frame untouched.
chi_sq_df = ni_accidents_removed.copy(deep=True)

In [None]:
# Remove the columns that are not tested for association with the response.
# errors="ignore" is required for a top-to-bottom run: a_gd1, a_gd2, lat, lon, v_id,
# c_id and a_min were already dropped from ni_accidents_removed in the missing-value
# section, so a strict drop raises KeyError here.
chi_sq_excluded_columns = [
    "a_type", "general_injury_type", "address",
    "a_gd1", "a_gd2", "lat", "lon", "postcodes",
    "v_id", "c_id", "a_min", "c_sever",
    "location_infos", "a_year", "a_ref",
]
chi_sq_df = chi_sq_df.drop(columns=chi_sq_excluded_columns, errors="ignore")

In [None]:
# @title Chi-square test of independence helper
def chi_square(column_name, response_variable = "casualty_injury_type", dataset = None, alpha = 0.05):
    """Run a chi-square test of independence between two columns and print the verdict.

    A contingency table of ``dataset[column_name]`` against
    ``dataset[response_variable]`` is built with ``pd.crosstab`` and passed to
    ``scipy.stats.chi2_contingency``. The p-value is printed, followed by a
    message saying whether H0 (independence) is rejected at level ``alpha``.

    Parameters
    ----------
    column_name : str
        Column of ``dataset`` to test against the response.
    response_variable : str, default "casualty_injury_type"
        Column of ``dataset`` holding the response categories.
    dataset : pandas.DataFrame or None, default None
        Frame holding both columns. When None, the notebook-level ``chi_sq_df``
        is looked up at call time, so re-running the cells that rebuild
        ``chi_sq_df`` is picked up without redefining this function.
    alpha : float, default 0.05
        Significance level; H0 is rejected when ``p <= alpha``.

    Returns
    -------
    None
        The results are printed only.
    """
    if dataset is None:
        # Resolve the default late: a default bound in the signature would capture
        # whatever chi_sq_df was when this cell ran and silently go stale.
        dataset = chi_sq_df

    cross_tab = pd.crosstab(index = dataset[column_name],
                            columns = dataset[response_variable])

    # Only the p-value is used; the statistic, degrees of freedom and expected
    # frequencies are discarded.
    _, p, _, _ = chi2_contingency(cross_tab.to_numpy())

    # interpret p-value
    print(f"p value is {p}")
    if p <= alpha:
        print('H0 is rejected in favour of HA, hence casualty injury severity is dependent of it ')
    else:
        print('We are unable to reject H0 in favour of HA, hence casualty injury severity is independent of it ')


In [None]:

# Test every column of chi_sq_df except the final eight against the default response.
# NOTE(review): the -8 cut-off is positional; confirm which trailing columns it is meant to skip.
for var in chi_sq_df.columns[:-8].tolist():
    header = f"Chi-square test on column {var}: "
    print(header)
    chi_square(var)
    print("____________________________________\n")


Chi-square test on column a_District: 
p value is 1.0787555592508101e-116
H0 is rejected in favour of HA, hence casualty injury severity is dependent of it 
____________________________________

Chi-square test on column a_veh: 
p value is 0.0
H0 is rejected in favour of HA, hence casualty injury severity is dependent of it 
____________________________________

Chi-square test on column a_cas: 
p value is 2.9720198855753343e-17
H0 is rejected in favour of HA, hence casualty injury severity is dependent of it 
____________________________________

Chi-square test on column a_wkday: 
p value is 1.1897245838373144e-38
H0 is rejected in favour of HA, hence casualty injury severity is dependent of it 
____________________________________

Chi-square test on column a_day: 
p value is 0.0052303917182350505
H0 is rejected in favour of HA, hence casualty injury severity is dependent of it 
____________________________________

Chi-square test on column a_month: 
p value is 0.001276682628832742