In [1]:
#dependencies and load data
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree

#read in cleaned data
crashes_df = pd.read_csv("..\Resources\crashes_cleaned_df.csv")
crashes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56414 entries, 0 to 56413
Columns: 159 entries, Unnamed: 0 to STAT_DIV_NAME_Metro
dtypes: float64(7), int64(151), object(1)
memory usage: 68.4+ MB


In [2]:
# Generate summary statistics
crashes_df = crashes_df.drop(columns=["ACCIDENT_DATE"])
crashes_df.describe()

Unnamed: 0.1,Unnamed: 0,ACCIDENT_TIME,LONGITUDE,LATITUDE,TOTAL_PERSONS,INJ_OR_FATAL,FATALITY,SERIOUSINJURY,OTHERINJURY,NONINJURED,...,"RMA_ALL_Arterial Other,Arterial Highway","RMA_ALL_Arterial Other,Local Road",RMA_ALL_Freeway,"RMA_ALL_Freeway,Arterial Other",RMA_ALL_Local Road,"RMA_ALL_Local Road,Arterial Highway","RMA_ALL_Local Road,Arterial Other",RMA_ALL_Other,STAT_DIV_NAME_Country,STAT_DIV_NAME_Metro
count,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,...,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0,56414.0
mean,30299.422909,13.24033,144.743484,-37.659962,2.399209,1.320647,0.021236,0.46726,0.832152,1.02588,...,0.013986,0.134807,0.064913,0.007055,0.355089,0.048871,0.026235,0.014979,0.29202,0.70798
std,17515.883746,5.115048,5.785226,1.486344,1.505209,0.781965,0.152419,0.660557,0.793272,1.273803,...,0.117433,0.34152,0.246374,0.083698,0.478545,0.2156,0.159834,0.121468,0.454695,0.454695
min,0.0,0.0,-1.0,-39.023993,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15102.25,9.0,144.840394,-37.957273,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,30313.5,14.0,145.01338,-37.81612,2.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,45432.75,17.0,145.20511,-37.695981,3.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,60688.0,23.0,149.757513,-1.0,89.0,27.0,4.0,16.0,25.0,87.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
#print column names for ease of copy pasting them
columns_list = crashes_df.columns.to_list()
print(columns_list)

['Unnamed: 0', 'ACCIDENT_TIME', 'LONGITUDE', 'LATITUDE', 'TOTAL_PERSONS', 'INJ_OR_FATAL', 'FATALITY', 'SERIOUSINJURY', 'OTHERINJURY', 'NONINJURED', 'MALES', 'FEMALES', 'BICYCLIST', 'PASSENGER', 'DRIVER', 'PEDESTRIAN', 'PILLION', 'MOTORIST', 'PED_CYCLIST_5_12', 'PED_CYCLIST_13_18', 'OLD_PEDESTRIAN', 'OLD_DRIVER', 'YOUNG_DRIVER', 'UNLICENCSED', 'NO_OF_VEHICLES', 'HEAVYVEHICLE', 'PASSENGERVEHICLE', 'MOTORCYCLE', 'PUBLICVEHICLE', 'ABS_CODE_ABS to receive accident', 'ABS_CODE_Heart attk/suicide/death by nat.causes', 'ABS_CODE_Non ABS accident', 'ACCIDENT_STATUS_Discarded', 'ACCIDENT_STATUS_Finished', 'ACCIDENT_STATUS_Private Property', 'ACCIDENT_STATUS_Reopened', 'ACCIDENT_STATUS_Unfinished', 'ALCOHOLTIME_No', 'ALCOHOLTIME_Yes', 'ACCIDENT_TYPE_Collision with a fixed object', 'ACCIDENT_TYPE_Collision with vehicle', 'ACCIDENT_TYPE_Fall from or in moving vehicle', 'ACCIDENT_TYPE_No collision and no object struck', 'ACCIDENT_TYPE_Other accident', 'ACCIDENT_TYPE_Struck Pedestrian', 'ACCIDENT_TYP

In [4]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
crashes_scaled = StandardScaler().fit_transform(crashes_df[columns_list])
crashes_scaled[:5]

array([[-1.72984086e+00, -1.61101201e+00, -6.48307706e-02,
        -3.48110091e-01, -2.65220885e-01, -4.10057003e-01,
        -1.39326912e-01, -7.07378381e-01,  2.11591722e-01,
        -2.03173788e-02, -1.32530372e+00,  4.79756160e-02,
        -3.17219980e-01,  4.21462775e-01, -5.56969704e-01,
        -2.91197474e-01, -7.07801249e-02, -3.95176742e-01,
        -8.74037061e-02, -1.21623684e-01, -1.36644733e-01,
        -2.66422934e-01, -5.83354018e-01, -1.88135603e-01,
        -1.09042579e+00, -2.12995929e-01, -5.05114725e-01,
        -3.95565878e-01, -1.12596975e-01,  7.70573806e-02,
        -5.44889644e-02, -5.43250959e-02, -1.57552198e-02,
         7.75215709e-02, -2.69684729e-02, -1.26317189e-02,
        -6.97334660e-02, -1.31456519e+00,  1.31456519e+00,
         2.30655332e+00, -1.34854961e+00, -7.75215709e-02,
        -2.01253954e-01, -2.85668663e-02, -3.07552539e-01,
        -9.96781372e-02, -2.16865023e-01, -9.88586690e-02,
        -3.72654673e-01, -4.00320714e-01, -4.12372179e-0

In [5]:
# Create a DataFrame with the scaled data
crashes_scaled_df=pd.DataFrame(crashes_scaled, columns=columns_list)
crashes_scaled_df.head()

Unnamed: 0.1,Unnamed: 0,ACCIDENT_TIME,LONGITUDE,LATITUDE,TOTAL_PERSONS,INJ_OR_FATAL,FATALITY,SERIOUSINJURY,OTHERINJURY,NONINJURED,...,"RMA_ALL_Arterial Other,Arterial Highway","RMA_ALL_Arterial Other,Local Road",RMA_ALL_Freeway,"RMA_ALL_Freeway,Arterial Other",RMA_ALL_Local Road,"RMA_ALL_Local Road,Arterial Highway","RMA_ALL_Local Road,Arterial Other",RMA_ALL_Other,STAT_DIV_NAME_Country,STAT_DIV_NAME_Metro
0,-1.729841,-1.611012,-0.064831,-0.34811,-0.265221,-0.410057,-0.139327,-0.707378,0.211592,-0.020317,...,-0.119098,-0.39473,-0.263475,-0.084292,1.347662,-0.226676,-0.164138,-0.123314,1.557057,-1.557057
1,-1.729784,-0.437992,0.058168,0.002631,-0.929586,-0.410057,-0.139327,-0.707378,0.211592,-0.805375,...,-0.119098,2.53338,-0.263475,-0.084292,-0.742026,-0.226676,-0.164138,-0.123314,-0.642237,0.642237
2,-1.729727,-0.046985,0.026094,-0.104986,-0.265221,-0.410057,-0.139327,-0.707378,0.211592,-0.020317,...,-0.119098,-0.39473,-0.263475,-0.084292,1.347662,-0.226676,-0.164138,-0.123314,-0.642237,0.642237
3,-1.729612,-0.633495,-0.257966,-0.239445,-0.929586,-0.410057,-0.139327,0.806508,-1.04902,-0.805375,...,-0.119098,-0.39473,-0.263475,-0.084292,-0.742026,-0.226676,-0.164138,-0.123314,1.557057,-1.557057
4,-1.729555,-1.024502,0.301375,-0.355898,0.399144,-0.410057,-0.139327,-0.707378,0.211592,0.764741,...,-0.119098,-0.39473,-0.263475,-0.084292,-0.742026,-0.226676,6.092419,-0.123314,1.557057,-1.557057


In [6]:
# Create a list with the number of k-values, reduced to 11 for speed. Have saved the graph from 87
k = list(range(1, 11))

In [7]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(crashes_scaled_df)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
found 0 physical cores < 1
  File "C:\Users\evang\anaconda3\envs\dev\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [8]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

In [9]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [11]:
#load in the original data, and drop unecessary columns and nan values
original_data = pd.read_csv("../Resources/Road_Crashes_for_five_Years_Victoria.csv")
original_data = original_data.drop(columns=["X", "Y", "ACCIDENT_NO", "SRNS", "SRNS_ALL", "DIVIDED", "DIVIDED_ALL", "UNKNOWN", "NODE_ID", "OBJECTID"])
original_data = original_data.dropna()
original_data = original_data.drop(original_data.loc[original_data['REGION_NAME'] == " "].index)

In [12]:
#split data into x and y ready for supervised modeling
X = crashes_df.drop(columns=['SEVERITY_Fatal accident', 'SEVERITY_Non injury accident', 'SEVERITY_Other injury accident', 'SEVERITY_Serious injury accident'])
y = original_data["SEVERITY"]

In [13]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
# Import the KNeighborsClassifier module from sklearn
# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

In [15]:
# Train the model using the training data
knn.fit(X_train, y_train)

In [16]:
# Create predictions using the testing data
y_pred = knn.predict(X_test)

In [17]:
# Print the classification report comparing the testing data to the model 
print(classification_report(y_test, y_pred))

                         precision    recall  f1-score   support

         Fatal accident       0.02      0.03      0.02       275
    Non injury accident       0.00      0.00      0.00         1
  Other injury accident       0.61      0.67      0.64      8213
Serious injury accident       0.45      0.38      0.41      5615

               accuracy                           0.54     14104
              macro avg       0.27      0.27      0.27     14104
           weighted avg       0.54      0.54      0.54     14104



In [18]:
#create a tree model
treem = tree.DecisionTreeClassifier()

In [19]:
# Fit the model
treem = treem.fit(X_train, y_train)

In [20]:
# Making predictions using the testing data
treem_predictions = treem.predict(X_test)

In [21]:
#print the classification report for the tree model
print(classification_report(y_test, treem_predictions))

                         precision    recall  f1-score   support

         Fatal accident       1.00      1.00      1.00       275
    Non injury accident       1.00      1.00      1.00         1
  Other injury accident       1.00      1.00      1.00      8213
Serious injury accident       1.00      1.00      1.00      5615

               accuracy                           1.00     14104
              macro avg       1.00      1.00      1.00     14104
           weighted avg       1.00      1.00      1.00     14104



In [22]:
#having a look at severity values- determine we need to not worry about non-injury accidents
original_data["SEVERITY"].value_counts()

SEVERITY
Other injury accident      32903
Serious injury accident    22371
Fatal accident              1138
Non injury accident            2
Name: count, dtype: int64

In [23]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [24]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train, y_train.ravel())

In [25]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test)

In [26]:
print(classification_report(y_test, rf_predictions))

  _warn_prf(average, modifier, msg_start, len(result))


                         precision    recall  f1-score   support

         Fatal accident       1.00      1.00      1.00       275
    Non injury accident       0.00      0.00      0.00         1
  Other injury accident       1.00      1.00      1.00      8213
Serious injury accident       1.00      1.00      1.00      5615

               accuracy                           1.00     14104
              macro avg       0.75      0.75      0.75     14104
           weighted avg       1.00      1.00      1.00     14104



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
#create a new reduced df with columns we care about more
reduced_crashes_df = crashes_df[['DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5',
                                 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'ACCIDENT_TYPE_Collision with a fixed object',
                                 'ACCIDENT_TYPE_Collision with vehicle', 'ACCIDENT_TYPE_Fall from or in moving vehicle',
                                 'ACCIDENT_TYPE_No collision and no object struck', 'ACCIDENT_TYPE_Other accident',
                                 'ACCIDENT_TYPE_Struck Pedestrian', 'ACCIDENT_TYPE_Struck animal',
                                 'ACCIDENT_TYPE_Vehicle overturned (no collision)',
                                 'ACCIDENT_TYPE_collision with some other object', 'LIGHT_CONDITION_Dark No street lights',
                                 'LIGHT_CONDITION_Dark Street lights off', 'LIGHT_CONDITION_Dark Street lights on',
                                 'LIGHT_CONDITION_Dark Street lights unknown', 'LIGHT_CONDITION_Day',
                                 'LIGHT_CONDITION_Dusk/Dawn', 'LIGHT_CONDITION_Unk.', 'ROAD_GEOMETRY_Cross intersection',
                                 'ROAD_GEOMETRY_Dead end', 'ROAD_GEOMETRY_Multiple intersection',
                                 'ROAD_GEOMETRY_Not at intersection', 'ROAD_GEOMETRY_Private property',
                                 'ROAD_GEOMETRY_Road closure', 'ROAD_GEOMETRY_T intersection', 'ROAD_GEOMETRY_Unknown',
                                 'ROAD_GEOMETRY_Y intersection','SPEED_ZONE_100 km/hr', 'SPEED_ZONE_110 km/hr',
                                 'SPEED_ZONE_40 km/hr', 'SPEED_ZONE_50 km/hr', 'SPEED_ZONE_60 km/hr', 'SPEED_ZONE_70 km/hr',
                                 'SPEED_ZONE_80 km/hr', 'SPEED_ZONE_90 km/hr', 'SPEED_ZONE_Camping grounds or off road',
                                 'SPEED_ZONE_Not known', 'SPEED_ZONE_Other speed limit', 'TOTAL_PERSONS', 'INJ_OR_FATAL',
                                 'MALES', 'FEMALES', 'UNLICENCSED', 'RMA_Arterial Highway', 'RMA_Arterial Other',
                                 'RMA_Freeway', 'RMA_Local Road', 'RMA_Non Arterial', 'RMA_ALL_Arterial Highway',
                                 'RMA_ALL_Arterial Highway,Arterial Other', 'RMA_ALL_Arterial Highway,Local Road',
                                 'RMA_ALL_Arterial Other', 'RMA_ALL_Arterial Other,Arterial Highway',
                                 'RMA_ALL_Arterial Other,Local Road', 'RMA_ALL_Freeway', 'RMA_ALL_Freeway,Arterial Other',
                                 'RMA_ALL_Local Road', 'RMA_ALL_Local Road,Arterial Highway',
                                 'RMA_ALL_Local Road,Arterial Other', 'RMA_ALL_Other', 'SEVERITY_Fatal accident',
                                 'SEVERITY_Non injury accident', 'SEVERITY_Other injury accident',
                                 'SEVERITY_Serious injury accident', 'REGION_NAME_ALL_EASTERN REGION,EASTERN REGION',
                                 'REGION_NAME_ALL_METROPOLITAN NORTH WEST REGION',
                                 'REGION_NAME_ALL_METROPOLITAN SOUTH EAST REGION', 'REGION_NAME_ALL_NORTH EASTERN REGION',
                                 'REGION_NAME_ALL_NORTHERN REGION', 'REGION_NAME_ALL_Other',
                                 'REGION_NAME_ALL_SOUTH WESTERN REGION', 'REGION_NAME_ALL_WESTERN REGION', 
                                 'STAT_DIV_NAME_Country', 'STAT_DIV_NAME_Metro']]
reduced_crashes_df.head()

Unnamed: 0,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ACCIDENT_TYPE_Collision with a fixed object,ACCIDENT_TYPE_Collision with vehicle,ACCIDENT_TYPE_Fall from or in moving vehicle,...,"REGION_NAME_ALL_EASTERN REGION,EASTERN REGION",REGION_NAME_ALL_METROPOLITAN NORTH WEST REGION,REGION_NAME_ALL_METROPOLITAN SOUTH EAST REGION,REGION_NAME_ALL_NORTH EASTERN REGION,REGION_NAME_ALL_NORTHERN REGION,REGION_NAME_ALL_Other,REGION_NAME_ALL_SOUTH WESTERN REGION,REGION_NAME_ALL_WESTERN REGION,STAT_DIV_NAME_Country,STAT_DIV_NAME_Metro
0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,0,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0


### Why did we chose these columns? 
We chose Day of Week, Accident Type, Light Condidition, Road Geometry, Speed zone, males, females, unlicensed, rma, severity and region for our reduced data. These were chosen using our common sense, as well as using our knowledge of correlation from our previous analysis. We wanted to see how acurate the model could be with reduced geographical input. 

In [28]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
red_scaler = StandardScaler().fit(reduced_crashes_df[['DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5',
                                 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'ACCIDENT_TYPE_Collision with a fixed object',
                                 'ACCIDENT_TYPE_Collision with vehicle', 'ACCIDENT_TYPE_Fall from or in moving vehicle',
                                 'ACCIDENT_TYPE_No collision and no object struck', 'ACCIDENT_TYPE_Other accident',
                                 'ACCIDENT_TYPE_Struck Pedestrian', 'ACCIDENT_TYPE_Struck animal',
                                 'ACCIDENT_TYPE_Vehicle overturned (no collision)',
                                 'ACCIDENT_TYPE_collision with some other object', 'LIGHT_CONDITION_Dark No street lights',
                                 'LIGHT_CONDITION_Dark Street lights off', 'LIGHT_CONDITION_Dark Street lights on',
                                 'LIGHT_CONDITION_Dark Street lights unknown', 'LIGHT_CONDITION_Day',
                                 'LIGHT_CONDITION_Dusk/Dawn', 'LIGHT_CONDITION_Unk.', 'ROAD_GEOMETRY_Cross intersection',
                                 'ROAD_GEOMETRY_Dead end', 'ROAD_GEOMETRY_Multiple intersection',
                                 'ROAD_GEOMETRY_Not at intersection', 'ROAD_GEOMETRY_Private property',
                                 'ROAD_GEOMETRY_Road closure', 'ROAD_GEOMETRY_T intersection', 'ROAD_GEOMETRY_Unknown',
                                 'ROAD_GEOMETRY_Y intersection','SPEED_ZONE_100 km/hr', 'SPEED_ZONE_110 km/hr',
                                 'SPEED_ZONE_40 km/hr', 'SPEED_ZONE_50 km/hr', 'SPEED_ZONE_60 km/hr', 'SPEED_ZONE_70 km/hr',
                                 'SPEED_ZONE_80 km/hr', 'SPEED_ZONE_90 km/hr', 'SPEED_ZONE_Camping grounds or off road',
                                 'SPEED_ZONE_Not known', 'SPEED_ZONE_Other speed limit', 'TOTAL_PERSONS', 'INJ_OR_FATAL',
                                 'MALES', 'FEMALES', 'UNLICENCSED', 'RMA_Arterial Highway', 'RMA_Arterial Other',
                                 'RMA_Freeway', 'RMA_Local Road', 'RMA_Non Arterial', 'RMA_ALL_Arterial Highway',
                                 'RMA_ALL_Arterial Highway,Arterial Other', 'RMA_ALL_Arterial Highway,Local Road',
                                 'RMA_ALL_Arterial Other', 'RMA_ALL_Arterial Other,Arterial Highway',
                                 'RMA_ALL_Arterial Other,Local Road', 'RMA_ALL_Freeway', 'RMA_ALL_Freeway,Arterial Other',
                                 'RMA_ALL_Local Road', 'RMA_ALL_Local Road,Arterial Highway',
                                 'RMA_ALL_Local Road,Arterial Other', 'RMA_ALL_Other', 'SEVERITY_Fatal accident',
                                 'SEVERITY_Non injury accident', 'SEVERITY_Other injury accident',
                                 'SEVERITY_Serious injury accident', 'REGION_NAME_ALL_EASTERN REGION,EASTERN REGION',
                                 'REGION_NAME_ALL_METROPOLITAN NORTH WEST REGION',
                                 'REGION_NAME_ALL_METROPOLITAN SOUTH EAST REGION', 'REGION_NAME_ALL_NORTH EASTERN REGION',
                                 'REGION_NAME_ALL_NORTHERN REGION', 'REGION_NAME_ALL_Other',
                                 'REGION_NAME_ALL_SOUTH WESTERN REGION', 'REGION_NAME_ALL_WESTERN REGION', 
                                 'STAT_DIV_NAME_Country', 'STAT_DIV_NAME_Metro']])
reduced_crashes_scaled = StandardScaler().fit_transform(reduced_crashes_df[['DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5',
                                 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'ACCIDENT_TYPE_Collision with a fixed object',
                                 'ACCIDENT_TYPE_Collision with vehicle', 'ACCIDENT_TYPE_Fall from or in moving vehicle',
                                 'ACCIDENT_TYPE_No collision and no object struck', 'ACCIDENT_TYPE_Other accident',
                                 'ACCIDENT_TYPE_Struck Pedestrian', 'ACCIDENT_TYPE_Struck animal',
                                 'ACCIDENT_TYPE_Vehicle overturned (no collision)',
                                 'ACCIDENT_TYPE_collision with some other object', 'LIGHT_CONDITION_Dark No street lights',
                                 'LIGHT_CONDITION_Dark Street lights off', 'LIGHT_CONDITION_Dark Street lights on',
                                 'LIGHT_CONDITION_Dark Street lights unknown', 'LIGHT_CONDITION_Day',
                                 'LIGHT_CONDITION_Dusk/Dawn', 'LIGHT_CONDITION_Unk.', 'ROAD_GEOMETRY_Cross intersection',
                                 'ROAD_GEOMETRY_Dead end', 'ROAD_GEOMETRY_Multiple intersection',
                                 'ROAD_GEOMETRY_Not at intersection', 'ROAD_GEOMETRY_Private property',
                                 'ROAD_GEOMETRY_Road closure', 'ROAD_GEOMETRY_T intersection', 'ROAD_GEOMETRY_Unknown',
                                 'ROAD_GEOMETRY_Y intersection','SPEED_ZONE_100 km/hr', 'SPEED_ZONE_110 km/hr',
                                 'SPEED_ZONE_40 km/hr', 'SPEED_ZONE_50 km/hr', 'SPEED_ZONE_60 km/hr', 'SPEED_ZONE_70 km/hr',
                                 'SPEED_ZONE_80 km/hr', 'SPEED_ZONE_90 km/hr', 'SPEED_ZONE_Camping grounds or off road',
                                 'SPEED_ZONE_Not known', 'SPEED_ZONE_Other speed limit', 'TOTAL_PERSONS', 'INJ_OR_FATAL',
                                 'MALES', 'FEMALES', 'UNLICENCSED', 'RMA_Arterial Highway', 'RMA_Arterial Other',
                                 'RMA_Freeway', 'RMA_Local Road', 'RMA_Non Arterial', 'RMA_ALL_Arterial Highway',
                                 'RMA_ALL_Arterial Highway,Arterial Other', 'RMA_ALL_Arterial Highway,Local Road',
                                 'RMA_ALL_Arterial Other', 'RMA_ALL_Arterial Other,Arterial Highway',
                                 'RMA_ALL_Arterial Other,Local Road', 'RMA_ALL_Freeway', 'RMA_ALL_Freeway,Arterial Other',
                                 'RMA_ALL_Local Road', 'RMA_ALL_Local Road,Arterial Highway',
                                 'RMA_ALL_Local Road,Arterial Other', 'RMA_ALL_Other', 'SEVERITY_Fatal accident',
                                 'SEVERITY_Non injury accident', 'SEVERITY_Other injury accident',
                                 'SEVERITY_Serious injury accident', 'REGION_NAME_ALL_EASTERN REGION,EASTERN REGION',
                                 'REGION_NAME_ALL_METROPOLITAN NORTH WEST REGION',
                                 'REGION_NAME_ALL_METROPOLITAN SOUTH EAST REGION', 'REGION_NAME_ALL_NORTH EASTERN REGION',
                                 'REGION_NAME_ALL_NORTHERN REGION', 'REGION_NAME_ALL_Other',
                                 'REGION_NAME_ALL_SOUTH WESTERN REGION', 'REGION_NAME_ALL_WESTERN REGION', 
                                 'STAT_DIV_NAME_Country', 'STAT_DIV_NAME_Metro']])
reduced_crashes_scaled[:5]

array([[-3.72654673e-01, -4.00320714e-01, -4.12372179e-01,
         2.38913514e+00, -4.20956796e-01, -4.34844876e-01,
        -3.96707000e-01,  2.30655332e+00, -1.34854961e+00,
        -7.75215709e-02, -2.01253954e-01, -2.85668663e-02,
        -3.07552539e-01, -9.96781372e-02, -2.16865023e-01,
        -9.88586690e-02, -2.44287824e-01, -4.82444132e-02,
         2.29860438e+00, -9.98593637e-02, -1.44914147e+00,
        -2.71502968e-01, -1.64252276e-01, -5.38373593e-01,
        -3.54984337e-02, -1.45275037e-01,  9.59755488e-01,
        -1.45862405e-02, -5.95427994e-03, -5.40482132e-01,
        -6.65835263e-02, -4.57827879e-02, -4.18707717e-01,
        -1.07543317e-01, -2.57225999e-01, -4.38125773e-01,
         1.40488316e+00, -2.53286548e-01, -4.19175456e-01,
        -5.14604966e-02, -9.46591206e-02, -2.42449123e-01,
        -5.70476346e-02, -2.65220885e-01, -4.10057003e-01,
        -1.32530372e+00,  4.79756160e-02, -1.88135603e-01,
        -4.89652062e-01, -7.68245941e-01, -2.94986170e-0

In [29]:
#split into x and y
red_X = pd.DataFrame(reduced_crashes_scaled, columns=[['DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5',
                                 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'ACCIDENT_TYPE_Collision with a fixed object',
                                 'ACCIDENT_TYPE_Collision with vehicle', 'ACCIDENT_TYPE_Fall from or in moving vehicle',
                                 'ACCIDENT_TYPE_No collision and no object struck', 'ACCIDENT_TYPE_Other accident',
                                 'ACCIDENT_TYPE_Struck Pedestrian', 'ACCIDENT_TYPE_Struck animal',
                                 'ACCIDENT_TYPE_Vehicle overturned (no collision)',
                                 'ACCIDENT_TYPE_collision with some other object', 'LIGHT_CONDITION_Dark No street lights',
                                 'LIGHT_CONDITION_Dark Street lights off', 'LIGHT_CONDITION_Dark Street lights on',
                                 'LIGHT_CONDITION_Dark Street lights unknown', 'LIGHT_CONDITION_Day',
                                 'LIGHT_CONDITION_Dusk/Dawn', 'LIGHT_CONDITION_Unk.', 'ROAD_GEOMETRY_Cross intersection',
                                 'ROAD_GEOMETRY_Dead end', 'ROAD_GEOMETRY_Multiple intersection',
                                 'ROAD_GEOMETRY_Not at intersection', 'ROAD_GEOMETRY_Private property',
                                 'ROAD_GEOMETRY_Road closure', 'ROAD_GEOMETRY_T intersection', 'ROAD_GEOMETRY_Unknown',
                                 'ROAD_GEOMETRY_Y intersection','SPEED_ZONE_100 km/hr', 'SPEED_ZONE_110 km/hr',
                                 'SPEED_ZONE_40 km/hr', 'SPEED_ZONE_50 km/hr', 'SPEED_ZONE_60 km/hr', 'SPEED_ZONE_70 km/hr',
                                 'SPEED_ZONE_80 km/hr', 'SPEED_ZONE_90 km/hr', 'SPEED_ZONE_Camping grounds or off road',
                                 'SPEED_ZONE_Not known', 'SPEED_ZONE_Other speed limit', 'TOTAL_PERSONS', 'INJ_OR_FATAL',
                                 'MALES', 'FEMALES', 'UNLICENCSED', 'RMA_Arterial Highway', 'RMA_Arterial Other',
                                 'RMA_Freeway', 'RMA_Local Road', 'RMA_Non Arterial', 'RMA_ALL_Arterial Highway',
                                 'RMA_ALL_Arterial Highway,Arterial Other', 'RMA_ALL_Arterial Highway,Local Road',
                                 'RMA_ALL_Arterial Other', 'RMA_ALL_Arterial Other,Arterial Highway',
                                 'RMA_ALL_Arterial Other,Local Road', 'RMA_ALL_Freeway', 'RMA_ALL_Freeway,Arterial Other',
                                 'RMA_ALL_Local Road', 'RMA_ALL_Local Road,Arterial Highway',
                                 'RMA_ALL_Local Road,Arterial Other', 'RMA_ALL_Other', 'SEVERITY_Fatal accident',
                                 'SEVERITY_Non injury accident', 'SEVERITY_Other injury accident',
                                 'SEVERITY_Serious injury accident', 'REGION_NAME_ALL_EASTERN REGION,EASTERN REGION',
                                 'REGION_NAME_ALL_METROPOLITAN NORTH WEST REGION',
                                 'REGION_NAME_ALL_METROPOLITAN SOUTH EAST REGION', 'REGION_NAME_ALL_NORTH EASTERN REGION',
                                 'REGION_NAME_ALL_NORTHERN REGION', 'REGION_NAME_ALL_Other',
                                 'REGION_NAME_ALL_SOUTH WESTERN REGION', 'REGION_NAME_ALL_WESTERN REGION', 
                                 'STAT_DIV_NAME_Country', 'STAT_DIV_NAME_Metro']])
red_X.head()

Unnamed: 0,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ACCIDENT_TYPE_Collision with a fixed object,ACCIDENT_TYPE_Collision with vehicle,ACCIDENT_TYPE_Fall from or in moving vehicle,...,"REGION_NAME_ALL_EASTERN REGION,EASTERN REGION",REGION_NAME_ALL_METROPOLITAN NORTH WEST REGION,REGION_NAME_ALL_METROPOLITAN SOUTH EAST REGION,REGION_NAME_ALL_NORTH EASTERN REGION,REGION_NAME_ALL_NORTHERN REGION,REGION_NAME_ALL_Other,REGION_NAME_ALL_SOUTH WESTERN REGION,REGION_NAME_ALL_WESTERN REGION,STAT_DIV_NAME_Country,STAT_DIV_NAME_Metro
0,-0.372655,-0.400321,-0.412372,2.389135,-0.420957,-0.434845,-0.396707,2.306553,-1.34855,-0.077522,...,-0.242203,-0.73466,-0.740131,-0.238448,-0.238987,-0.064677,3.474756,-0.233382,1.557057,-1.557057
1,-0.372655,-0.400321,-0.412372,2.389135,-0.420957,-0.434845,-0.396707,2.306553,-1.34855,-0.077522,...,-0.242203,1.361174,-0.740131,-0.238448,-0.238987,-0.064677,-0.28779,-0.233382,-0.642237,0.642237
2,-0.372655,-0.400321,-0.412372,2.389135,-0.420957,-0.434845,-0.396707,-0.433547,0.741537,-0.077522,...,-0.242203,1.361174,-0.740131,-0.238448,-0.238987,-0.064677,-0.28779,-0.233382,-0.642237,0.642237
3,-0.372655,-0.400321,-0.412372,-0.418562,2.375541,-0.434845,-0.396707,-0.433547,-1.34855,-0.077522,...,-0.242203,-0.73466,-0.740131,-0.238448,-0.238987,-0.064677,3.474756,-0.233382,1.557057,-1.557057
4,-0.372655,-0.400321,-0.412372,-0.418562,2.375541,-0.434845,-0.396707,-0.433547,0.741537,-0.077522,...,4.128766,-0.73466,-0.740131,-0.238448,-0.238987,-0.064677,-0.28779,-0.233382,1.557057,-1.557057


In [30]:
#print the LGA names with < 500 occurances in the data
values = original_data['LGA_NAME'].value_counts()
print(values[values < 500])

LGA_NAME
MILDURA               474
MACEDON RANGES        438
MURRINDINDI           421
COLAC OTWAY           387
MOORABOOL             385
SOUTH GIPPSLAND       372
SURF COAST            367
CAMPASPE              364
BASS COAST            341
MOIRA                 289
WANGARATTA            287
GOLDEN PLAINS         285
WARRNAMBOOL           252
WODONGA               228
CORANGAMITE           221
MANSFIELD             204
MOYNE                 200
ALPINE                195
HEPBURN               191
GLENELG               186
STRATHBOGIE           180
SWAN HILL             174
HORSHAM               172
INDIGO                171
BENALLA               167
NORTHERN GRAMPIANS    161
MOUNT ALEXANDER       158
SOUTHERN GRAMPIANS    141
TOWONG                134
PYRENEES              127
ARARAT                126
CENTRAL GOLDFIELDS    111
LODDON                105
GANNAWARRA            102
BULOKE                 67
HINDMARSH              49
WEST WIMMERA           46
YARRIAMBIACK           43
(MO

In [31]:
#we want to see if binning will help the model predict LGA's better- threshhold of 500
LGA_to_replace = ["MILDURA",
"MACEDON RANGES",
"MURRINDINDI",
"COLAC OTWAY",
"MOORABOOL",
"SOUTH GIPPSLAND",
"SURF COAST",
"CAMPASPE",
"BASS COAST",
"MOIRA",
"WANGARATTA",
"GOLDEN PLAINS",
"WARRNAMBOOL",
"WODONGA",
"CORANGAMITE",
"MANSFIELD",
"MOYNE",
"ALPINE",
"HEPBURN",
"GLENELG",
"STRATHBOGIE",
"SWAN HILL",
"HORSHAM",
"INDIGO",
"BENALLA",
"NORTHERN GRAMPIANS",
"MOUNT ALEXANDER",
"SOUTHERN GRAMPIANS",
"TOWONG",
"PYRENEES",
"ARARAT",
"CENTRAL GOLDFIELDS",
"LODDON",
"GANNAWARRA",
"BULOKE",
"HINDMARSH",
"WEST WIMMERA",
"YARRIAMBIACK",
"(MOUNT HOTHAM)",
"QUEENSCLIFFE",
"(LAKE MOUNTAIN)",
"(MOUNT BULLER)",
"(FALLS CREEK)",
"(MOUNT BAW BAW)",
"(FRENCH ISLAND)",
"",
"(MOUNT STIRLING)"]

for lga in LGA_to_replace:
    original_data['LGA_NAME'] = original_data['LGA_NAME'].replace(lga,"Other")

# Check to make sure binning was successful
original_data['LGA_NAME'].value_counts()

LGA_NAME
Other                   8383
MELBOURNE               3049
CASEY                   2420
GEELONG                 2273
DANDENONG               1936
HUME                    1870
BRIMBANK                1683
MONASH                  1663
WHITTLESEA              1656
MORELAND                1619
YARRA RANGES            1582
YARRA                   1395
DAREBIN                 1377
KINGSTON                1355
WYNDHAM                 1270
BOROONDARA              1231
WHITEHORSE              1208
MORNINGTON PENINSULA    1163
STONNINGTON             1162
KNOX                    1134
PORT PHILLIP            1103
GLEN EIRA               1083
BENDIGO                 1077
BALLARAT                1040
FRANKSTON               1004
MELTON                   955
MOONEE VALLEY            905
CARDINIA                 882
MAROONDAH                851
BANYULE                  841
MARIBYRNONG              792
HOBSONS BAY              770
MANNINGHAM               732
SHEPPARTON               730
BAYSI

In [32]:
#select y data
red_y = original_data['REGION_NAME_ALL']

In [33]:
#split into test and train
red_X_train, red_X_test, red_y_train, red_y_test = train_test_split(red_X, red_y, random_state=42)

In [34]:
#create decision tree model
red_treem = tree.DecisionTreeClassifier()

In [35]:
#fit the model
red_treem = red_treem.fit(red_X_train, red_y_train.ravel())

In [36]:
#predict using the test data
red_treem_predictions = red_treem.predict(red_X_test)

In [37]:
#print the classification report
print(classification_report(red_y_test, red_treem_predictions))

  _warn_prf(average, modifier, msg_start, len(result))


                                                               precision    recall  f1-score   support

                                EASTERN REGION,EASTERN REGION       1.00      1.00      1.00       796
 EASTERN REGION,EASTERN REGION,METROPOLITAN SOUTH EAST REGION       0.33      0.50      0.40         2
                               METROPOLITAN NORTH WEST REGION       1.00      1.00      1.00      4955
METROPOLITAN NORTH WEST REGION,METROPOLITAN SOUTH EAST REGION       0.00      0.00      0.00         0
                               METROPOLITAN SOUTH EAST REGION       1.00      1.00      1.00      4934
METROPOLITAN SOUTH EAST REGION,METROPOLITAN NORTH WEST REGION       1.00      0.97      0.99        40
                                         NORTH EASTERN REGION       1.00      1.00      1.00       779
          NORTH EASTERN REGION,METROPOLITAN SOUTH EAST REGION       0.00      0.00      0.00         2
                         NORTH EASTERN REGION,NORTHERN REGION       0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
print(red_X.columns)

MultiIndex([(                                  'DAY_OF_WEEK_1',),
            (                                  'DAY_OF_WEEK_2',),
            (                                  'DAY_OF_WEEK_3',),
            (                                  'DAY_OF_WEEK_4',),
            (                                  'DAY_OF_WEEK_5',),
            (                                  'DAY_OF_WEEK_6',),
            (                                  'DAY_OF_WEEK_7',),
            (    'ACCIDENT_TYPE_Collision with a fixed object',),
            (           'ACCIDENT_TYPE_Collision with vehicle',),
            (   'ACCIDENT_TYPE_Fall from or in moving vehicle',),
            ('ACCIDENT_TYPE_No collision and no object struck',),
            (                   'ACCIDENT_TYPE_Other accident',),
            (                'ACCIDENT_TYPE_Struck Pedestrian',),
            (                    'ACCIDENT_TYPE_Struck animal',),
            ('ACCIDENT_TYPE_Vehicle overturned (no collision)',),
          

In [39]:
#create a new reduced dataframe for unsupervised testing
red_unsupervised = crashes_df[['DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5',
                                 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'ACCIDENT_TYPE_Collision with a fixed object',
                                 'ACCIDENT_TYPE_Collision with vehicle', 'ACCIDENT_TYPE_Fall from or in moving vehicle',
                                 'ACCIDENT_TYPE_No collision and no object struck', 'ACCIDENT_TYPE_Other accident',
                                 'ACCIDENT_TYPE_Struck Pedestrian', 'ACCIDENT_TYPE_Struck animal',
                                 'ACCIDENT_TYPE_Vehicle overturned (no collision)',
                                 'ACCIDENT_TYPE_collision with some other object', 'LIGHT_CONDITION_Dark No street lights',
                                 'LIGHT_CONDITION_Dark Street lights off', 'LIGHT_CONDITION_Dark Street lights on',
                                 'LIGHT_CONDITION_Dark Street lights unknown', 'LIGHT_CONDITION_Day',
                                 'LIGHT_CONDITION_Dusk/Dawn', 'LIGHT_CONDITION_Unk.', 'ROAD_GEOMETRY_Cross intersection',
                                 'ROAD_GEOMETRY_Dead end', 'ROAD_GEOMETRY_Multiple intersection',
                                 'ROAD_GEOMETRY_Not at intersection', 'ROAD_GEOMETRY_Private property',
                                 'ROAD_GEOMETRY_Road closure', 'ROAD_GEOMETRY_T intersection', 'ROAD_GEOMETRY_Unknown',
                                 'ROAD_GEOMETRY_Y intersection','SPEED_ZONE_100 km/hr', 'SPEED_ZONE_110 km/hr',
                                 'SPEED_ZONE_40 km/hr', 'SPEED_ZONE_50 km/hr', 'SPEED_ZONE_60 km/hr', 'SPEED_ZONE_70 km/hr',
                                 'SPEED_ZONE_80 km/hr', 'SPEED_ZONE_90 km/hr', 'SPEED_ZONE_Camping grounds or off road',
                                 'SPEED_ZONE_Not known', 'SPEED_ZONE_Other speed limit', 'TOTAL_PERSONS', 'INJ_OR_FATAL',
                                 'MALES', 'FEMALES', 'UNLICENCSED', 'RMA_Arterial Highway', 'RMA_Arterial Other',
                                 'RMA_Freeway', 'RMA_Local Road', 'RMA_Non Arterial', 'RMA_ALL_Arterial Highway',
                                 'RMA_ALL_Arterial Highway,Arterial Other', 'RMA_ALL_Arterial Highway,Local Road',
                                 'RMA_ALL_Arterial Other', 'RMA_ALL_Arterial Other,Arterial Highway',
                                 'RMA_ALL_Arterial Other,Local Road', 'RMA_ALL_Freeway', 'RMA_ALL_Freeway,Arterial Other',
                                 'RMA_ALL_Local Road', 'RMA_ALL_Local Road,Arterial Highway',
                                 'RMA_ALL_Local Road,Arterial Other', 'RMA_ALL_Other', 'SEVERITY_Fatal accident',
                                 'SEVERITY_Non injury accident', 'SEVERITY_Other injury accident',
                                 'SEVERITY_Serious injury accident', 'REGION_NAME_ALL_EASTERN REGION,EASTERN REGION', 
                               'REGION_NAME_ALL_METROPOLITAN NORTH WEST REGION', 
                               'REGION_NAME_ALL_METROPOLITAN SOUTH EAST REGION', 'REGION_NAME_ALL_NORTH EASTERN REGION', 
                               'REGION_NAME_ALL_NORTHERN REGION', 'REGION_NAME_ALL_Other', 
                               'REGION_NAME_ALL_SOUTH WESTERN REGION', 'REGION_NAME_ALL_WESTERN REGION',]]

In [40]:
#define range of clusters
red_k_list = list(range(1, 11))
# Create an empty list to store the inertia values
red_inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

for i in red_k_list:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(red_unsupervised)
    red_inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [41]:
# Create a dictionary with the data to plot the Elbow curve
red_elbow_data = {"k": red_k_list, "inertia": red_inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_red_elbow = pd.DataFrame(red_elbow_data)

df_red_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=red_k_list
)

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
