In [2]:
#dependencies and load data
import pandas as pd
import hvplot.pandas
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import tree
import pickle

#load in the cleaned (already encoded) crash data
#forward slashes are used so the relative path works on every OS; the previous
#backslash form relied on "\R" and "\c" being left alone as invalid escape sequences
crashes_df = pd.read_csv("../Resources/crashes_cleaned_df.csv")
crashes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56414 entries, 0 to 56413
Columns: 159 entries, Unnamed: 0 to STAT_DIV_NAME_Metro
dtypes: float64(7), int64(151), object(1)
memory usage: 68.4+ MB


In [3]:
#load the original data, then drop the unnecessary columns and any row containing NaN
original_data = (
    pd.read_csv("../Resources/Road_Crashes_for_five_Years_Victoria.csv")
    .drop(
        columns=[
            "X",
            "Y",
            "ACCIDENT_NO",
            "SRNS",
            "SRNS_ALL",
            "DIVIDED",
            "DIVIDED_ALL",
            "UNKNOWN",
            "NODE_ID",
            "OBJECTID",
        ]
    )
    .dropna()
)

In [4]:
#count rows per region name; the result reveals one row whose region is blank
original_data["REGION_NAME"].value_counts()

REGION_NAME
METROPOLITAN SOUTH EAST REGION    20163
METROPOLITAN NORTH WEST REGION    19779
SOUTH WESTERN REGION               4333
EASTERN REGION                     3128
NORTHERN REGION                    3055
NORTH EASTERN REGION               3042
WESTERN REGION                     2914
                                      1
Name: count, dtype: int64

In [5]:
#display the row whose region name is a single space
original_data[original_data["REGION_NAME"].eq(" ")]

Unnamed: 0,ABS_CODE,ACCIDENT_STATUS,ACCIDENT_DATE,ACCIDENT_TIME,ALCOHOLTIME,ACCIDENT_TYPE,DAY_OF_WEEK,DCA_CODE,HIT_RUN_FLAG,LIGHT_CONDITION,...,PASSENGERVEHICLE,MOTORCYCLE,PUBLICVEHICLE,DEG_URBAN_NAME,DEG_URBAN_ALL,LGA_NAME_ALL,REGION_NAME_ALL,RMA,RMA_ALL,STAT_DIV_NAME
25344,ABS to receive accident,Finished,22/06/2017,19:30:00+00,Yes,Collision with vehicle,5,REAR END(VEHICLES IN SAME LANE),No,Dark Street lights on,...,2.0,0.0,0.0,MELB_URBAN,MELB_URBAN,MORELAND,METROPOLITAN NORTH WEST REGION,Arterial Other,"Arterial Other,Local Road",Metro


In [6]:
#keep only the rows whose region name is not the single-space placeholder
original_data = original_data[original_data["REGION_NAME"] != " "]

In [7]:
#reduced dataframe holding only the columns we will have user input for,
#listed one per line and grouped by their column-name prefix
model_data_df = crashes_df[[
    # DAY_OF_WEEK_*
    'DAY_OF_WEEK_1',
    'DAY_OF_WEEK_2',
    'DAY_OF_WEEK_3',
    'DAY_OF_WEEK_4',
    'DAY_OF_WEEK_5',
    'DAY_OF_WEEK_6',
    'DAY_OF_WEEK_7',
    # ACCIDENT_TYPE_*
    'ACCIDENT_TYPE_Collision with a fixed object',
    'ACCIDENT_TYPE_Collision with vehicle',
    'ACCIDENT_TYPE_Fall from or in moving vehicle',
    'ACCIDENT_TYPE_No collision and no object struck',
    'ACCIDENT_TYPE_Other accident',
    'ACCIDENT_TYPE_Struck Pedestrian',
    'ACCIDENT_TYPE_Struck animal',
    'ACCIDENT_TYPE_Vehicle overturned (no collision)',
    'ACCIDENT_TYPE_collision with some other object',
    # LIGHT_CONDITION_*
    'LIGHT_CONDITION_Dark No street lights',
    'LIGHT_CONDITION_Dark Street lights off',
    'LIGHT_CONDITION_Dark Street lights on',
    'LIGHT_CONDITION_Dark Street lights unknown',
    'LIGHT_CONDITION_Day',
    'LIGHT_CONDITION_Dusk/Dawn',
    'LIGHT_CONDITION_Unk.',
    # ROAD_GEOMETRY_*
    'ROAD_GEOMETRY_Cross intersection',
    'ROAD_GEOMETRY_Dead end',
    'ROAD_GEOMETRY_Multiple intersection',
    'ROAD_GEOMETRY_Not at intersection',
    'ROAD_GEOMETRY_Private property',
    'ROAD_GEOMETRY_Road closure',
    'ROAD_GEOMETRY_T intersection',
    'ROAD_GEOMETRY_Unknown',
    'ROAD_GEOMETRY_Y intersection',
    # SPEED_ZONE_*
    'SPEED_ZONE_100 km/hr',
    'SPEED_ZONE_110 km/hr',
    'SPEED_ZONE_40 km/hr',
    'SPEED_ZONE_50 km/hr',
    'SPEED_ZONE_60 km/hr',
    'SPEED_ZONE_70 km/hr',
    'SPEED_ZONE_80 km/hr',
    'SPEED_ZONE_90 km/hr',
    'SPEED_ZONE_Camping grounds or off road',
    'SPEED_ZONE_Not known',
    'SPEED_ZONE_Other speed limit',
    # RMA_ALL_*
    'RMA_ALL_Arterial Highway',
    'RMA_ALL_Arterial Highway,Arterial Other',
    'RMA_ALL_Arterial Highway,Local Road',
    'RMA_ALL_Arterial Other',
    'RMA_ALL_Arterial Other,Arterial Highway',
    'RMA_ALL_Arterial Other,Local Road',
    'RMA_ALL_Freeway',
    'RMA_ALL_Freeway,Arterial Other',
    'RMA_ALL_Local Road',
    'RMA_ALL_Local Road,Arterial Highway',
    'RMA_ALL_Local Road,Arterial Other',
    'RMA_ALL_Other',
    # SEVERITY_*
    'SEVERITY_Fatal accident',
    'SEVERITY_Non injury accident',
    'SEVERITY_Other injury accident',
    'SEVERITY_Serious injury accident',
    # REGION_NAME_*
    'REGION_NAME_EASTERN REGION',
    'REGION_NAME_METROPOLITAN NORTH WEST REGION',
    'REGION_NAME_METROPOLITAN SOUTH EAST REGION',
    'REGION_NAME_NORTH EASTERN REGION',
    'REGION_NAME_NORTHERN REGION',
    'REGION_NAME_SOUTH WESTERN REGION',
    'REGION_NAME_WESTERN REGION',
]]
model_data_df.head()

Unnamed: 0,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ACCIDENT_TYPE_Collision with a fixed object,ACCIDENT_TYPE_Collision with vehicle,ACCIDENT_TYPE_Fall from or in moving vehicle,...,SEVERITY_Non injury accident,SEVERITY_Other injury accident,SEVERITY_Serious injury accident,REGION_NAME_EASTERN REGION,REGION_NAME_METROPOLITAN NORTH WEST REGION,REGION_NAME_METROPOLITAN SOUTH EAST REGION,REGION_NAME_NORTH EASTERN REGION,REGION_NAME_NORTHERN REGION,REGION_NAME_SOUTH WESTERN REGION,REGION_NAME_WESTERN REGION
0,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0


In [9]:
#keep the scaler as its own object because it is saved and reused later
scaler = StandardScaler()
scaler.fit(model_data_df)

#apply the fitted scaler to the feature columns
X_scaled = scaler.transform(model_data_df)

In [10]:
#target labels: the DEG_URBAN_NAME column of the original data
y = original_data.loc[:, "DEG_URBAN_NAME"]

In [11]:
#split features and labels into train and test sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=78,
)

In [12]:
#define the decision tree model, fit it on the training data and predict the test set
#random_state is fixed (same seed as the train/test split) so that re-running the
#notebook builds the same tree instead of a slightly different one each time
treem = tree.DecisionTreeClassifier(random_state=78)
#y_train is a pandas Series; Series.ravel() is deprecated in recent pandas releases,
#so the labels are converted to a 1-D numpy array with to_numpy() instead
treem = treem.fit(X_train, y_train.to_numpy())
treem_predictions = treem.predict(X_test)

In [13]:
#print per-class precision / recall / f1 for the test-set predictions
print(
    classification_report(
        y_true=y_test,
        y_pred=treem_predictions,
    )
)

                         precision    recall  f1-score   support

LARGE_PROVINCIAL_CITIES       0.58      0.68      0.63       796
          MELBOURNE_CBD       0.31      0.25      0.28       173
             MELB_URBAN       0.92      0.96      0.94      8755
         RURAL_VICTORIA       0.83      0.79      0.81      3022
           SMALL_CITIES       0.47      0.37      0.42       790
            SMALL_TOWNS       0.05      0.03      0.04       137
                  TOWNS       0.24      0.18      0.20       431

               accuracy                           0.83     14104
              macro avg       0.49      0.46      0.47     14104
           weighted avg       0.82      0.83      0.82     14104



In [14]:
#save the model
#the context manager guarantees the file handle is flushed and closed, even if
#pickling raises; previously the handle returned by open() was never closed
with open('tree_model.sav', 'wb') as model_file:
    pickle.dump(treem, model_file)

In [15]:
#pickle the fitted scaler to disk so it can be loaded again later
with open("scaler.pkl", mode="wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [16]:
#have a look at what the predictions look like
#(a bare expression on the last line of a cell is displayed as the cell's output)
treem_predictions

array(['MELB_URBAN', 'RURAL_VICTORIA', 'MELB_URBAN', ..., 'MELB_URBAN',
       'MELB_URBAN', 'RURAL_VICTORIA'], dtype=object)