In [1]:
#!pip install prince

#Imports

In [2]:
#Imports
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  

import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.manifold import MDS

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


#Loading Data

ModuleNotFoundError: No module named 'google.colab'

In [None]:
#Importing data
data = pd.read_csv('monroe-county-crash-data2003-to-2015.csv',encoding='latin1')

In [None]:
#peaking at data
print(data.head())

In [None]:
#saving original version of data
original_data = data.copy()

#Preprocessing

###Dropping unnecessary columns and Rows with Null Values

In [None]:
#======================PREPROCESSING==================================

#1 drop Master Record Number colum. it serves no purpose in this case
data.drop('Master Record Number',axis = 1,inplace=True)
data.drop('Reported_Location',axis = 1,inplace=True)

data.columns

In [None]:
#3 dropping records with blank hours
data = data.dropna(subset=['Hour'])

In [None]:
#4 dropping invalid latitude and longitude
# A row is flagged when either coordinate is exactly 0, exactly 1, or missing.
# The mask is built once and reused for both the count and the drop.
lat, lon = data.Latitude, data.Longitude
invalid_coords = (
    lat.eq(0)
    | lat.eq(1)
    | lon.eq(0)
    | lon.eq(1)
    | lon.isna()
    | lat.isna()
)

# number of rows about to be removed
print(len(data[invalid_coords]))

#print(len(data))

data = data.drop(data[invalid_coords].index)

###Replacing Invalid Values with Correct Values

In [None]:
# Overwrite implausibly small latitudes with fixed replacement values.
# NOTE(review): presumably these rows lost the leading "3" of 39.x — confirm against the raw file.
# A single .loc assignment is used instead of chained indexing
# (data['Latitude'][mask] = ...), which may write to a temporary copy and
# does nothing under pandas copy-on-write.
data.loc[data['Latitude'] <= 9.133739, 'Latitude'] = 39.133739
# Runs after the first fix, so only values in (9.133739, 9.183292] still match here.
data.loc[data['Latitude'] <= 9.183292, 'Latitude'] = 39.183292

#Exploring Data

##Plotting

In [None]:
print(data)

###Dimensionality reduction

In [None]:
from sklearn import preprocessing
import prince

data_to_reduce = data

In [None]:
#Dropping null values
# NOTE: data_to_reduce was bound with `data_to_reduce = data` (no copy), so this
# in-place dropna removes the same rows from `data` as well.
data_to_reduce.dropna(inplace= True)

In [None]:
# Factor analysis of mixed data: the estimator built here is prince.FAMD,
# even though the variable is named `mca`. It is fitted on data_to_reduce
# and keeps 9 components with a fixed random_state.
mca = prince.FAMD(n_components = 9,
                 n_iter = 5,
                 copy = True,
                 check_input = True,
                 engine = 'auto',
                 random_state = 42)
mca.fit(data_to_reduce)

In [None]:
mca_components = mca.row_coordinates(data_to_reduce)
print(mca_components.head())

In [None]:
#saving reference to reduced data 
reduced_data = mca_components

In [None]:
#contribution of each component to data
comp_percentages = pd.DataFrame(sorted(mca.explained_inertia_,reverse = True))
print(comp_percentages)

In [None]:
#plotting of components 
comp_percentages.plot(kind = 'bar')

######Initial Plotting

Since the first component accounts for most of the data, we try an initial 1-D plot with just the first component. Already it hints at four clusters. The only class with four distinct categories is the injury type class. This might be a clue as to what these groupings mean.

In [None]:
# 1-D view: first component on x, every point at the same height (y = 1).
# x and y are passed by keyword because current seaborn only accepts `data` positionally.
sns.scatterplot(x=mca_components[0], y=np.ones(len(mca_components)))

######2D Plot

Again we try to plot in 2-D to see if there are any patterns embedded in the data. 

In [None]:
#initial plotting
# first two components of the reduced data
x,y = mca_components[0],mca_components[1]
# x and y are passed by keyword because current seaborn only accepts `data` positionally
sns.scatterplot(x=x,y=y)

There seem to be more clusters. Let's test our hypothesis about whether they correlate with the Injury Type categories. 

In [None]:
injury_types = data_to_reduce['Injury Type'].unique()
print(injury_types)

In [None]:
#testing clusters in plot
x1,y1 = x[data_to_reduce['Injury Type'] == injury_types[0]],y[data_to_reduce['Injury Type'] == injury_types[0]]
x2,y2 = x[data_to_reduce['Injury Type'] == injury_types[1]],y[data_to_reduce['Injury Type'] == injury_types[1]]
x3,y3 = x[data_to_reduce['Injury Type'] == injury_types[2]],y[data_to_reduce['Injury Type'] == injury_types[2]]
x4,y4 = x[data_to_reduce['Injury Type'] == injury_types[3]],y[data_to_reduce['Injury Type'] == injury_types[3]]

plt.scatter(x1,y1,color = 'blue')
plt.scatter(x2,y2,color = 'red')
plt.scatter(x3,y3,color ='green')
plt.scatter(x4,y4,color ='black')



######Clusters are more or less uniform. Green and black are part of the same cluster, which can mean there should be three classes instead of 4. But further analysis is needed. 



####3D plot

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
#defining z
z = mca_components[2]

z1,z2,z3,z4 = z[data_to_reduce['Injury Type'] == injury_types[0]],z[data_to_reduce['Injury Type'] == injury_types[1]],z[data_to_reduce['Injury Type'] == injury_types[2]],z[data_to_reduce['Injury Type'] == injury_types[3]]


#####Different angles 

In [None]:

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(x1,y1,z1,c='blue')
ax.scatter(x2,y2,z2,c='red')
ax.scatter(x3,y3,z3,c='green')
ax.scatter(x4,y4,z4,c='black')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(z1,y1,x1,c='blue')
ax.scatter(z2,y2,x2,c='red')
ax.scatter(z3,y3,x3,c='green')
ax.scatter(z4,y4,x4,c='black')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(y1,x1,z1,c='blue')
ax.scatter(y2,x2,z2,c='red')
ax.scatter(y3,x3,z3,c='green')
ax.scatter(y4,x4,z4,c='black')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(y1,z1,x1,c='blue')
ax.scatter(y2,z2,x2,c='red')
ax.scatter(y3,z3,x3,c='green')
ax.scatter(y4,z4,x4,c='black')

######conclusion

We can see that at different angles we can separate 3 different clusters into respective layers: Blue, Red, and Green+Black. In all the angles, green and black seem to be of the same cluster. Since the color code is based on the Injury Types, we can conclude that there are supposed to be three classes instead of four. 

###Over Sampling 

Let's check the distribution of the data to see if what we're inferring matches what's actually there. 

In [None]:
#In order to plot the data we have to do dimensionality reduction 
#from sklearn import preprocessing
#import prince

#encoder = preprocessing.LabelEncoder()

#data_to_reduce = data
#data_to_reduce['Collision Type'] = encoder.fit_transform(original_data['Collision Type'].astype('str'))

#data_to_reduce['Primary Factor'] = encoder.fit_transform(original_data['Primary Factor'].astype('str'))

#data_to_reduce['Weekend?'] = encoder.fit_transform(original_data['Weekend?'].astype('str'))

#data_to_reduce.dropna(inplace = True)

In [None]:
#changing data type of text types 
#for i in ['Weekend?','Collision Type','Injury Type','Primary Factor']:
  #data_to_reduce[i] = data_to_reduce[i].astype('str')

In [None]:
####CHECK FOR OVER SAMPLING POSSIBILITY USING SMOTE
#Distribution of target class
from collections import Counter

dist = pd.DataFrame(Counter(data_to_reduce['Injury Type']).items())
dist = dist.set_index(dist[0]).drop(0,axis = 'columns')


In [None]:
# dist plot *
plot = dist[[1]].plot(kind="bar");
plot.set_title("Injury Type Distribution", fontsize=20);
plot.grid(color='lightgray', alpha=0.5);

We can see that the representation of each Injury Type in the data is nowhere near even. We have to use some sort of sampling technique. 

In [None]:
#over sampling 
from imblearn.over_sampling import SMOTENC # uses KNN to generate new samples
from collections import Counter

features,labels = data.drop('Injury Type',axis = 'columns'), data['Injury Type']

#Synthetic Minority Over-Sampling Technique
#X_resampled, y_resampled = SMOTENC(categorical_features=[0,1,2,3,4,5,6,7]).fit_resample(features,labels)

In [None]:
#cols = data.drop('Injury Type',axis = 1).columns
#cols = np.append(cols,'Injury Type')

In [None]:
#res_data = pd.DataFrame(np.concatenate((X_resampled,y_resampled[:,np.newaxis]),axis = 1),columns = cols)
#res_data

######Saving to csv file*

In [None]:
#Saving res_data to csv so we don't have to regenerate it. 
#res_data.to_csv(r'drive/My Drive/Colab Notebooks/Data-Mining-Project/res_data.csv',index=False)

#####Loading data from saved csv file

In [None]:
res_data = pd.read_csv('res_data.csv')

In [None]:
# new distribution of target class

# Count each Injury Type in the resampled data and index the counts by the
# labels actually found in res_data. Hand-written index labels would depend on
# the Counter's order and on there being exactly four classes.
# Column 1 holds the counts, which is what the plotting cell selects.
new_dist = Counter(res_data['Injury Type']).items() #shows that there are equal amounts in each class
new_dist = pd.DataFrame(list(new_dist)).set_index(0)
print(new_dist)

In [None]:
# new dist plot 
plot = new_dist[[1]].plot(kind="bar");
plot.set_title("Injury Type Distribution", fontsize=20);
plot.grid(color='lightgray', alpha=0.5);

###Reploting with resampled data

In [None]:
res_data.dropna(inplace=True)

In [None]:
res_reduced_data = mca.fit_transform(res_data)

In [None]:
#new percetages with resampled data
res_comp_percentages = pd.DataFrame(mca.explained_inertia_)

In [None]:
# new percentages plot with resampled data
pd.DataFrame(res_comp_percentages).plot(kind = 'bar')

####2D

In [None]:
#testing clusters in plot

x,y,z = res_reduced_data[0],res_reduced_data[1],res_reduced_data[2]
x1,y1 = x[res_data['Injury Type'] == injury_types[0]],y[res_data['Injury Type'] == injury_types[0]]
x2,y2 = x[res_data['Injury Type'] == injury_types[1]],y[res_data['Injury Type'] == injury_types[1]]
x3,y3 = x[res_data['Injury Type'] == injury_types[2]],y[res_data['Injury Type'] == injury_types[2]]
x4,y4 = x[res_data['Injury Type'] == injury_types[3]],y[res_data['Injury Type'] == injury_types[3]]

plt.scatter(x1,y1,color = 'blue')
plt.scatter(x2,y2,color = 'red')
plt.scatter(x3,y3,color ='green')
plt.scatter(x4,y4,color ='black')


####3D

In [None]:
z1,z2,z3,z4 = z[res_data['Injury Type'] == injury_types[0]],z[res_data['Injury Type'] == injury_types[1]],z[res_data['Injury Type'] == injury_types[2]],z[res_data['Injury Type'] == injury_types[3]]


In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(x1,y1,z1,c='blue')
ax.scatter(x2,y2,z2,c='red')
ax.scatter(x3,y3,z3,c='green')
ax.scatter(x4,y4,z4,c='black')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(z1,y1,x1,c='blue')
ax.scatter(z2,y2,x2,c='red')
ax.scatter(z3,y3,x3,c='green')
ax.scatter(z4,y4,x4,c='black')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(y1,x1,z1,c='blue')
ax.scatter(y2,x2,z2,c='red')
ax.scatter(y3,x3,z3,c='green')
ax.scatter(y4,x4,z4,c='black')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(y1,z1,x1,c='blue')
ax.scatter(y2,z2,x2,c='red')
ax.scatter(y3,z3,x3,c='green')
ax.scatter(y4,z4,x4,c='black')

It seems like the data is not as easy to separate anymore, at least not by Injury Type. However, we see that it now creates two distinct clusters. Let's explore the data further by running K-means on the data both before and after over-sampling. 

###K-Means



In [None]:
#CLUSTERSING=================Components========================
from sklearn.cluster import KMeans
%config InlineBackend.figure_format='retina'

#####elbow plot function

In [None]:
#======================PCA ELBOW DEF==================
def elbow_plot(data,num_components,num_k = range(1,10),random_state = None):
  """Draw an elbow plot of K-Means inertia versus the number of clusters.

  Parameters
  ----------
  data : DataFrame
      Only its first ``num_components`` columns (by position) are clustered.
  num_components : int
      Number of leading columns passed to KMeans.
  num_k : iterable of int, optional
      Candidate cluster counts to try; defaults to ``range(1,10)``.
  random_state : optional
      Forwarded to ``KMeans`` so a run can be reproduced. ``None`` (the
      default) leaves KMeans with its own default seeding.

  Returns
  -------
  None. The figure is shown with ``plt.show()``.
  """
  # Materialise the candidates once: they are used by the loop and again for
  # plotting, so a one-shot iterator would be empty by the time we plot.
  k_num = list(num_k)

  # The column slice does not depend on k, so take it outside the loop.
  features = data.iloc[:,:num_components]

  inertias = []
  for k in k_num:
    # Fit one KMeans model per candidate k and record its inertia
    model = KMeans(n_clusters=k, random_state=random_state)
    model.fit(features)
    inertias.append(model.inertia_)

  plt.plot(k_num, inertias, '-o', color='black')
  plt.xlabel('number of clusters, k')
  plt.ylabel('inertia')
  plt.xticks(k_num)
  plt.show()

####Elbow for both plots

##### using 2 components

In [None]:
#elbow plot for for regular data
elbow_plot(reduced_data,2)

In [None]:
#elbow plot for resampled data
elbow_plot(res_reduced_data,2)

Here, we confirm what we visually inspected, that the resampled data clearly defines two clusters as being the ideal choice and the regular data is right between three or four. 

However, one thing to consider is how much of the data is being represented by the number of components in each case. 

In [None]:
print(comp_percentages[:2].sum())
print(res_comp_percentages[:2].sum())

We see that with the first two components, the data before over-sampling holds close to 60% of the data, while the resampled data only holds about 48%. What if we chose the number of components in a way that made these percentages closer to being equal?

In [None]:
print(comp_percentages[:2].sum())
print(res_comp_percentages[:3].sum())

Here we see that they are a lot closer when the resampled data uses 3 components while the original uses 2. 

#####using 3 components 

In [None]:
#elbow plot for for regular data
elbow_plot(reduced_data,2)

In [None]:
#elbow plot for resampled data
elbow_plot(res_reduced_data,3)

In [None]:
#resampled data with 4 components
elbow_plot(res_reduced_data,4)

Here, we see something that is more consistent with our 3D plots. The data before over-sampling can separate about 3 individual clusters. However, while the resampled data shows 2 distinct clusters at some angles, it starts looking a lot more jumbled up when the image is rotated, not to mention that the clusters provided by the resampled data are not uniform at all. 

This can be due to the fact that when using over-sampling techniques, it is important to make sure that the new synthesized data examples are realistic. In the case of accident reports, it is very hard to explain by what means one would determine whether or not a specific combination of features would be realistic. 

It can also be said that the data before over-sampling is trying to find patterns in the data that don't necessarily exist because it doesn't have enough information to accurately learn from the data. In this case, it could be that the resampled data found the right amount of clusters and they aren't necessarily related to the Injury Type. 

Let's try some of our hypotheses by plotting the clusters produced by KMeans and the clusters we saw by Injury Type.

####Getting Clusters for data before over-sampling

using the first two components 

In [None]:
km_model = KMeans(n_clusters=3)

predicted = km_model.fit_predict(reduced_data.iloc[:,:2])

In [None]:
#adding cluster assigment to data frame 
reduced_data['Cluster'] = predicted

In [None]:
#getting references to each cluster
c1 = reduced_data[reduced_data['Cluster']==0]
c2 = reduced_data[reduced_data['Cluster']==1]
c3 = reduced_data[reduced_data['Cluster']==2]

In [None]:
#plotting clusters
plt.scatter(c1[0],c1[1],color='blue')
plt.scatter(c2[0],c2[1],color='red')
plt.scatter(c3[0],c3[1],color='green')

In [None]:
#getting reference to data by Injury Types combining incapaciting and fatal as we saw in the visual
i1 = reduced_data[data_to_reduce['Injury Type']==injury_types[0]]
i2 = reduced_data[data_to_reduce['Injury Type']==injury_types[1]]
i3 = reduced_data[(data_to_reduce['Injury Type']==injury_types[2])|(data_to_reduce['Injury Type']==injury_types[3])]

In [None]:
#plotting by Injury Type
plt.scatter(i1[0],i1[1],color='blue')
plt.scatter(i2[0],i2[1],color='red')
plt.scatter(i3[0],i3[1],color='green')

The clusters don't really match. This is a sign that K-means is not clustering based on Injury Type but maybe on some other attribute hidden in the data. 

####Clusters resampled data

Using the first 3 components and 4 clusters as given by the elbow 

In [None]:
km_model = KMeans(n_clusters=4)

res_predicted = km_model.fit_predict(res_reduced_data.iloc[:,:3])

In [None]:
#adding cluster assigment to data frame 
res_reduced_data['Cluster'] = res_predicted

In [None]:
#getting references to each cluster
rc1 = res_reduced_data[res_reduced_data['Cluster']==0]
rc2 = res_reduced_data[res_reduced_data['Cluster']==1]
rc3 = res_reduced_data[res_reduced_data['Cluster']==2]
rc4 = res_reduced_data[res_reduced_data['Cluster']==3]

In [None]:
#plotting clusters
plt.scatter(rc1[0],rc1[1],color='blue')
plt.scatter(rc2[0],rc2[1],color='red')
plt.scatter(rc3[0],rc3[1],color='green')
plt.scatter(rc4[0],rc4[1],color='black')

In [None]:
#getting reference to data by Injury Types combining incapaciting and fatal as we saw in the visual
ri1 = res_reduced_data[res_data['Injury Type']==injury_types[0]]
ri2 = res_reduced_data[res_data['Injury Type']==injury_types[1]]
ri3 = res_reduced_data[(res_data['Injury Type']==injury_types[2])]
ri4 = res_reduced_data[(res_data['Injury Type']==injury_types[3])]

In [None]:
#plotting by Injury Type
# one colour per injury-type subset (ri1..ri4) of the resampled components
plt.scatter(ri1[0],ri1[1],color='blue')
plt.scatter(ri2[0],ri2[1],color='red')
plt.scatter(ri3[0],ri3[1],color='green')
# the fourth class is ri4; plotting ri3 again here would hide it entirely
plt.scatter(ri4[0],ri4[1],color='black')

Once again, these clusters don't seem to have a direct relationship with Injury type attribute. 

We can assume that the KMeans algorithm is not using the same metric to measure closeness or similarity between the examples as the idea of space that we infer from visually inspecting the data. It could also be that some of the features from the original data don't make sense to use in trying to uncover any significant pattern in the data. We need to see exactly how these features are correlated with each other. 

If KMeans is finding some hidden patterns in the data, even if we don't know what it is, we can try to use some predictive models to see if the original features can be used to predict the clusters that were found by KMeans. 

###Descriptive Statistics

In [None]:
# Histograms of the time-based columns (Year, Month, Day, Hour), one figure per column.
# NOTE: sns.distplot is deprecated as of seaborn 0.11; histplot/displot are its
# replacements. Kept as-is here so the cell still runs on older seaborn.
for i in data[['Year','Month','Day','Hour']].columns:
  sns.distplot(data[i])
  plt.title(i)
  # show each column in its own figure
  plt.show()

From this we see that most of the time-based features, except for Hour, are evenly distributed in the data. So we can assume that they are not really going to have any predictive value for other attributes in the data. 

We'll also remove any location information from the data since we are trying to predict the injury type. 

But first let's see how these features are correlated with each other. 

First we have to transform our data.

###Encoding Categorical Values

We'll use the original data first 

In [None]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
c_data = data.copy()

In [None]:
for i in c_data.drop(['Latitude','Longitude','Month','Day'],axis='columns').columns:
  c_data[i] = le.fit_transform(c_data[i])

In [None]:
print(c_data)

In [None]:
print(c_data.corr())

In [None]:
#HEATMAP
cor = c_data.corr()

plt.figure(figsize = (7,7))
plt.rcParams.update({'font.size': 6})

ax = sns.heatmap(
    cor,
    annot = True,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
plt.show()

Since the correlations are low, we'll use a threshold of .1 to select which features are correlated. Surprisingly year is correlated with Injury type so we'll trust the data and keep it, along with Collision type and Primary factor with the data. 

####chi-squared function


In [None]:
from scipy.stats import chi2_contingency

def chi_squared(df, c1, c2):
    """Run a chi-squared test of independence between columns ``c1`` and ``c2``.

    A contingency table of pair counts is built from ``df`` (one row per
    ``c2`` level, one column per ``c1`` level) and handed to
    ``scipy.stats.chi2_contingency``. The two column names are printed
    as a label before the test is run.

    Returns the first three items of the scipy result as a tuple:
    (chi-squared statistic, p-value, degrees of freedom).
    """
    pair_counts = df.groupby([c1, c2]).size()
    contingency = pair_counts.unstack(c1)

    print(c1,' and ',c2)

    # Pairs that never occur come out of unstack as NaN; treat them as a
    # count of zero so chi2_contingency does not raise.
    observed = contingency.fillna(0)
    statistic, p_value, dof = chi2_contingency(observed)[:3]
    return (statistic, p_value, dof)



####chi squared tests

In [None]:
for i in data.drop(['Injury Type','Latitude','Longitude'],axis='columns').columns:
    print(chi_squared(data,'Injury Type',i))

We see that the chi-squared statistics for those variables are higher as well. So we will choose the three variables Year, Collision Type, and Primary Factor for our features, using Injury Type as the target variable. 

####Feature Selection 

Let's use SKLearn built-in feature selection to try to see if the analysis matches ours. 

###Transforming Data for Training

In [None]:
#Checking distinct primary factors
primary_factors = data['Primary Factor'].unique()
primary_factors

In [None]:
print(len(primary_factors))

We'll one-hot encode primary factors since their distinct values repeat throughout the data.

We'll also one-hot encode the collision type.

We'll label encode the injury type by level of severity, since we might want to use it as a target variable.




In [None]:
#one-hot encoding primary factors 
enc_primary_factor = pd.get_dummies(data['Primary Factor'],drop_first = True)
#one-hot encoding collision types 
enc_collision_type = pd.get_dummies(data['Collision Type']).drop('Bus',axis = 'columns')
#concatinating the two 
transformed_data = pd.concat([enc_collision_type,enc_primary_factor],axis = 'columns')

#merge Injury type as target variable
transformed_data = pd.concat([data['Year'],transformed_data,data['Injury Type']],axis = 'columns')

#c Injury Type Label Encoding in order of severity 
transformed_data['Injury Type'] = transformed_data['Injury Type'].map({'No injury/unknown':0,'Non-incapacitating':1,'Incapacitating':2,'Fatal':3})

#d making injury types numeric
#transformed_data['Injury Type']=pd.to_numeric(ndf['Injury Type'])

#engineering new feature that turns four class target class into a two class target
#ndf['Severe Injury'] = ndf['Injury Type'].map({0:0,1:0,2:1,3:1})

In [None]:
print(transformed_data)

now we can move on to some prediction 

#Prediction

##Model Selection

In [None]:
def try_model(model, X, y, test_size = 0.2, random_state = None):
  """Fit ``model`` on a train/test split of (X, y) and print evaluation metrics.

  Parameters
  ----------
  model : estimator with ``fit`` and ``predict``; it is fitted in place.
  X : feature matrix.
  y : target labels.
  test_size : fraction of rows held out for testing (default 0.2).
  random_state : optional seed for the split; ``None`` (the default) gives a
      different random split on every call.

  Prints the mean 5-fold cross-validation score on the training split, then
  the accuracy, confusion matrix and classification report on the held-out
  split. Returns None.
  """
  #splitting train and test data
  X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = test_size,random_state = random_state)

  # Train whichever estimator was passed in
  model.fit(X_train, y_train)

  print ("Cross_Validation_Mean_Score:" "\n",cross_val_score(model,X_train,y_train,cv=5).mean())
  print()

  # Predict once and reuse the result for all three test-set reports
  y_pred = model.predict(X_test)

  print (" Accuracy:" "\n",metrics.accuracy_score(y_test, y_pred))
  print()
  print("Confusion Matrix:" "\n",metrics.confusion_matrix(y_test, y_pred))
  print()
  print("Classification_Report:" "\n", metrics.classification_report(y_test, y_pred))
  print()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression

In [None]:
rf_model = RandomForestClassifier(n_estimators=10)
knn_model = KNeighborsClassifier()
log_model = LogisticRegression()

###Partitioning data

In [None]:
#partitioning data
X,y = transformed_data.drop('Injury Type',axis = 'columns'),transformed_data['Injury Type']

###Cross Val Scores 

Using transfomed data 

In [None]:
#Random Forest
try_model(rf_model,X,y)

In [None]:
#KNN
try_model(knn_model,X,y)

In [None]:
#Logistic Regression
try_model(log_model,X,y)

Let's see if the reduced data can be used to determine the injury type. But we have to reduce it without the target variable. And we'll use the resampled data since it is more evenly spread out for the target class. 

In [None]:
#reducing data
reduced_train_data = pd.DataFrame(mca.fit_transform(res_data.drop('Injury Type',axis = 1)))

In [None]:
#percentage of data representedd
pd.DataFrame(mca.explained_inertia_[:8]).sum()

We see that the first 8 components hold about 85 percent of the data so we'll use those. 

In [None]:
#new X and y
X2 = reduced_train_data.iloc[:,:8]
y2 = res_data['Injury Type']

In [None]:
#Random Forest
try_model(rf_model,X2,y2)

In [None]:
#KNN
try_model(knn_model,X2,y2)

In [None]:
#Logistic Regression 
try_model(log_model,X2,y2)

Using the reduced data, we see improvement in KNN and Random Forest but Logistic Regression's performance dropped significantly. 

We see that we can get some improvements in our models by using the reduced data rather than the selected features from the original data. But we can still try something else to improve the usefulness of our models. 

As of now the models are trying to predict four different classes of Injury Type, which are No Injury/Unknown, Non-Incapacitating, Incapacitating, and Fatal. We can observe that this attribute can be summarized into two classes, Non-Severe and Severe. 

We create a new attribute out of the Injury Type that only has a binary value and we'll use that as the new target value. 

In [None]:
transformed_data['Severe'] = transformed_data['Injury Type'].map({0:0,1:0,2:1,3:1})

In [None]:
#defining new X and y
X3,y3 = transformed_data.drop(['Injury Type','Severe'],axis = 1), transformed_data['Severe']

now we try our models using severe as the target variable

In [None]:
#Random Forest
try_model(rf_model,X3,y3)

In [None]:
#KNN
try_model(knn_model,X3,y3)

In [None]:
#Logistic Regression
try_model(log_model,X3,y3)

We see now that our models are able to predict the severity of the injury based on the selected features. Now we'll try it with the resampled reduced data to see if we can get even more improvement. 
X2 is the first 8 components of our resampled data. 

In [None]:
#mapping to reduced data. 
y4 = res_data['Injury Type'].map({'No injury/unknown':0,'Non-incapacitating':0,'Incapacitating':1,'Fatal':1})


In [None]:
#Random Forest
try_model(rf_model,X2,y4)

In [None]:
#KNN 
try_model(knn_model,X2,y4)

In [None]:
#Logistic Regression
try_model(log_model,X2,y4)

Once again we observe that logistic regression didn't perform as well as the other models on the reduced data. However, even though the performance seems to have dropped when using the reduced data, the results can be trusted a little more because the reduced data came from the over-sampled data, which has a more even distribution for the target class. 

Let's try a neural network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
nn_model = MLPClassifier(activation='logistic', batch_size='auto',
                         early_stopping=True,
                         epsilon=1e-08, hidden_layer_sizes=(8,8,4,2),
                         learning_rate_init=0.001,
                         learning_rate = 'adaptive',
                         max_iter=200,random_state=2)

In [None]:
try_model(nn_model,X,y3)

In [None]:
try_model(nn_model,X2,y4)

Here we see similar results for the selected features and the new target variable but the neural network, like logistic regression, performs worse when the components are used as features. 

#Final Plotting

Since our selected features seem to be able to predict the injury class with decent accuracy, we'll use them to plot, once again, and get a visual on the transformed data. 

In [None]:
# Subset of the cleaned data holding the selected features plus the target.
# NOTE(review): new_data is not referenced again in the visible cells; the MCA
# fit that follows is run on transformed_data — confirm which frame was intended.
new_data = data[['Year','Collision Type','Primary Factor','Injury Type']]

In [None]:
mca2 = prince.MCA(n_components = 3,
                 n_iter = 5,
                 copy = True,
                 check_input = True,
                 engine = 'auto',
                 random_state = 42)

In [None]:
new_data_components = mca2.fit_transform(transformed_data)

###2D

In [None]:
# First two MCA components of the transformed data.
# x and y are passed by keyword because current seaborn only accepts `data` positionally.
sns.scatterplot(x=new_data_components[0],y=new_data_components[1])

In [None]:
x,y = new_data_components[0],new_data_components[1]

Color Coding

In [None]:
#testing clusters in plot
x1,y1 = x[data_to_reduce['Injury Type'] == injury_types[0]],y[data_to_reduce['Injury Type'] == injury_types[0]]
x2,y2 = x[data_to_reduce['Injury Type'] == injury_types[1]],y[data_to_reduce['Injury Type'] == injury_types[1]]
x3,y3 = x[data_to_reduce['Injury Type'] == injury_types[2]],y[data_to_reduce['Injury Type'] == injury_types[2]]
x4,y4 = x[data_to_reduce['Injury Type'] == injury_types[3]],y[data_to_reduce['Injury Type'] == injury_types[3]]

plt.scatter(x1,y1,color = 'blue')
plt.scatter(x2,y2,color = 'red')
plt.scatter(x3,y3,color ='green')
plt.scatter(x4,y4,color ='black')

We see that the data is being separated into what we defined as Severe and Non-Severe.

In [None]:
x1,y1 = x[data_to_reduce['Injury Type'] == injury_types[0]],y[data_to_reduce['Injury Type'] == injury_types[0]]
x2,y2 = x[data_to_reduce['Injury Type'] == injury_types[1]],y[data_to_reduce['Injury Type'] == injury_types[1]]
x3,y3 = x[data_to_reduce['Injury Type'] == injury_types[2]],y[data_to_reduce['Injury Type'] == injury_types[2]]
x4,y4 = x[data_to_reduce['Injury Type'] == injury_types[3]],y[data_to_reduce['Injury Type'] == injury_types[3]]

plt.scatter(x1,y1,color = 'blue')
plt.scatter(x2,y2,color = 'blue')
plt.scatter(x3,y3,color ='green')
plt.scatter(x4,y4,color ='green')

###3D

In [None]:
#defining z
z = new_data_components[2]

z1,z2,z3,z4 = z[data_to_reduce['Injury Type'] == injury_types[0]],z[data_to_reduce['Injury Type'] == injury_types[1]],z[data_to_reduce['Injury Type'] == injury_types[2]],z[data_to_reduce['Injury Type'] == injury_types[3]]


In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(x1,y1,z1,c='blue')
ax.scatter(x2,y2,z2,c='red')
ax.scatter(x3,y3,z3,c='green')
ax.scatter(x4,y4,z4,c='black') 

Now with the two clusters color coded by the severe attribute. 

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(x1,y1,z1,c='blue')
ax.scatter(x2,y2,z2,c='blue')
ax.scatter(x3,y3,z3,c='green')
ax.scatter(x4,y4,z4,c='green')