In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.utils import shuffle

  from numpy.core.umath_tests import inner1d


We will use the 2014 NYC Crime dataset we have used in previous exercises. 

In [2]:
excel_file='Table_8_Offenses_Known_to_Law_Enforcement_by_New_York_by_City_2014.xls'
df=pd.read_excel(excel_file)
df.head()

Unnamed: 0,Table 8,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,NEW YORK,,,,,,,,,,,,
1,Offenses Known to Law Enforcement,,,,,,,,,,,,
2,"by City, 2014",,,,,,,,,,,,
3,City,Population,Violent\ncrime,Murder and\nnonnegligent\nmanslaughter,Rape\n(revised\ndefinition)1,Rape\n(legacy\ndefinition)2,Robbery,Aggravated\nassault,Property\ncrime,Burglary,Larceny-\ntheft,Motor\nvehicle\ntheft,Arson3
4,Adams Village,1851,0,0,,0,0,0,11,1,10,0,0


In [3]:
columns=['City','Population','Violent Crime','Murder/Manslaughter','Rape','Rape Legacy','Robbery','Agg Assault',
           'Property Crime','Burgulary','Larceny','Motor_Vehicle Theft','Arson']

df.columns=columns
df[columns]=df[columns].replace(',', '', regex=True)
df.drop(df.head(4).index, inplace=True)
df_new=df.drop(['Rape', 'Arson'], axis=1)
df_new.head()

Unnamed: 0,City,Population,Violent Crime,Murder/Manslaughter,Rape Legacy,Robbery,Agg Assault,Property Crime,Burgulary,Larceny,Motor_Vehicle Theft
4,Adams Village,1851,0,0,0.0,0,0,11,1,10,0
5,Addison Town and Village,2568,2,0,0.0,1,1,49,1,47,1
6,Afton Village4,820,0,0,,0,0,1,0,1,0
7,Akron Village,2842,1,0,0.0,0,1,17,0,17,0
8,Albany4,98595,802,8,,237,503,3888,683,3083,122


In [4]:
df_new['Population^2']=df_new['Population'].apply(float)**2
df_new['Murder_new']=np.where(df_new['Murder/Manslaughter'] > 0, 1, 0)
df_new['Robbery_new']=np.where(df_new['Robbery'].apply(float) > 0, 1, 0)
df_new.head()

Unnamed: 0,City,Population,Violent Crime,Murder/Manslaughter,Rape Legacy,Robbery,Agg Assault,Property Crime,Burgulary,Larceny,Motor_Vehicle Theft,Population^2,Murder_new,Robbery_new
4,Adams Village,1851,0,0,0.0,0,0,11,1,10,0,3426201.0,0,0
5,Addison Town and Village,2568,2,0,0.0,1,1,49,1,47,1,6594624.0,0,1
6,Afton Village4,820,0,0,,0,0,1,0,1,0,672400.0,0,0
7,Akron Village,2842,1,0,0.0,0,1,17,0,17,0,8076964.0,0,0
8,Albany4,98595,802,8,,237,503,3888,683,3083,122,9720974000.0,1,1


In [5]:
#If a neighborhood has had a Murder and/or Robbery in 2014, we created a new column "Attn" to point this out. 
#53.2% of neighborhoods have had a murder and/or robbery taking place in the past year.
df_new['Attention'] = np.where(df_new['Murder_new']+df_new['Robbery_new'] >=1, 1, 0)
print('Trouble Neighborhoods:', df_new['Attention'].mean())
df_new.tail()

Trouble Neighborhoods: 0.5319148936170213


Unnamed: 0,City,Population,Violent Crime,Murder/Manslaughter,Rape Legacy,Robbery,Agg Assault,Property Crime,Burgulary,Larceny,Motor_Vehicle Theft,Population^2,Murder_new,Robbery_new,Attention
375,3 The FBI does not publish arson data unless i...,,,,,,,,,,,,0,0,0
376,4 This agency began the year submitting rape d...,,,,,,,,,,,,0,0,0
377,5 The FBI determined that the agency's data we...,,,,,,,,,,,,0,0,0
378,,,,,,,,,,,,,0,0,0
379,,,,,,,,,,,,,0,0,0


In [6]:
df_new.replace([np.inf,-np.inf],np.nan)
df_new.dropna(how='any') 

Unnamed: 0,City,Population,Violent Crime,Murder/Manslaughter,Rape Legacy,Robbery,Agg Assault,Property Crime,Burgulary,Larceny,Motor_Vehicle Theft,Population^2,Murder_new,Robbery_new,Attention
4,Adams Village,1851,0,0,0,0,0,11,1,10,0,3.426201e+06,0,0,0
5,Addison Town and Village,2568,2,0,0,1,1,49,1,47,1,6.594624e+06,0,1,1
7,Akron Village,2842,1,0,0,0,1,17,0,17,0,8.076964e+06,0,0,0
15,Amityville Village,9509,12,0,0,11,1,198,10,175,13,9.042108e+07,0,1,1
16,Amsterdam,18099,27,0,0,15,12,490,107,370,13,3.275738e+08,0,1,1
18,Arcade Village,2030,0,0,0,0,0,25,7,18,0,4.120900e+06,0,0,0
19,Ardsley Village,4561,1,0,0,0,1,25,6,19,0,2.080272e+07,0,0,0
20,Asharoken Village,656,0,0,0,0,0,6,0,4,2,4.303360e+05,0,0,0
21,Attica Village,2502,0,0,0,0,0,12,0,12,0,6.260004e+06,0,0,0
31,Black River,1392,0,0,0,0,0,3,1,2,0,1.937664e+06,0,0,0


In [7]:
X = df_new[['Population', 'Agg Assault', 'Rape Legacy', 'Property Crime', 'Murder_new', 'Robbery_new', 'Attention']].copy().dropna()
data = X.Attention
Y = data

In [8]:
Y.shape

(142,)

In [9]:
X.shape

(142, 7)

# Our Neural Network

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Establish and fit with single, 100 perceptron layer 
mlp = MLPClassifier(hidden_layer_sizes=(100,))
mlp.fit(X,Y)
# Cross validate and get scores
cross_val_score(mlp, X, Y, cv=5)

array([0.55172414, 0.44827586, 0.42857143, 0.42857143, 0.57142857])

In [11]:
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X,Y)
# Cross validate and get scores
cross_val_score(mlp, X, Y, cv=5)

array([0.44827586, 0.44827586, 0.42857143, 0.42857143, 0.42857143])

In [12]:
mlp = MLPClassifier(hidden_layer_sizes=(100, 100))
mlp.fit(X,Y)
# Cross validate and get scores
cross_val_score(mlp, X, Y, cv=5)

array([0.44827586, 0.44827586, 0.42857143, 0.42857143, 0.42857143])

In [13]:
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='tanh',
    alpha=10, 
    learning_rate='adaptive'
)
mlp.fit(X,Y)

# Cross validate and get scores
cross_val_score(mlp, X, Y, cv=5)

array([0.55172414, 0.5862069 , 0.57142857, 0.57142857, 0.57142857])

In [14]:
mlp = MLPClassifier(
    hidden_layer_sizes=(1000,),
    activation='tanh',
    alpha=10, 
    learning_rate='adaptive'
)
mlp.fit(X,Y)

# Cross validate and get scores
cross_val_score(mlp, X, Y, cv=5)

array([0.5862069 , 0.72413793, 0.57142857, 0.57142857, 0.60714286])

In [15]:
mlp = MLPClassifier(
    hidden_layer_sizes=(100,100),
    activation='tanh',
    alpha=10, 
    learning_rate='adaptive'
)
mlp.fit(X,Y)

# Cross validate and get scores
cross_val_score(mlp, X, Y, cv=5)



array([0.79310345, 0.72413793, 0.71428571, 0.64285714, 0.78571429])

After working with MLP parameters, the double 100 perceptron layers performed the best with an average score of 0.711  although consistency could be improved. We will now compare the neural network's performance with a random forest.


# Random Forest Comparison

In [16]:
forest = ensemble.RandomForestClassifier(
    max_features='sqrt',
)
max_depth = dict(max_depth = np.arange(1,10,1))
grid = GridSearchCV(forest, max_depth, cv=5)
grid.fit(X, Y).best_estimator_.get_params()['max_depth']

1

In [17]:
forest = ensemble.RandomForestClassifier(
    max_features='sqrt',
    max_depth=1
)

cross_val_score(forest, X, Y, cv=5)

array([1.        , 1.        , 1.        , 1.        , 0.96428571])

In [18]:
forest.fit(X,Y)

forest_features = pd.DataFrame()
forest_features['Importance'] = forest.feature_importances_
forest_features['Feature'] = X.columns

forest_features.sort_values('Importance', ascending=False).head(3)
#Two of three most important features in the forest are Attention and Murder_new, newly created features for the challenge

Unnamed: 0,Importance,Feature
6,0.3,Attention
0,0.2,Population
5,0.2,Robbery_new


# Observations

Juxtaposing neural network and random forest, we notice that with the right parameters, they both showed consistency across five folds. However, our RF was clearly overfitting, whereas our neural network was not. We were able to ascertain important features that were considered for RF, but not for NN. The two models varied in speed but that was negligible in this case. 