In [166]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn import metrics
%matplotlib inline
pd.options.display.float_format='{:.3f}'.format

import warnings
warnings.filterwarnings(action='ignore', module='scipy', message='^internal gelsd')

In [184]:
#Prepare and clean this data to model according to this formula:Property crime = \alpha + Population + Population-squared + Murder + Robbery

df=pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls', header=4)
df_2014=pd.read_excel('Table_8_Offenses_Known_to_Law_Enforcement_by_New_York_by_City_2014.xls', header=4)

## Preparing Dataset, Model

In [185]:
#calculate the median, std and the limit for outliers
#turn outliers into None
median=df.Population.median()
std_dev=df.Population.std()
pop_outliers=median+2*std_dev
df['Population']=df.Population.map(lambda x: x if x < pop_outliers else None)

median=df['Murder and\nnonnegligent\nmanslaughter'].median()
std_dev=df['Murder and\nnonnegligent\nmanslaughter'].std()
murder_outliers=median + 2*std_dev
df['Murder and\nnonnegligent\nmanslaughter']=df['Murder and\nnonnegligent\nmanslaughter'].map(lambda x: x if x < murder_outliers else None)

median=df.Robbery.median()
std_dev=df.Robbery.std()
rob_outliers=median+2*std_dev
df['Robbery']=df.Robbery.map(lambda x: x if x < rob_outliers else None)

median=df['Property\ncrime'].median()
std_dev=df['Property\ncrime'].std()
prop_crime_outliers=median+2* std_dev
df['Property_crime']=df['Property\ncrime'].map(lambda x: x if x < prop_crime_outliers else None)

df.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,Property_crime
0,Adams Village,1861.0,0.0,0.0,,0.0,0.0,0.0,12.0,2.0,10.0,0.0,0.0,12.0
1,Addison Town and Village,2577.0,3.0,0.0,,0.0,0.0,3.0,24.0,3.0,20.0,1.0,0.0,24.0
2,Akron Village,2846.0,3.0,0.0,,0.0,0.0,3.0,16.0,1.0,15.0,0.0,0.0,16.0
3,Albany,97956.0,791.0,8.0,,30.0,227.0,526.0,4090.0,705.0,3243.0,142.0,,4090.0
4,Albion Village,6388.0,23.0,0.0,,3.0,4.0,16.0,223.0,53.0,165.0,5.0,,223.0


In [186]:
#introduce Population squared
df['Population_Sq']=df['Population']**2

In [187]:
df['Murder']=df['Murder and\nnonnegligent\nmanslaughter'].dropna().map(lambda x:1 if x > 0 else 0)
df['Robbery']=df.Robbery.dropna().map(lambda x:1 if x > 0 else 0)
#murder and robbery now categorical features (Y/N)

In [188]:
data=df[['Population','Population_Sq','Murder','Robbery','Property_crime']].dropna()
data.head()

Unnamed: 0,Population,Population_Sq,Murder,Robbery,Property_crime
0,1861.0,3463321.0,0.0,0.0,12.0
1,2577.0,6640929.0,0.0,0.0,24.0
2,2846.0,8099716.0,0.0,0.0,16.0
3,97956.0,9595377936.0,1.0,1.0,4090.0
4,6388.0,40806544.0,0.0,1.0,223.0


## Initial Model - Simple Linear Regression

In [189]:
regr=linear_model.LinearRegression()
Y=data['Property_crime']
X=data[['Population','Population_Sq','Murder','Robbery']]
regr.fit(X, Y)

#Review the outcome
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared: \n')
print(regr.score(X, Y))


Coefficients: 
 [ 2.62320223e-02 -3.30058846e-08  1.70080551e+02  1.27606800e+01]

Intercept: 
 -70.88477571593825

R-squared: 

0.7135336089794011


## Cross Validation

In [209]:
from sklearn import cross_validation
from sklearn import model_selection

#Split data into train and test
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.25, random_state=222)
regr.fit(X_train, y_train)
y_pred=regr.predict(X_test)
accuracy=regr.score(X_test, y_test)
print('Test Accuracy:', accuracy)

#cross validation
scores=model_selection.cross_val_score(regr, X_train, y_train, cv=3)

#review outcome
print('Cross Validation\nScores:', scores)
print('Mean:', scores.mean())
print('Standard deviation:', scores.std())
RMSE_test=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('RMSE Test Data:', RMSE_test)

Test Accuracy: 0.6044083539343951
Cross Validation
Scores: [0.72916326 0.59135816 0.84034251]
Mean: 0.7202879784789497
Standard deviation: 0.10184098861197906
RMSE Test Data: 547.8202818329678


## Ordinary Least Squares Regression

### OLS 1

In [205]:
linear_formula='Property_crime ~ Population+Population_Sq+Murder+Robbery'

lf=smf.ols(formula=linear_formula, data=data).fit()

print('Coefficients:\n', lf.params)
print('\nP-values:\n', lf.pvalues)
print('\nR-squared:\n', lf.rsquared)
lf.conf_int()

Coefficients:
 Intercept       -70.885
Population        0.026
Population_Sq    -0.000
Murder          170.081
Robbery          12.761
dtype: float64

P-values:
 Intercept       0.016
Population      0.000
Population_Sq   0.019
Murder          0.006
Robbery         0.761
dtype: float64

R-squared:
 0.7135336089794011


Unnamed: 0,0,1
Intercept,-128.429,-13.34
Population,0.022,0.03
Population_Sq,-0.0,-0.0
Murder,48.925,291.236
Robbery,-69.581,95.102


### OLS 2

In [204]:
linear_formula2='Property_crime ~ Population+Murder+Robbery'

lf2=smf.ols(formula=linear_formula2, data=data).fit()
print('Coefficients:\n', lf2.params)
print('\nP-values:\n', lf2.pvalues)
print('\nR-squared:\n', lf2.rsquared)
lf2.conf_int()

Coefficients:
 Intercept    -53.405
Population     0.022
Murder       192.486
Robbery       44.234
dtype: float64

P-values:
 Intercept    0.062
Population   0.000
Murder       0.002
Robbery      0.269
dtype: float64

R-squared:
 0.7088744678562764


Unnamed: 0,0,1
Intercept,-109.429,2.619
Population,0.02,0.024
Murder,71.997,312.974
Robbery,-34.301,122.769


### OLS 3

In [206]:
linear_formula3='Property_crime ~ Population+Murder'

lf3=smf.ols(formula=linear_formula3, data=data).fit()
print('Coefficients:\n', lf3.params)
print('\nP-values:\n', lf3.pvalues)
print('\nR-squared:\n', lf3.rsquared)
lf3.conf_int()

Coefficients:
 Intercept    -32.989
Population     0.022
Murder       201.795
dtype: float64

P-values:
 Intercept    0.130
Population   0.000
Murder       0.001
dtype: float64

R-squared:
 0.7078266130021879


Unnamed: 0,0,1
Intercept,-75.72,9.742
Population,0.02,0.024
Murder,82.407,321.183


## Revised Model (Drop Pop_sq)

In [194]:
#Instantiate and fit the model
regr2=linear_model.LinearRegression()
Y2=data['Property_crime']
X2=data[['Population','Murder','Robbery']]
regr2.fit(X2, Y2)

#inspect the results
print('Coefficients: \n', list(zip(X2.columns, regr2.coef_)))
print('\nIntercept: \n', regr2.intercept_)
print('\nR-squared: \n', regr2.score(X2, Y2))

Coefficients: 
 [('Population', 0.02205613985643152), ('Murder', 192.48583974233398), ('Robbery', 44.23431331727557)]

Intercept: 
 -53.40501615830652

R-squared: 
 0.7088744678562764


In [210]:
#Splitting data into train and test
X2_train, X2_test, y2_train, y2_test = cross_validation.train_test_split(X2, Y2, test_size=0.25, random_state=222)
regr2.fit(X2_train, y2_train)
y2_pred=regr2.predict(X2_test)
accuracy2=regr2.score(X2_test, y2_test)
print('Test Accuracy:', accuracy2)

#cross validation on train data
scores2=model_selection.cross_val_score(regr2, X2_train, y2_train, cv=3)

print('\nCross Validation Scores:', scores2)
print('\nMean:', scores2.mean())
print('\nStandard deviation:', scores2.std())
RMSE_test2=np.sqrt(metrics.mean_squared_error(y2_test, y2_pred))
print('RMSE Test Data:', RMSE_test2)

Test Accuracy: 0.6335879299700926

Cross Validation Scores: [0.6506852  0.63870207 0.8001555 ]

Mean: 0.6965142573216911

Standard deviation: 0.07344852772227453
RMSE Test Data: 527.2291777163994


## Revised Model (Drop Pop_sq and Robbery)

In [196]:
#Instantiate and fit the model
regr3=linear_model.LinearRegression()
Y3=data['Property_crime']
X3=data[['Population','Murder']]
regr3.fit(X3, Y3)

#inspect the results
print('Coefficients: \n', list(zip(X3.columns, regr3.coef_)))
print('\nIntercept: \n', regr3.intercept_)
print('\nR-squared: \n', regr3.score(X3, Y3))

Coefficients: 
 [('Population', 0.02237984425144918), ('Murder', 201.7950262766855)]

Intercept: 
 -32.988681606991804

R-squared: 
 0.7078266130021879


In [202]:
#Splitting data into train and test
X3_train, X3_test, y3_train, y3_test=cross_validation.train_test_split(X3, Y3, test_size=0.25, random_state=222)
regr3.fit(X3_train, y3_train)
y3_pred=regr3.predict(X3_test)
accuracy3=regr3.score(X3_test, y3_test)
print('Accuracy of Test Data:', accuracy3)

#cross validation on train data
scores3=model_selection.cross_val_score(regr3, X3_train, y3_train, cv=3)

print('\nCross Validation Scores:', scores3)
print('\nMean:', scores3.mean())
print('\nStandard deviation:', scores3.std())
RMSE_test3=np.sqrt(metrics.mean_squared_error(y3_test, y3_pred))
print('RMSE Test Data:', RMSE_test3)

Accuracy of Test Data: 0.6443131995126482

Cross Validation Scores: [0.63499857 0.63392671 0.7982762 ]

Mean: 0.6890671608817147

Standard deviation: 0.07722368962328911
RMSE Test Data: 519.4555916179131


## Validating Model with 2014 Data

In [198]:
#Preparing Data and features for 2014 dataset
df_2014['Murder']=df_2014['Murder and\nnonnegligent\nmanslaughter'].dropna().map(lambda x: 1 if x > 0 else 0)
df_2014['Robbery']=df_2014.Robbery.dropna().map(lambda x: 1 if x > 0 else 0)
df_2014['Property_crime']=df_2014['Property\ncrime']
data_2014=df_2014[['Population', 'Murder', 'Robbery', 'Property_crime']].dropna()
data_2014.head(15)

Unnamed: 0,Population,Murder,Robbery,Property_crime
0,1851.0,0.0,0.0,11.0
1,2568.0,0.0,1.0,49.0
2,820.0,0.0,0.0,1.0
3,2842.0,0.0,0.0,17.0
4,98595.0,1.0,1.0,3888.0
5,5872.0,0.0,1.0,204.0
6,1107.0,0.0,0.0,7.0
7,4032.0,1.0,0.0,30.0
8,1723.0,0.0,0.0,2.0
9,118860.0,1.0,1.0,2066.0


In [208]:
#using second linear regression to predict 2014 data
y_pred_2014=lf2.predict(data_2014[['Population','Murder','Robbery']])

RMSE_2014=np.sqrt(metrics.mean_squared_error(data_2014['Property_crime'], y_pred_2014))
print('RMSE New York 2014 Data:', RMSE_2014)

RMSE New York 2014 Data: 2723.3796965365273
