In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
import sys
import platform
import sklearn
import statsmodels.api as sm

print(sys.version_info)
print("Python Version: " + str(platform.python_version()))

sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)
Python Version: 3.6.8


In [10]:
from sklearn.feature_selection import *
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [35]:
#path = "/Users/zhiwei/Documents/Spring-2019/STAT-222/sf-crime/"
crime_data = pd.read_csv("../../data/data_include_all_features.csv")
crime_data = crime_data[crime_data.year<2017]
crime_data.shape

(1562, 176)

In [36]:
crime_data.columns.values

array(['GEOID', 'year', 'quarter', 'N', 'N_calls_311', 'lat', 'lng',
       'N_housing', 'V1', 'Estimate_Total.x', 'prop_rented', 'prop_male',
       'prop_african_american', 'prop_under_poverty_level',
       'prop_vacant_houses', 'prop_stable', 'racial_index',
       'income_index', 'age_index', 'working_class', 'land', 'water',
       'Estimate_Total.y', 'Female', 'Female_10_to_14_years',
       'Female_10th_grade', 'Female_11th_grade',
       'Female_12th_grade_no_diploma', 'Female_15_to_17_years',
       'Female_18_and_19_years', 'Female_20_years', 'Female_21_years',
       'Female_22_to_24_years', 'Female_25_to_29_years',
       'Female_30_to_34_years', 'Female_35_to_39_years',
       'Female_40_to_44_years', 'Female_45_to_49_years',
       'Female_5_to_9_years', 'Female_50_to_54_years',
       'Female_55_to_59_years', 'Female_5th_and_6th_grade',
       'Female_60_and_61_years', 'Female_62_to_64_years',
       'Female_65_and_66_years', 'Female_67_to_69_years',
       'Female_70_t

# Select features according to the k highest scores

In [48]:
X = crime_data.iloc[:,4:170]
y = crime_data.N

#apply SelectKBest class to extract top 20 best features
bestfeatures = SelectKBest(score_func=f_regression, k=20)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(20,'Score'))  #print 20 best features

                                                 Specs        Score
89   Income_in_the_past_12_months_below_poverty_lev...  2341.283489
85   Income_in_the_past_12_months_below_poverty_lev...  1542.452875
0                                          N_calls_311  1470.594024
79    Income_in_the_past_12_months_below_poverty_level   915.512423
159                         With_income_10000_to_14999   864.855715
153                                    Renter_occupied   786.048537
141                      Moved_within_same_county_Male   674.957762
90                                                Male   636.501017
14                                           age_index   605.178816
22                                   Female_11th_grade   525.035923
135  Moved_from_different_county_within_same_state_...   519.434659
106                                Male_50_to_54_years   517.696430
139                           Moved_within_same_county   468.974796
103                                Male_40_to_44

array(['Income_in_the_past_12_months_below_poverty_level_Male',
       'Income_in_the_past_12_months_below_poverty_level_Female_In_labor_force',
       'GEOID'], dtype=object)

In [38]:
#Then use only number of thefts as y
X = crime_data.iloc[:,4:170][-np.isnan(crime_data.N_theft)]
y = crime_data.N_theft[-np.isnan(crime_data.N_theft)]

#apply SelectKBest class to extract top 20 best features
bestfeatures = SelectKBest(score_func=f_regression, k=20)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(20,'Score'))  #print 20 best features

                                                 Specs        Score
89   Income_in_the_past_12_months_below_poverty_lev...  1619.419726
0                                          N_calls_311  1175.872340
85   Income_in_the_past_12_months_below_poverty_lev...   898.982013
141                      Moved_within_same_county_Male   841.761179
135  Moved_from_different_county_within_same_state_...   815.803150
14                                           age_index   794.148685
153                                    Renter_occupied   735.577756
90                                                Male   696.643842
133      Moved_from_different_county_within_same_state   623.847363
139                           Moved_within_same_county   614.820240
103                                Male_40_to_44_years   595.238557
6                                          prop_rented   584.030056
136                         Moved_from_different_state   582.444623
102                                Male_35_to_39

In [39]:
#Number of robberies as y
X = crime_data.iloc[:,4:170][-np.isnan(crime_data.N_robbery)]
y = crime_data.N_robbery[-np.isnan(crime_data.N_robbery)]

#apply SelectKBest class to extract top 20 best features
bestfeatures = SelectKBest(score_func=f_regression, k=20)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(20,'Score'))  #print 20 best features

                                                 Specs        Score
89   Income_in_the_past_12_months_below_poverty_lev...  1840.415422
85   Income_in_the_past_12_months_below_poverty_lev...  1458.834980
159                         With_income_10000_to_14999   907.981625
79    Income_in_the_past_12_months_below_poverty_level   785.363755
0                                          N_calls_311   699.307168
22                                   Female_11th_grade   524.860506
153                                    Renter_occupied   477.222925
104                                Male_45_to_49_years   418.141331
122   Male_High_school_graduate_(includes_equivalency)   365.482859
93                                     Male_11th_grade   341.563618
90                                                Male   330.683160
106                                Male_50_to_54_years   326.118720
39                               Female_62_to_64_years   305.984915
109                               Male_60_and_61

In [15]:
#Number of assualts as y
# X = crime_data.iloc[:,4:170][-np.isnan(crime_data.N_assualt)]
# y = crime_data.N_assualt[-np.isnan(crime_data.N_assualt)]
crime_assualt = pd.read_csv("../../data/crime_assualt_data.csv")
crime_assualt = crime_assualt[crime_assualt.year<2017]

X = crime_assualt.iloc[:,6:183]
y = crime_assualt.crime

# standardizing features
X_scaler = StandardScaler()
X_scaler.fit(X)
X_scale = X_scaler.transform(X)
X_scale = pd.DataFrame(X_scale, columns=X.columns)

#apply SelectKBest class to extract top 20 best features
bestfeatures = SelectKBest(score_func=f_regression, k=20)
fit = bestfeatures.fit(X_scale,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_scale.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(20,'Score'))  #print 20 best features
#print(X.columns.values[[70,66,60,140,161,103,67,122,134,65]])

                                                 Specs        Score
70   Income_in_the_past_12_months_below_poverty_lev...  3807.483306
66   Income_in_the_past_12_months_below_poverty_lev...  3664.761506
60    Income_in_the_past_12_months_below_poverty_level  2363.200815
140                         With_income_10000_to_14999  1894.075160
161                           prop_under_poverty_level  1325.716854
103   Male_High_school_graduate_(includes_equivalency)  1149.602393
67   Income_in_the_past_12_months_below_poverty_lev...  1024.132900
122                      Moved_within_same_county_Male   956.889149
134                                    Renter_occupied   932.412842
65   Income_in_the_past_12_months_below_poverty_lev...   927.661230
69   Income_in_the_past_12_months_below_poverty_lev...   926.446118
74                                     Male_11th_grade   908.227230
126  Not_Hispanic_or_Latino_Black_or_African_Americ...   781.124652
61   Income_in_the_past_12_months_below_poverty_

  return self.partial_fit(X, y)
  del sys.path[0]


In [41]:
#Number of vehicle thefts as y
X = crime_data.iloc[:,4:170][-np.isnan(crime_data.N_vehicle_theft)]
y = crime_data.N_vehicle_theft[-np.isnan(crime_data.N_vehicle_theft)]

#apply SelectKBest class to extract top 20 best features
bestfeatures = SelectKBest(score_func=f_regression, k=20)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(20,'Score'))  #print 20 best features

                                                 Specs        Score
0                                          N_calls_311  1061.366849
103                                Male_40_to_44_years   492.560492
90                                                Male   486.411329
76   Income_in_the_past_12_months_at_or_above_pover...   427.497589
14                                           age_index   425.021969
75   Income_in_the_past_12_months_at_or_above_pover...   422.997936
74   Income_in_the_past_12_months_at_or_above_pover...   362.686278
102                                Male_35_to_39_years   358.318390
106                                Male_50_to_54_years   353.880428
157                                        With_income   352.008799
156                         Same_house_1_year_ago_Male   332.331611
2                                                  lng   330.134682
89   Income_in_the_past_12_months_below_poverty_lev...   304.439091
5                                     Estimate_T

In [42]:
#Number of burglaries as y
X = crime_data.iloc[:,4:170][-np.isnan(crime_data.N_burglary)]
y = crime_data.N_burglary[-np.isnan(crime_data.N_burglary)]

#apply SelectKBest class to extract top 20 best features
bestfeatures = SelectKBest(score_func=f_regression, k=20)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(20,'Score'))  #print 20 best features

                                                 Specs        Score
89   Income_in_the_past_12_months_below_poverty_lev...  1092.997957
0                                          N_calls_311   956.343072
85   Income_in_the_past_12_months_below_poverty_lev...   694.455640
153                                    Renter_occupied   682.925516
141                      Moved_within_same_county_Male   670.520501
135  Moved_from_different_county_within_same_state_...   632.683357
14                                           age_index   629.843258
136                         Moved_from_different_state   552.193811
90                                                Male   513.964513
139                           Moved_within_same_county   513.086913
138                    Moved_from_different_state_Male   512.784028
101                                Male_30_to_34_years   478.315024
133      Moved_from_different_county_within_same_state   478.265161
6                                          prop_

# Recursive Feature Elimination

In [43]:
#Recursive Feature Elimination
cols = list(X.columns)
model = LinearRegression()

#Initializing RFE model
rfe = RFE(model, 20)    

#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)  

#Fitting the data to model
model.fit(X_rfe,y)              
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

Index(['lat', 'lng', 'prop_rented', 'prop_male', 'prop_african_american',
       'prop_under_poverty_level', 'prop_stable', 'racial_index',
       'working_class', 'Female_11th_grade', 'Female_20_years',
       'Female_62_to_64_years', 'Hispanic_or_Latino_Asian_alone',
       'Male_10th_grade', 'Male_11th_grade', 'Male_12th_grade_no_diploma',
       'Male_60_and_61_years', 'Male_80_to_84_years',
       'Not_Hispanic_or_Latino_American_Indian_and_Alaska_Native_alone',
       'Not_Hispanic_or_Latino_Two_or_more_races_Two_races_including_Some_other_race'],
      dtype='object')


# Backward Elimination

In [44]:
#Backward Elimination
X = crime_data.iloc[:,4:170]
y = crime_data.N
cols = list(X.columns)
pmax = 1
while (len(cols)>0):
    p= []
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(y,X_1).fit()
    p = pd.Series(model.pvalues.values[1:],index = cols)      
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.01):
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols
print(selected_features_BE)

  return ptp(axis=axis, out=out, **kwargs)


['N_calls_311', 'lat', 'lng', 'V1', 'prop_rented', 'prop_african_american', 'prop_under_poverty_level', 'income_index', 'age_index', 'working_class', 'land', 'water', 'Female', 'Female_10th_grade', 'Female_11th_grade', 'Female_50_to_54_years', 'Female_60_and_61_years', 'Female_62_to_64_years', 'Female_70_to_74_years', 'Female_7th_and_8th_grade', 'Female_85_years_and_over', "Female_Master's_degree", 'Female_No_schooling_completed', 'Female_Under_5_years', 'Hispanic_or_Latino_American_Indian_and_Alaska_Native_alone', 'Hispanic_or_Latino_Two_or_more_races_Two_races_including_Some_other_race', 'Hispanic_or_Latino_White_alone', 'Income_in_the_past_12_months_at_or_above_poverty_level_Female', 'Income_in_the_past_12_months_at_or_above_poverty_level_Female_In_labor_force_Employed', 'Income_in_the_past_12_months_at_or_above_poverty_level_Male', 'Income_in_the_past_12_months_at_or_above_poverty_level_Male_In_labor_force', 'Income_in_the_past_12_months_at_or_above_poverty_level_Male_In_labor_forc

In [45]:
model = sm.OLS(y,X[selected_features_BE]).fit()
model.summary()

0,1,2,3
Dep. Variable:,N,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.949
Method:,Least Squares,F-statistic:,458.7
Date:,"Mon, 06 May 2019",Prob (F-statistic):,0.0
Time:,17:33:48,Log-Likelihood:,-9225.4
No. Observations:,1562,AIC:,18580.0
Df Residuals:,1499,BIC:,18910.0
Df Model:,63,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
N_calls_311,0.1569,0.008,18.723,0.000,0.140,0.173
lat,837.9814,194.123,4.317,0.000,457.200,1218.763
lng,267.7464,59.683,4.486,0.000,150.676,384.817
V1,-0.0500,0.009,-5.326,0.000,-0.068,-0.032
prop_rented,3010.2382,293.893,10.243,0.000,2433.753,3586.723
prop_african_american,-573.2782,77.262,-7.420,0.000,-724.831,-421.725
prop_under_poverty_level,593.4016,87.631,6.772,0.000,421.510,765.293
income_index,-0.2627,0.026,-10.030,0.000,-0.314,-0.211
age_index,-0.0991,0.031,-3.215,0.001,-0.160,-0.039

0,1,2,3
Omnibus:,343.403,Durbin-Watson:,0.806
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2658.6
Skew:,0.807,Prob(JB):,0.0
Kurtosis:,9.184,Cond. No.,1.29e+16
