In [27]:
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
from sklearn import datasets, linear_model
from scipy.optimize import curve_fit
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
import statsmodels.api as sm 
import os

# County Level EDA

## 1. Median ZHVI wrt. crime rate

#### Results Summary:
Crime rate could predict counties with low GDP per capita. It's not a good factor for counties with high GDP per capita.

### 1.1 Correlation: Median ZHVI  with crime rate -- counties with high GDP per capita

Panel regression: choose year = 2014, choose state = CA, California

In [5]:
path = '/Users/liusulin/Desktop/Data Science/Project2/'

In [78]:
# read California county-level crime rate, and store the crime rate of county
columns = ["Agency", "Population", "Violent crime total"]
crime_CA_df = pd.read_csv(path + 'CACrimeOneYearofData.csv', usecols=columns, sep = ",").dropna()
CAcounty_crime_train = crime_CA_df.sort_values(by=["Population"])[['Violent crime total']][:30]
CAcounty_crime_test = crime_CA_df.sort_values(by=["Population"])[['Violent crime total']][30:45]
#normalize
CAcounty_crime_train = preprocessing.scale(CAcounty_crime_train)
CAcounty_crime_test = preprocessing.scale(CAcounty_crime_test)

In [79]:
median_df = pd.read_csv(path + 'County_MedianValuePerSqft_AllHomes.csv',encoding = "ISO-8859-1",index_col = None)
median_df
CAcounty_median = median_df.loc[median_df['State'] == 'CA'].sort_values(by = ['SizeRank'])[['2014-12']]
CAcounty_median_train = CAcounty_median[:30]
CAcounty_median_test = CAcounty_median[30:45]
#normalize
CAcounty_median_train = preprocessing.scale(CAcounty_median_train)
CAcounty_median_test = preprocessing.scale(CAcounty_median_test)

In [83]:
regr = linear_model.LinearRegression()

## Train the model using the training sets
regr.fit(CAcounty_crime_train, CAcounty_median_train)

# # Make predictions using the testing set
CAcounty_median_pred = regr.predict(CAcounty_crime_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(CAcounty_median_test, CAcounty_median_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(CAcounty_median_test, CAcounty_median_pred))


Coefficients: 
 [[ 0.02292069]]
Mean squared error: 1.02
Variance score: -0.02


In [86]:
CAcounty_crime_train_const = sm.add_constant(CAcounty_crime_train)
models = sm.OLS(CAcounty_median_train, CAcounty_crime_train_const)
result = models.fit()
result.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.035
Method:,Least Squares,F-statistic:,0.01472
Date:,"Fri, 30 Nov 2018",Prob (F-statistic):,0.904
Time:,21:26:17,Log-Likelihood:,-42.56
No. Observations:,30,AIC:,89.12
Df Residuals:,28,BIC:,91.92
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.696e-16,0.189,8.98e-16,1.000,-0.387,0.387
x1,0.0229,0.189,0.121,0.904,-0.364,0.410

0,1,2,3
Omnibus:,14.571,Durbin-Watson:,1.688
Prob(Omnibus):,0.001,Jarque-Bera (JB):,14.932
Skew:,1.445,Prob(JB):,0.000572
Kurtosis:,4.896,Cond. No.,1.0


#### Conclusion: For counties in high GDP state, the crime rate and house price is not significantly correlated
Possible reasons:
These areas are highly populated. When people choose their living areas, they may consider more on traffic, children's education, rather than crime rate.

### 1.2 Correlation: Median ZHVI  with crime rate -- counties with low GDP per capita

Panel regression: choose year = 2014, choose state = AZ, Arizona

In [102]:
# read Arizona county-level crime rate, and store the crime rate of county
columns = ["Agency", "Population", "Violent crime total"]
crime_AZ_df = pd.read_csv(path + 'AZCrimeOneYearofData.csv', usecols=columns, sep = ",").dropna()
AZcounty_crime = crime_AZ_df[['Violent crime total']]
#normalize
AZcounty_crime = preprocessing.scale(AZcounty_crime_train)[:14]

In [100]:
AZcounty_median = median_df.loc[median_df['State'] == 'AZ'].sort_values(by = ['SizeRank'])[['2014-12']]
AZcounty_median = AZcounty_median[:10]

In [101]:
AZcounty_crime_train_const = sm.add_constant(AZcounty_crime_train)
models = sm.OLS(AZcounty_median, AZcounty_crime_train_const)
result = models.fit()
result.summary()

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,2014-12,R-squared:,0.075
Model:,OLS,Adj. R-squared:,-0.041
Method:,Least Squares,F-statistic:,0.6444
Date:,"Fri, 30 Nov 2018",Prob (F-statistic):,0.445
Time:,21:49:10,Log-Likelihood:,-45.735
No. Observations:,10,AIC:,95.47
Df Residuals:,8,BIC:,96.07
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,103.7000,8.288,12.512,0.000,84.588,122.812
x1,-6.6531,8.288,-0.803,0.445,-25.766,12.459

0,1,2,3
Omnibus:,3.557,Durbin-Watson:,2.569
Prob(Omnibus):,0.169,Jarque-Bera (JB):,1.471
Skew:,0.939,Prob(JB):,0.479
Kurtosis:,3.078,Cond. No.,1.0


#### Conclusion: For counties in low GDP state, the crime rate and house price is significantly negatively correlated
Possible reasons:
