In [1]:
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from sklearn import linear_model
# output_notebook() below renders Bokeh plots inline in the notebook (not to a static HTML file)
import numpy as np
output_notebook()

In [2]:
# Load the Philadelphia-area house price / crime rate table into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
sales = pd.read_csv(filepath_or_buffer='Philadelphia_Crime_Rate_noNA.csv')

In [3]:
# Preview the first five rows (the default for head()) to check the columns loaded as expected.
sales.head(n=5)

Unnamed: 0,HousePrice,"HsPrc ($10,000)",CrimeRate,MilesPhila,PopChg,Name,County
0,140463,14.0463,29.7,10.0,-1.0,Abington,Montgome
1,113033,11.3033,24.1,18.0,4.0,Ambler,Montgome
2,124186,12.4186,19.5,25.0,8.0,Aston,Delaware
3,110490,11.049,49.4,25.0,2.7,Bensalem,Bucks
4,79124,7.9124,54.1,19.0,3.9,Bristol B.,Bucks


In [4]:

# Scatter plot of HousePrice against CrimeRate for every row of `sales`.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which Bokeh
# deprecated and later removed from `figure`.
p = figure(width=400, height=400,
           title='House price vs. crime rate',
           x_axis_label='CrimeRate', y_axis_label='HousePrice')

# add a circle-marker scatter renderer with a size, color, and alpha
# (scatter() draws screen-sized circle markers by default; circle(size=...) is
# deprecated in recent Bokeh releases)
p.scatter(x=sales['CrimeRate'], y=sales['HousePrice'], size=5, color="navy", alpha=0.5)

# show the results
show(p)

In [5]:
# Fit a one-feature linear regression of HousePrice on CrimeRate over all rows of `sales`.
crime_model = linear_model.LinearRegression(copy_X=False, n_jobs=-1)
# Train the model using the training sets.
# copy_X=False allows scikit-learn to overwrite the feature array it is handed,
# so fit() and predict() are each given their own fresh copy via np.array(...).
crime_model.fit(np.array(sales['CrimeRate']).reshape(-1, 1),
                np.array(sales['HousePrice']).reshape(-1, 1))

# The target was passed as a column (n, 1), so predict() returns a column too;
# flatten it to one value per row for plotting.
fitted_price = crime_model.predict(np.array(sales['CrimeRate']).reshape(-1, 1)).ravel()

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh deprecated
# and later removed from `figure`.
p = figure(width=400, height=400,
           title='House price vs. crime rate (all rows)',
           x_axis_label='CrimeRate', y_axis_label='HousePrice')

# add a circle-marker scatter renderer with a size, color, and alpha,
# then overlay the fitted regression line in green
p.scatter(x=sales['CrimeRate'], y=sales['HousePrice'], size=5, color="navy", alpha=0.5)
p.line(x=sales['CrimeRate'], y=fitted_price, color='green')
# show the results
show(p)

print('---- Coefficient -----')
# intercept_ and coef_ are size-1 arrays here because the target was 2-D.
# Calling float() directly on an array with ndim > 0 is deprecated in NumPy,
# so pull out the single element explicitly before converting.
pd.DataFrame({'name': pd.Series(['(intercept)', 'CrimeRate']),
              'value': pd.Series([float(np.ravel(crime_model.intercept_)[0]),
                                  float(np.ravel(crime_model.coef_)[0])])})

---- Coefficient -----


Unnamed: 0,name,value
0,(intercept),176629.408107
1,CrimeRate,-576.908128


In [6]:
# Keep only the rows whose MilesPhila value is non-zero.
# NOTE(review): presumably this drops the city-center observation ("noCC") — confirm against the data.
sales_noCC = sales.loc[sales['MilesPhila'].ne(0.0)]

In [7]:
# Refit the HousePrice ~ CrimeRate regression on `sales_noCC`
# (the rows of `sales` with MilesPhila != 0) to compare against the all-rows fit.
crime_model = linear_model.LinearRegression(copy_X=False, n_jobs=-1)
# Train the model using the training sets.
# copy_X=False allows scikit-learn to overwrite the feature array it is handed,
# so fit() and predict() are each given their own fresh copy via np.array(...).
crime_model.fit(np.array(sales_noCC['CrimeRate']).reshape(-1, 1),
                np.array(sales_noCC['HousePrice']).reshape(-1, 1))

# The target was passed as a column (n, 1), so predict() returns a column too;
# flatten it to one value per row for plotting.
fitted_price_noCC = crime_model.predict(np.array(sales_noCC['CrimeRate']).reshape(-1, 1)).ravel()

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh deprecated
# and later removed from `figure`.
p = figure(width=400, height=400,
           title='House price vs. crime rate (MilesPhila != 0)',
           x_axis_label='CrimeRate', y_axis_label='HousePrice')

# add a circle-marker scatter renderer with a size, color, and alpha,
# then overlay the fitted regression line in green
p.scatter(x=sales_noCC['CrimeRate'], y=sales_noCC['HousePrice'], size=5, color="navy", alpha=0.5)
p.line(x=sales_noCC['CrimeRate'], y=fitted_price_noCC, color='green')
# show the results
show(p)

print('---- Coefficient -----')
# intercept_ and coef_ are size-1 arrays here because the target was 2-D.
# Calling float() directly on an array with ndim > 0 is deprecated in NumPy,
# so pull out the single element explicitly before converting.
pd.DataFrame({'name': pd.Series(['(intercept)', 'CrimeRate']),
              'value': pd.Series([float(np.ravel(crime_model.intercept_)[0]),
                                  float(np.ravel(crime_model.coef_)[0])])})

---- Coefficient -----


Unnamed: 0,name,value
0,(intercept),225233.551839
1,CrimeRate,-2288.68943


#### Remove high-end points (houses priced at $350,000 or more)

In [10]:
# Drop the rows of `sales_noCC` with HousePrice of 350000 or more, then refit
# the HousePrice ~ CrimeRate regression on what remains.
sales_nohighend = sales_noCC[sales_noCC['HousePrice'] < 350000]
crime_model_nohighend = linear_model.LinearRegression(copy_X=False, n_jobs=-1)
# copy_X=False allows scikit-learn to overwrite the feature array it is handed;
# np.array(...) builds a fresh copy, so the DataFrame itself is not touched.
crime_model_nohighend.fit(np.array(sales_nohighend['CrimeRate']).reshape(-1, 1),
                          np.array(sales_nohighend['HousePrice']).reshape(-1, 1))
print('---- Coefficient -----')
# intercept_ and coef_ are size-1 arrays here because the target was 2-D.
# Calling float() directly on an array with ndim > 0 is deprecated in NumPy,
# so pull out the single element explicitly before converting.
pd.DataFrame({'name': pd.Series(['(intercept)', 'CrimeRate']),
              'value': pd.Series([float(np.ravel(crime_model_nohighend.intercept_)[0]),
                                  float(np.ravel(crime_model_nohighend.coef_)[0])])})

---- Coefficient -----


Unnamed: 0,name,value
0,(intercept),199098.85267
1,CrimeRate,-1838.562649
