# PART 5: Regression

Haoyu Yue, Department of Urban Design and Planning, University of Washington

## Preparation

In [77]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import contextily as ctx
from matplotlib_scalebar.scalebar import ScaleBar
from matplotlib.patches import Patch
import math
import statsmodels.api as sm

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [567]:
sf_demo_gen_pattern = gpd.read_file('data/regression/sf_demo_gen_pattern.geojson')
sf_invest = gpd.read_file('data/regression/sf_invest.geojson')

In [568]:
invest_types = ['research and planning', 'fire prevention', 'climate action',
       'public transportation', 'housing', 'green space', 'others', 'vehicle',
       'building', 'utilities', 'agriculture', 'clean air']

In [608]:
logistic_data = sf_invest.groupby('GEOID')[invest_types].sum()

In [609]:
popu_area = sf_demo_gen_pattern.popu_2020 / 1000 * sf_demo_gen_pattern.area / 1000000
popu_area.index = logistic_data.index

In [610]:
for i in invest_types:
    logistic_data.loc[:,i] = (logistic_data.loc[:,i]/popu_area).apply(np.log).replace(-np.inf,0) #((logistic_data.loc[:,i]+1).apply(np.log)/popu_area)  

In [611]:
logistic_data = logistic_data.merge(sf_demo_gen_pattern[['GEOID','gentrified']],left_index=True,right_on='GEOID').replace(['Gentrified','Non-Gentrified'],[1,0])

In [612]:
logistic_data = logistic_data.set_index('GEOID')

In [613]:
logistic_data

Unnamed: 0_level_0,research and planning,fire prevention,climate action,public transportation,housing,green space,others,vehicle,building,utilities,agriculture,clean air,gentrified
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
060014001001,0.0,10.162304,9.471991,14.245763,0.0,0.000000,0.000000,13.717784,0.0,0.000000,0.0,0.0,0
060014002001,0.0,0.000000,4.750078,9.813422,0.0,0.000000,0.000000,12.091623,0.0,7.874392,0.0,0.0,0
060014002002,0.0,0.000000,6.026793,11.181073,0.0,0.000000,0.000000,13.307718,0.0,10.960395,0.0,0.0,0
060014003001,0.0,0.000000,4.880356,10.004151,0.0,0.000000,0.000000,12.302858,0.0,8.288054,0.0,0.0,0
060014003002,0.0,0.000000,5.768792,11.121144,0.0,0.000000,0.000000,13.109378,0.0,10.776926,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
060952521021,0.0,0.000000,4.581980,8.317108,0.0,0.000000,6.835333,13.472421,0.0,0.000000,0.0,0.0,1
060971507023,0.0,0.000000,0.000000,9.902931,0.0,7.115122,1.402460,11.855891,0.0,0.000000,0.0,0.0,0
060971508004,0.0,0.000000,0.000000,13.047788,0.0,10.259979,4.085448,14.927394,0.0,0.000000,0.0,0.0,1
060971511001,0.0,0.000000,0.000000,0.000000,0.0,10.072836,0.608868,14.813605,0.0,0.000000,0.0,0.0,0


In [614]:
logistic_list = ['fire prevention', 'climate action',
       'public transportation', 'housing', 'green space', 'vehicle',
       'building', 'utilities',  'gentrified']
logistic_data = logistic_data[logistic_list]

In [615]:
sf_demo_gen_pattern['white_rate'] = (sf_demo_gen_pattern['white_20']/sf_demo_gen_pattern['popu_2020'])/(sf_demo_gen_pattern['white_15']/sf_demo_gen_pattern['popu_2015'])-1
sf_demo_gen_pattern['edu_rate'] = (sf_demo_gen_pattern['edu_2020'])/(sf_demo_gen_pattern['edu_2015'])-1
sf_demo_gen_pattern['popu_rate'] = (sf_demo_gen_pattern['popu_2020'])/(sf_demo_gen_pattern['popu_2015'])-1
sf_demo_gen_pattern['income_rate'] = (sf_demo_gen_pattern['Median HH Income (in 2020 dollars)'])/(sf_demo_gen_pattern['Median HH Income (in 2015 dollars)'])-1
sf_demo_gen_pattern['rent_rate'] = (sf_demo_gen_pattern['Median Gross Rent 2020'])/(sf_demo_gen_pattern['Median Gross Rent 2015'])-1
sf_demo_gen_pattern['white_rate_15'] = sf_demo_gen_pattern['white_15']/sf_demo_gen_pattern['popu_2015']
sf_demo_gen_pattern['density'] = sf_demo_gen_pattern['popu_2015']/sf_demo_gen_pattern['area']


In [616]:
ols_data = logistic_data.merge(sf_demo_gen_pattern[['white_rate','edu_rate','popu_rate','income_rate','rent_rate','GEOID']],left_index=True,right_on='GEOID',how='left')
logistic_data = logistic_data.merge(sf_demo_gen_pattern[['white_rate_15','density','edu_2015','Median Gross Rent 2015','GEOID']],left_index=True,right_on='GEOID',how='left')


In [617]:
logistic_data = logistic_data.replace(np.inf,np.nan).dropna()
logistic_data['Median Gross Rent 2015'] = logistic_data['Median Gross Rent 2015'].apply(np.log)

## occupied data

In [618]:
occupied = pd.read_csv('data/ACS_5Y/occupy/occupy.csv')

In [619]:
occupied['geoid'] = occupied['GEOID'].str.slice(-12,)
occupied['ownership_rate'] = occupied['Owner occupied']/occupied['Total']
occupied = occupied[['geoid','ownership_rate']]

In [620]:
logistic_data = logistic_data.merge(occupied,left_on='GEOID',right_on='geoid',how='left')
ols_data = ols_data.merge(occupied,left_on='GEOID',right_on='geoid',how='left')

In [621]:
potential_area = pd.read_csv('potential_area.csv')
potential_area_list = list(potential_area['0'])

In [622]:
logistic_data = logistic_data[logistic_data.index.isin(potential_area_list)]

In [623]:
ols_data['gen_index'] = ((
    ols_data.edu_rate.mean() - ols_data.edu_rate)/(ols_data.edu_rate.std()) + (
        ols_data.income_rate.mean() - ols_data.income_rate)/(ols_data.income_rate.std()) + (
        ols_data.rent_rate.mean() - ols_data.rent_rate)/(ols_data.rent_rate.std()))/3

In [624]:
ols_data = ols_data.dropna()

In [625]:
logistic_data = logistic_data[['fire prevention', 'climate action', 'public transportation', 'housing',
       'green space', 'vehicle', 'building', 'utilities', 'gentrified',
       'white_rate_15','edu_2015', 'Median Gross Rent 2015','density',
       'GEOID', 'ownership_rate']]

In [626]:
logistic_data

Unnamed: 0,fire prevention,climate action,public transportation,housing,green space,vehicle,building,utilities,gentrified,white_rate_15,edu_2015,Median Gross Rent 2015,density,GEOID,ownership_rate
0,10.162304,9.471991,14.245763,0.0,0.000000,13.717784,0.000000,0.000000,0,0.763889,17.204545,8.064486,0.000266,060014001001,0.855365
2,0.000000,6.026793,11.181073,0.0,0.000000,13.307718,0.000000,10.960395,0,0.842278,16.822768,7.514756,0.001904,060014002002,0.577428
3,0.000000,4.880356,10.004151,0.0,0.000000,12.302858,0.000000,8.288054,0,0.675389,17.178357,7.279622,0.003313,060014003001,0.609856
6,0.000000,5.873636,11.091920,0.0,0.000000,13.208295,0.000000,10.859604,0,0.765212,16.162646,6.871548,0.003896,060014003004,0.603723
11,0.000000,3.779656,9.079782,0.0,0.000000,11.816639,0.000000,8.588065,0,0.607242,15.592295,7.390181,0.003292,060014005002,0.256140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,0.000000,6.972229,13.928216,0.0,12.069987,16.728282,0.000000,0.000000,0,0.355375,14.817333,7.679362,0.002542,060816140001,0.583241
2541,0.000000,-2.432063,12.908133,0.0,0.000000,14.914493,12.969339,0.000000,0,0.241149,13.052471,7.728557,0.035171,060855044222,0.842000
2543,0.000000,-4.918769,11.564936,0.0,0.000000,13.552258,11.626142,0.000000,1,0.193029,12.497337,7.389515,0.105516,060855045063,0.478535
2548,9.025959,0.000000,0.000000,0.0,9.247855,13.906150,0.000000,0.000000,0,0.880231,17.108997,7.745055,0.009323,060871205001,0.840989


In [628]:
logistic_data = logistic_data[logistic_data.density > 0.001]
logistic_data

Unnamed: 0,fire prevention,climate action,public transportation,housing,green space,vehicle,building,utilities,gentrified,white_rate_15,edu_2015,Median Gross Rent 2015,density,GEOID,ownership_rate
2,0.000000,6.026793,11.181073,0.0,0.000000,13.307718,0.000000,10.960395,0,0.842278,16.822768,7.514756,0.001904,060014002002,0.577428
3,0.000000,4.880356,10.004151,0.0,0.000000,12.302858,0.000000,8.288054,0,0.675389,17.178357,7.279622,0.003313,060014003001,0.609856
6,0.000000,5.873636,11.091920,0.0,0.000000,13.208295,0.000000,10.859604,0,0.765212,16.162646,6.871548,0.003896,060014003004,0.603723
11,0.000000,3.779656,9.079782,0.0,0.000000,11.816639,0.000000,8.588065,0,0.607242,15.592295,7.390181,0.003292,060014005002,0.256140
14,0.000000,4.404000,10.809239,0.0,0.000000,12.434161,0.000000,9.545845,1,0.508097,15.515625,6.951160,0.002343,060014006002,0.641148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,0.000000,6.972229,13.928216,0.0,12.069987,16.728282,0.000000,0.000000,0,0.355375,14.817333,7.679362,0.002542,060816140001,0.583241
2541,0.000000,-2.432063,12.908133,0.0,0.000000,14.914493,12.969339,0.000000,0,0.241149,13.052471,7.728557,0.035171,060855044222,0.842000
2543,0.000000,-4.918769,11.564936,0.0,0.000000,13.552258,11.626142,0.000000,1,0.193029,12.497337,7.389515,0.105516,060855045063,0.478535
2548,9.025959,0.000000,0.000000,0.0,9.247855,13.906150,0.000000,0.000000,0,0.880231,17.108997,7.745055,0.009323,060871205001,0.840989


In [633]:
ols = logistic_data.merge(ols_data[['GEOID','gen_index']],how='left',on='GEOID')
ols.columns

Index(['fire prevention', 'climate action', 'public transportation', 'housing',
       'green space', 'vehicle', 'building', 'utilities', 'gentrified',
       'white_rate_15', 'edu_2015', 'Median Gross Rent 2015', 'density',
       'GEOID', 'ownership_rate', 'gen_index'],
      dtype='object')

In [647]:
ols

Unnamed: 0,fire prevention,climate action,public transportation,housing,green space,vehicle,building,utilities,gentrified,white_rate_15,edu_2015,Median Gross Rent 2015,density,GEOID,ownership_rate,gen_index
0,0.000000,6.026793,11.181073,0.0,0.000000,13.307718,0.000000,10.960395,0,0.842278,16.822768,7.514756,0.001904,060014002002,0.577428,0.170236
1,0.000000,4.880356,10.004151,0.0,0.000000,12.302858,0.000000,8.288054,0,0.675389,17.178357,7.279622,0.003313,060014003001,0.609856,-0.351731
2,0.000000,5.873636,11.091920,0.0,0.000000,13.208295,0.000000,10.859604,0,0.765212,16.162646,6.871548,0.003896,060014003004,0.603723,-0.849576
3,0.000000,3.779656,9.079782,0.0,0.000000,11.816639,0.000000,8.588065,0,0.607242,15.592295,7.390181,0.003292,060014005002,0.256140,0.067271
4,0.000000,4.404000,10.809239,0.0,0.000000,12.434161,0.000000,9.545845,1,0.508097,15.515625,6.951160,0.002343,060014006002,0.641148,-1.466282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,0.000000,6.972229,13.928216,0.0,12.069987,16.728282,0.000000,0.000000,0,0.355375,14.817333,7.679362,0.002542,060816140001,0.583241,-0.215738
806,0.000000,-2.432063,12.908133,0.0,0.000000,14.914493,12.969339,0.000000,0,0.241149,13.052471,7.728557,0.035171,060855044222,0.842000,0.004756
807,0.000000,-4.918769,11.564936,0.0,0.000000,13.552258,11.626142,0.000000,1,0.193029,12.497337,7.389515,0.105516,060855045063,0.478535,-0.942245
808,9.025959,0.000000,0.000000,0.0,9.247855,13.906150,0.000000,0.000000,0,0.880231,17.108997,7.745055,0.009323,060871205001,0.840989,1.490150


In [652]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = logistic_data.drop(["gentrified",'GEOID'],1)
y = logistic_data["gentrified"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)


  X = logistic_data.drop(["gentrified",'GEOID'],1)


In [559]:
from numpy import random
random.seed(3)

clf = linear_model.LogisticRegressionCV(cv = 5, Cs=[0.001,0.005,0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50],max_iter=1000)
LogisticCV = clf.fit(X=X_train,y=y_train)

y_pred_train = LogisticCV.predict(X_train)

print('Intercept:', LogisticCV.intercept_)
print('Coefficients:',LogisticCV.coef_)
print('Penalty value', LogisticCV.C_)
print('Training Accuracy Score',accuracy_score(y_train,y_pred_train))

Intercept: [-0.28776355]
Coefficients: [[-1.17307989e-02 -9.00708385e-03  4.78273066e-02  2.35501292e-02
  -2.94752460e-03 -4.04345949e-03  3.46750665e-03  1.79701234e-02
  -4.93870138e-03 -9.59556472e-02 -2.36911516e-02  7.24111278e-05
  -9.38035863e-03]]
Penalty value [0.001]
Training Accuracy Score 0.7916666666666666


In [649]:
X_train_log = sm.add_constant(X)
log_reg = sm.Logit(y, X_train_log).fit()
print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.422186
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             gentrified   No. Observations:                  810
Model:                          Logit   Df Residuals:                      796
Method:                           MLE   Df Model:                           13
Date:                Sun, 22 May 2022   Pseudo R-squ.:                  0.1676
Time:                        22:47:03   Log-Likelihood:                -341.97
converged:                       True   LL-Null:                       -410.81
Covariance Type:            nonrobust   LLR p-value:                 6.137e-23
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     17.3563      2.774      6.256      0.000      11.919      

  x = pd.concat(x[::order], 1)


In [654]:
ols = ols.dropna()
X_ols = ols[['fire prevention', 'climate action', 'public transportation', 'housing',
       'green space', 'vehicle', 'building', 'utilities', 
       'white_rate_15', 'edu_2015', 'Median Gross Rent 2015', 'density','ownership_rate']]
y_ols = ols["gen_index"]

X_train_ols = sm.add_constant(X_ols)
ols_reg = sm.OLS(y_ols, X_train_ols).fit()
print(ols_reg.summary())

                            OLS Regression Results                            
Dep. Variable:              gen_index   R-squared:                       0.174
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     11.43
Date:                Sun, 22 May 2022   Prob (F-statistic):           1.38e-22
Time:                        22:48:31   Log-Likelihood:                -791.85
No. Observations:                 719   AIC:                             1612.
Df Residuals:                     705   BIC:                             1676.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     -6

  x = pd.concat(x[::order], 1)


In [640]:
X_train_ols.describe()

Unnamed: 0,const,fire prevention,climate action,public transportation,housing,green space,vehicle,building,utilities,white_rate_15,edu_2015,Median Gross Rent 2015,density,ownership_rate
count,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0
mean,1.0,0.636069,4.247769,9.260516,1.393141,1.17645,12.001585,1.588122,3.967327,0.55045,14.081633,7.38877,0.004924,0.464625
std,0.0,1.770376,2.876653,2.711475,3.964712,2.967801,1.479739,4.120545,4.268415,0.229191,1.816314,0.347377,0.005854,0.25317
min,1.0,0.0,-4.918769,-0.427043,0.0,0.0,5.007887,0.0,0.0,0.0,7.902527,5.708173,0.001003,0.0
25%,1.0,0.0,2.323757,7.377497,0.0,0.0,11.172963,0.0,0.0,0.371862,12.82442,7.195787,0.001939,0.260238
50%,1.0,0.0,4.669098,9.415667,0.0,0.0,12.186373,0.0,0.0,0.549011,14.216537,7.391513,0.003313,0.468498
75%,1.0,0.0,6.409201,11.099264,0.0,0.0,12.985256,0.0,8.228025,0.74281,15.479579,7.600172,0.006036,0.676736
max,1.0,9.025959,11.096165,17.44385,17.044617,14.859451,20.565433,16.303362,13.88184,1.0,17.725968,8.371011,0.105516,0.958333


In [641]:
y_ols.describe()

count    719.000000
mean      -0.083813
std        0.801507
min       -8.167628
25%       -0.430948
50%        0.020213
75%        0.374422
max        1.745021
Name: gen_index, dtype: float64

In [186]:
y_hat = log_reg.predict(X_test)
prediction = list(map(round, y_hat))
 
# comparing original and predicted values of y
print('Actual values', list(y_test.values))
print('Predictions :', prediction)

Actual values [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0

In [95]:
pre = pd.DataFrame(y_test.values,prediction).reset_index()

In [100]:
pre['test'] = pre['index'] - pre[0]

In [102]:
pre['test'].value_counts()

 0    453
-1    131
Name: test, dtype: int64

In [199]:
sf_gen_index = sf_demo_gen_pattern[sf_demo_gen_pattern.gentrified=='Gentrified']

In [200]:
sf_gen_index['edu_rate'] = (sf_gen_index['edu_2020']/sf_gen_index['edu_2015'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [201]:
sf_gen_index['income'] = (sf_gen_index['Median HH Income (in 2020 dollars)']/sf_gen_index['Median HH Income (in 2015 dollars)'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [202]:
sf_gen_index

Unnamed: 0,GEOID,area,popu_2015,edu_2015,Median HH Income (in 2015 dollars),Median Gross Rent 2015,white_15,popu_2020,edu_2020,Median HH Income (in 2020 dollars),Median Gross Rent 2020,white_20,gentrified,white_rate_20,white_rate_15,pattern,color,geometry,edu_rate,index
0,060750452002,187145.689267,1550,15.565567,66875.0,1414.80,802,1943.0,16.251908,130594.68,2510.0,1370.0,Gentrified,0.705095,0.517419,1.0,Pattern B,"POLYGON ((-13632869.903 4547990.267, -13632749...",1.044094,1.952818
3,060750479013,251690.139309,1392,14.104061,61333.0,1265.76,908,2185.0,13.583117,140441.04,2946.0,956.0,Gentrified,0.437529,0.652299,1.0,Pattern B,"POLYGON ((-13637188.876 4548004.773, -13637069...",0.963064,2.289812
6,060750476003,209279.793757,1032,14.755297,73848.0,2363.04,590,1447.0,15.070805,183891.60,4000.0,858.0,Gentrified,0.592951,0.571705,0.0,Pattern A,"POLYGON ((-13634192.267 4547914.073, -13634073...",1.021383,2.490136
19,060750477012,195058.411359,1471,13.194159,60795.0,1383.48,478,1851.0,15.084758,95715.00,1780.0,866.0,Gentrified,0.467855,0.324949,1.0,Pattern B,"POLYGON ((-13634580.772 4548438.428, -13634461...",1.143291,1.574389
30,060750476001,224204.644956,1661,14.262887,66689.0,1818.72,889,1340.0,15.595883,98550.00,2403.0,701.0,Gentrified,0.523134,0.535220,0.0,Pattern A,"POLYGON ((-13634328.299 4548451.667, -13634234...",1.093459,1.477755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,060750119012,49468.625106,1534,15.807799,55179.0,1730.16,913,364.0,16.492260,166418.28,2835.0,337.0,Gentrified,0.925824,0.595176,0.0,Pattern A,"POLYGON ((-13627142.626 4549853.318, -13627087...",1.043299,3.015971
2902,060750119022,75544.750673,763,14.375731,71750.0,1215.00,506,1378.0,15.959111,99000.36,1892.0,1020.0,Gentrified,0.740203,0.663172,0.0,Pattern A,"POLYGON ((-13626809.447 4550044.755, -13626741...",1.110143,1.379796
2907,060014059021,320632.620197,1614,9.570167,28203.0,1013.04,179,1559.0,11.791783,53925.48,1551.0,280.0,Gentrified,0.179602,0.110905,1.0,Pattern B,"POLYGON ((-13607903.613 4550322.267, -13607883...",1.232140,1.912048
2914,060750118001,90734.196966,1622,9.842311,20478.0,645.84,173,1590.0,12.220947,40137.12,888.0,197.0,Gentrified,0.123899,0.106658,1.0,Pattern B,"POLYGON ((-13626317.637 4550385.096, -13626264...",1.241675,1.960012


In [58]:
sf_invest

Unnamed: 0,GEOID,TotalPopu,area,year,research and planning,fire prevention,climate action,public transportation,housing,green space,others,vehicle,building,utilities,agriculture,clean air,Total Invest,Invest per land,Invest per land per popu,geometry
0,060750452002,1838,1.871457e+05,2015,0.0,0.000000,0.000000,9003.260968,0.0,0.0,0.0,24364.574128,0.0,0.0,0.0,0.0,33367.835096,178298.710627,0.097007,"POLYGON ((-13632869.903 4547990.267, -13632749..."
1,060750164001,2053,3.005107e+05,2015,0.0,0.000000,0.000000,491.241908,0.0,0.0,0.0,27214.619523,0.0,0.0,0.0,0.0,27705.861431,92195.917868,0.044908,"POLYGON ((-13629569.614 4547980.690, -13629386..."
2,060750477021,1124,1.884071e+05,2015,0.0,0.000000,0.000000,105.240863,0.0,0.0,0.0,1644.952632,0.0,0.0,0.0,0.0,1750.193494,9289.425139,0.008265,"POLYGON ((-13635264.942 4547852.104, -13635146..."
3,060750479013,2092,2.516901e+05,2015,0.0,0.000000,0.000000,235.339390,0.0,0.0,0.0,27731.604502,0.0,0.0,0.0,0.0,27966.943892,111116.565665,0.053115,"POLYGON ((-13637188.876 4548004.773, -13637069..."
4,060750451002,1354,1.759849e+05,2015,0.0,0.000000,0.000000,6239.788622,0.0,0.0,0.0,17948.657981,0.0,0.0,0.0,0.0,24188.446603,137446.186732,0.101511,"POLYGON ((-13632512.233 4548010.970, -13632392..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17515,060014058002,1268,1.931100e+05,2020,0.0,0.000000,19.148698,1166.024856,0.0,0.0,0.0,12210.840761,0.0,0.0,0.0,0.0,13396.014316,69369.866849,0.054708,"POLYGON ((-13607124.710 4550198.865, -13607109..."
17516,060014033001,1887,1.340623e+06,2020,0.0,0.000000,4470.664870,146935.292499,0.0,0.0,0.0,15488.288779,0.0,0.0,0.0,0.0,166894.246147,124490.069493,0.065972,"POLYGON ((-13611454.927 4550088.565, -13611443..."
17517,060750113002,1234,8.266409e+04,2020,0.0,100.577850,450.886421,6800.941166,0.0,0.0,0.0,9990.600568,0.0,0.0,0.0,0.0,17343.006005,209800.975747,0.170017,"POLYGON ((-13626545.286 4550476.240, -13626525..."
17518,060750112001,1192,7.836443e+04,2020,0.0,85.756580,385.032271,7145.750593,0.0,0.0,0.0,9650.563919,0.0,0.0,0.0,0.0,17267.103364,220343.644869,0.184852,"POLYGON ((-13627136.281 4550381.151, -13627117..."


In [531]:
import statsmodels.api as sm

X_train_ols = sm.add_constant(X_train)
results = sm.Logit(y_train,X_train_ols).fit() 
print(results.summary())

Optimization terminated successfully.
         Current function value: 0.406133
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             gentrified   No. Observations:                  762
Model:                          Logit   Df Residuals:                      748
Method:                           MLE   Df Model:                           13
Date:                Sun, 22 May 2022   Pseudo R-squ.:                  0.1719
Time:                        16:27:09   Log-Likelihood:                -309.47
converged:                       True   LL-Null:                       -373.70
Covariance Type:            nonrobust   LLR p-value:                 4.248e-21
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     15.0275      2.713      5.539      0.000       9.710      

  x = pd.concat(x[::order], 1)


In [532]:
results.summary()

0,1,2,3
Dep. Variable:,gentrified,No. Observations:,762.0
Model:,Logit,Df Residuals:,748.0
Method:,MLE,Df Model:,13.0
Date:,"Sun, 22 May 2022",Pseudo R-squ.:,0.1719
Time:,16:27:13,Log-Likelihood:,-309.47
converged:,True,LL-Null:,-373.7
Covariance Type:,nonrobust,LLR p-value:,4.248e-21

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,15.0275,2.713,5.539,0.000,9.710,20.345
fire prevention,-0.2120,0.085,-2.508,0.012,-0.378,-0.046
climate action,-0.0078,0.056,-0.138,0.891,-0.118,0.103
public transportation,0.1414,0.058,2.427,0.015,0.027,0.256
housing,-0.0002,0.029,-0.008,0.994,-0.056,0.056
green space,-0.0377,0.036,-1.032,0.302,-0.109,0.034
vehicle,-0.1161,0.094,-1.236,0.216,-0.300,0.068
building,-0.0449,0.025,-1.812,0.070,-0.093,0.004
utilities,-0.0058,0.028,-0.208,0.835,-0.061,0.049
