In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
# Housing data set hosted on GitHub; the first CSV column becomes the row index.
url = 'https://raw.githubusercontent.com/wshenyc/MLForCities/main/Data/housing_data.csv'
df = pd.read_csv(filepath_or_buffer=url, index_col=0)
# Preview the first five rows.
print(df.head(5))

               cd  county     tract10  res_units  other_units  year_built  \
bbl                                                                         
3047940001  309.0      47  47080600.0          4            3      1899.0   
3047940017  309.0      47  47080600.0          3            0      1991.0   
3047940078  309.0      47  47080600.0          3            0      2005.0   
2032790058  207.0       5   5040702.0          6            0      1915.0   
1011980126  107.0      61  61016900.0          7            0      1886.0   

            year_reno  buildings  floors building_class  ...  \
bbl                                                      ...   
3047940001     1899.0          2     3.0             S9  ...   
3047940017     1991.0          1     2.0             C0  ...   
3047940078     2005.0          1     3.0             C0  ...   
2032790058     2007.0          1     2.0             C2  ...   
1011980126     1984.0          1     4.0             C5  ...   

           

In [3]:
# Report (rows, columns) of the loaded frame as a quick sanity check.
print(df.shape)

(158185, 30)


In [4]:
# Cast the target column to an integer dtype (overwrites the column in df).
# NOTE(review): astype('int') raises if 'outcome' contains missing values —
# confirm the column is complete in the source CSV.
df['outcome']=df['outcome'].astype('int')

In [5]:
# Target vector: the 'outcome' column, sharing df's row index.
y = df['outcome']
# Show the first rows followed by the shape of the target.
print(y.head(5), y.shape, sep='\n')

bbl
3047940001    0
3047940017    0
3047940078    0
2032790058    0
1011980126    0
Name: outcome, dtype: int64
(158185,)


In [6]:
## Using Naive Bayes to estimate the probability of the outcome

In [7]:
# Feature matrix: every column from "cd" through "viol_bbl_ser_2020" (label
# slice, inclusive), one-hot encoded, with missing values replaced by 0.
# NOTE(review): confirm that 'outcome' does not sit inside this column range,
# otherwise the target leaks into the features.
X = pd.get_dummies(df.loc[:, "cd":"viol_bbl_ser_2020"]).fillna(0)

# Split data into 70% train, 30% test (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=999,
)
print(X_train.head(5))

               cd  county     tract10  res_units  other_units  year_built  \
bbl                                                                         
1005257502  102.0      61  61006500.0         11            3      1910.0   
3042520015  305.0      47  47120000.0          3            0      1910.0   
1010610012  104.0      61  61013300.0          5            0      1920.0   
2023760031  201.0       5   5006700.0          5            0      1927.0   
4100120020  412.0      81  81020600.0          3            1      1925.0   

            year_reno  buildings  floors  basement_code  ...  \
bbl                                                      ...   
1005257502     1910.0          2     4.0              5  ...   
3042520015     1910.0          1     2.0              2  ...   
1010610012     1982.0          1     3.0              5  ...   
2023760031     1927.0          1     2.0              5  ...   
4100120020     1925.0          1     2.5              5  ...   

           

In [8]:
# Training a binary Naive Bayes Classifier with discrete input attributes.
# Assume that the binary output variable takes on values 0 or 1. 
def trainNaiveBayesDiscrete(X,y):
    """Fit a binary Naive Bayes model over discrete-valued columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input attributes; every column is treated as discrete.
    y : pandas.Series
        Binary target (0 or 1), used as a boolean mask over the rows of X.

    Returns
    -------
    dict
        'prior' holds the fraction of rows with y == 1. For each column
        name ``c``, ``c + '_1'`` and ``c + '_0'`` hold the normalized value
        frequencies of that column among the y == 1 and y == 0 rows.
    """
    model = {'prior': 1.*y.sum()/y.count()}
    # Class-conditional subsets, listed in the order their tables are stored.
    by_class = (('_1', X[y==1]), ('_0', X[y==0]))
    for col in X.columns:
        for suffix, subset in by_class:
            model[col+suffix] = subset[col].value_counts(normalize=True)
    return model

def testNaiveBayesDiscrete(X,nbc):
    y_pred = pd.Series(index=X.index,dtype='float64')
    for i in X.index:
        # compute odds of y=1
        y_pred[i] = nbc['prior']/(1-nbc['prior']) # prior odds
        for j in X.columns:
            thevalue = X.loc[i,j]
            if thevalue not in nbc[j+'_1']:
                y_pred[i] = y_pred[i]*1E-3
            if thevalue not in nbc[j+'_0']:
                y_pred[i] = y_pred[i]*1E3
            if (thevalue in nbc[j+'_1']) & (thevalue in nbc[j+'_0']):
                y_pred[i] = y_pred[i]*(nbc[j+'_1'][thevalue]+1E-3)/(nbc[j+'_0'][thevalue]+1E-3)
        # convert odds to probability of y=1
        y_pred[i] = y_pred[i]/(1.0+y_pred[i])
    return y_pred

In [9]:
# Fit the classifier on the training split.
naive_bayes_classifier = trainNaiveBayesDiscrete(X_train, y_train)

# Dump every model entry: its key, then its value, then a blank line.
for key, table in naive_bayes_classifier.items():
    print(key, table, '', sep='\n')

# Scoring of the training and test splits is left disabled in this cell:
#y_pred_train = testNaiveBayesDiscrete(X_train,naive_bayes_classifier)
#y_pred_test = testNaiveBayesDiscrete(X_test,naive_bayes_classifier)

prior
0.12594713218759312

cd_1
112.0    0.048114
303.0    0.038291
207.0    0.037860
304.0    0.037430
204.0    0.035637
205.0    0.034634
110.0    0.033128
317.0    0.032913
314.0    0.031909
308.0    0.031048
305.0    0.029973
301.0    0.027750
206.0    0.027678
212.0    0.025957
109.0    0.024595
103.0    0.023232
309.0    0.023089
107.0    0.022229
209.0    0.021512
111.0    0.021440
316.0    0.021081
203.0    0.020579
201.0    0.018643
401.0    0.018141
108.0    0.017711
405.0    0.017209
307.0    0.016492
312.0    0.016277
311.0    0.015990
202.0    0.014556
104.0    0.013552
211.0    0.013337
306.0    0.012620
302.0    0.011903
404.0    0.011831
310.0    0.011258
208.0    0.010899
102.0    0.010684
403.0    0.010684
412.0    0.009824
402.0    0.009752
106.0    0.009608
414.0    0.009465
407.0    0.008390
315.0    0.008318
313.0    0.007959
409.0    0.007386
501.0    0.006669
210.0    0.005736
408.0    0.005521
406.0    0.004589
318.0    0.004446
105.0    0.003155
413.0    0.002

In [10]:
# Predicted probability of outcome == 1 for every row of the held-out test split.
y_pred_test = testNaiveBayesDiscrete(X_test,naive_bayes_classifier)

In [11]:
# Accuracy of the binary prediction obtained by thresholding the predicted
# probability at 0.5.
# NOTE(review): the training prior is about 0.126, so always predicting 0 would
# presumably score roughly 0.87 — compare against that baseline.
#print('In sample prediction accuracy:',1.0*sum((y_pred_train>0.5)==y_train)/len(y_train))
test_hits = (y_pred_test > 0.5) == y_test
print('Out of sample prediction accuracy:', test_hits.mean())

Out of sample prediction accuracy: 0.9174182400539447


In [12]:
# Score every row of the full feature matrix X. X is the frame the train/test
# split was drawn from, so these predictions include the training rows.
y_pred_all = testNaiveBayesDiscrete(X,naive_bayes_classifier)

In [14]:
## Building-level table of predicted probabilities: the "cd" column of df next
## to the Naive Bayes probability for the same row (aligned on the index).
# NOTE(review): the original intent was a CSV at building and tract level; this
# cell only assembles the building-level frame and does not write a file.
# The prediction Series is named explicitly with rename():
# pd.DataFrame(series, columns=[...]) selects by the Series' name and would
# yield an all-NaN column if the Series ever carried a different name.
pred_prob_allbld = pd.concat(
    [df.loc[:, "cd"], y_pred_all.rename("prob_vacate_bl")],
    axis=1,
)
pred_prob_allbld

Unnamed: 0_level_0,cd,prob_vacate_bl
bbl,Unnamed: 1_level_1,Unnamed: 2_level_1
3047940001,309.0,6.595460e-08
3047940017,309.0,5.591637e-11
3047940078,309.0,3.117613e-10
2032790058,207.0,5.647479e-05
1011980126,107.0,2.242228e-08
...,...,...
1011980055,107.0,2.285197e-12
2039340058,209.0,9.560692e-13
3087140010,313.0,9.995171e-01
2027860012,204.0,9.999933e-01


In [15]:
# The probabilities predicted by the Naive Bayes model are too low; we might try another model.