In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
# Read the building-level CSV straight from GitHub (network access required).
# index_col=0 turns the first CSV column into the row index.
url = 'https://raw.githubusercontent.com/wshenyc/MLForCities/main/Data/building_data_final.csv'
df = pd.read_csv(url, index_col=0)
# Preview the first five rows as plain text.
print(df.head())

            county  res_units  other_units  buildings  basement_code  \
bbl                                                                    
3047940001      47          4            3          2              5   
3047940017      47          3            0          1              2   
3047940078      47          3            0          1              2   
2032790058       5          6            0          1              5   
1011980126      61          7            0          1              1   

            lot_area  comp_bbl_2017  comp_bbl_2018  comp_bbl_2019  \
bbl                                                                 
3047940001      4000            0.0            0.0            0.0   
3047940017      2000            0.0            0.0            0.0   
3047940078      1400            0.0            0.0            0.0   
2032790058      2703            0.0            0.0            1.0   
1011980126      1916            0.0            1.0            0.0   

           

In [3]:
# Report the (rows, columns) dimensions of the loaded frame.
print(df.shape)

(158185, 114)


In [4]:
# Target vector: the 'outcome' column, sharing df's row index.
y = df['outcome']
# Show the first few labels followed by the vector's shape.
print(y.head(), y.shape, sep='\n')

bbl
3047940001    0
3047940017    0
3047940078    0
2032790058    0
1011980126    0
Name: outcome, dtype: int64
(158185,)


In [6]:
## Using Naive Bayes to estimate the probability that outcome = 1 for each building

In [5]:
# Feature matrix: every column from "county" through "building_class_Z9"
# (label slice, both ends inclusive), one-hot encoded where pandas deems it
# necessary, with missing values replaced by 0.
# NOTE(review): confirm that 'outcome' does not sit inside this column range,
# otherwise the target leaks into the features.
X = pd.get_dummies(df.loc[:, "county":"building_class_Z9"]).fillna(0)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=999,
)
print(X_train.head())

            county  res_units  other_units  buildings  basement_code  \
bbl                                                                    
1005257502      61         11            3          2              5   
3042520015      47          3            0          1              2   
1010610012      61          5            0          1              5   
2023760031       5          5            0          1              5   
4100120020      81          3            1          1              5   

            lot_area  comp_bbl_2017  comp_bbl_2018  comp_bbl_2019  \
bbl                                                                 
1005257502      3750            0.0            0.0            0.0   
3042520015      1780            0.0            0.0            0.0   
1010610012      2008            0.0            0.0            0.0   
2023760031      2500            0.0            0.0            0.0   
4100120020      2695            0.0            0.0            0.0   

           

In [6]:
# Fit a binary Naive Bayes classifier on discrete-valued input attributes.
# The output variable is assumed to be coded as 0 / 1.
def trainNaiveBayesDiscrete(X,y):
    """Estimate the class prior and per-class value frequencies.

    Parameters
    ----------
    X : pandas.DataFrame
        Discrete feature columns; column labels are concatenated with
        '_1' / '_0' to form the keys of the result.
    y : pandas.Series
        0/1 labels used to boolean-index ``X``.

    Returns
    -------
    dict
        ``'prior'`` holds ``y.sum() / y.count()`` (share of 1s among the
        non-null labels).  For every column ``c`` the entries ``c + '_1'``
        and ``c + '_0'`` hold the normalised value counts of that column
        among the rows with y == 1 and y == 0 respectively.
    """
    model = {'prior': 1.*y.sum()/y.count()}
    rows_by_suffix = (('_1', X[y==1]), ('_0', X[y==0]))
    for column in X.columns:
        # Same insertion order as the labels above: class 1 first, then class 0.
        for suffix, subset in rows_by_suffix:
            model[column+suffix] = subset[column].value_counts(normalize=True)
    return model

def testNaiveBayesDiscrete(X,nbc):
    y_pred = pd.Series(index=X.index,dtype='float64')
    for i in X.index:
        # compute odds of y=1
        y_pred[i] = nbc['prior']/(1-nbc['prior']) # prior odds
        for j in X.columns:
            thevalue = X.loc[i,j]
            if thevalue not in nbc[j+'_1']:
                y_pred[i] = y_pred[i]*1E-3
            if thevalue not in nbc[j+'_0']:
                y_pred[i] = y_pred[i]*1E3
            if (thevalue in nbc[j+'_1']) & (thevalue in nbc[j+'_0']):
                y_pred[i] = y_pred[i]*(nbc[j+'_1'][thevalue]+1E-3)/(nbc[j+'_0'][thevalue]+1E-3)
        # convert odds to probability of y=1
        y_pred[i] = y_pred[i]/(1.0+y_pred[i])
    return y_pred

In [7]:
# Fit the classifier on the training split only.
naive_bayes_classifier = trainNaiveBayesDiscrete(X_train,y_train)

# Dump every learned entry: its key, then its value, then a blank line.
for entry_name, entry_value in naive_bayes_classifier.items():
    print(entry_name, entry_value, sep='\n', end='\n\n')

# Scoring with testNaiveBayesDiscrete is left commented out in this cell:
#y_pred_train = testNaiveBayesDiscrete(X_train,naive_bayes_classifier)
#y_pred_test = testNaiveBayesDiscrete(X_test,naive_bayes_classifier)

prior
0.12594713218759312

county_1
47    0.378747
5     0.265811
61    0.229815
81    0.117740
85    0.007888
Name: county, dtype: float64

county_0
47    0.472893
81    0.268363
61    0.126944
5     0.119143
85    0.012657
Name: county, dtype: float64

res_units_1
3      0.101104
6      0.092069
4      0.062455
8      0.059013
16     0.038004
         ...   
391    0.000072
159    0.000072
182    0.000072
221    0.000072
134    0.000072
Name: res_units, Length: 352, dtype: float64

res_units_0
3       0.536902
4       0.142194
6       0.083889
8       0.039459
5       0.034469
          ...   
411     0.000010
1166    0.000010
556     0.000010
324     0.000010
219     0.000010
Name: res_units, Length: 443, dtype: float64

other_units_1
0      0.727879
1      0.105765
2      0.067403
3      0.029399
4      0.019576
5      0.015488
6      0.011688
7      0.007529
8      0.004159
9      0.002079
10     0.001936
12     0.000932
11     0.000789
13     0.000574
14     0.000430
15     0.000

In [8]:
# Score the held-out test rows: one predicted probability of y=1 per row of X_test.
y_pred_test = testNaiveBayesDiscrete(X_test,naive_bayes_classifier)

In [9]:
# Accuracy of the binary prediction task: a row counts as predicted 1 when
# its probability exceeds 0.5.
# The in-sample variant stays disabled (y_pred_train is not computed above):
#print('In sample prediction accuracy:',1.0*sum((y_pred_train>0.5)==y_train)/len(y_train))
hits = (y_pred_test>0.5)==y_test
print('Out of sample prediction accuracy:',1.0*sum(hits)/len(y_test))

Out of sample prediction accuracy: 0.9195254551584626


In [10]:
# Score every row of X (training and test rows alike) with the classifier
# that was fit on the training split.
y_pred_all = testNaiveBayesDiscrete(X,naive_bayes_classifier)

In [18]:
# Display the predicted probabilities for all rows.
y_pred_all

bbl
3047940001    8.621235e-07
3047940017    2.594182e-10
3047940078    1.051866e-09
2032790058    1.743969e-05
1011980126    5.868199e-06
                  ...     
1011980055    8.845688e-10
2039340058    2.153485e-09
3087140010    9.977958e-01
2027860012    9.969637e-01
4007950036    2.194019e-09
Length: 158185, dtype: float64

In [24]:
## Build a one-column frame ("prob_vio_nb") of the predicted probabilities,
## one row per label in df.index (building level).
## NOTE(review): the earlier comment also promised a tract-level table, but no
## tract-level aggregation is performed in this cell.
pred_prob_all_bld= pd.DataFrame(y_pred_all,
                                        columns = ["prob_vio_nb"],index=df.index)
pred_prob_all_bld

Unnamed: 0_level_0,prob_vio_nb
bbl,Unnamed: 1_level_1
3047940001,8.621235e-07
3047940017,2.594182e-10
3047940078,1.051866e-09
2032790058,1.743969e-05
1011980126,5.868199e-06
...,...
1011980055,8.845688e-10
2039340058,2.153485e-09
3087140010,9.977958e-01
2027860012,9.969637e-01


In [15]:
# The probabilities produced by the Naive Bayes model are too low; we might use another model.

In [25]:
# Write the building-level probabilities to pred_nb.csv (relative path, so the
# file lands in the current working directory).
pred_prob_all_bld.to_csv("pred_nb.csv")