# Decision Tree & Random Forest

In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Load the building-level dataset (a CSV hosted in the MLForCities GitHub repository)
# into a DataFrame. The URL points at the `main` branch, so the file contents can
# change between runs.

url = 'https://raw.githubusercontent.com/wshenyc/MLForCities/main/Data/building_data_final.csv'
df = pd.read_csv(url)

In [5]:
# (rows, columns) of the loaded data: 158,185 rows, each corresponding to a unique
# building identified by its Borough, Block, Lot (BBL), and 116 columns.
df.shape

(158185, 116)

In [4]:
# Mean of the `outcome` column: roughly 12.6% of the 158,185 buildings incurred a
# Class C HPD violation in 2021.

df['outcome'].mean()

0.125858962607074

### 1a) Decision Tree

In [12]:
# Feature matrix: every column from "bbl" through "building_class_Z9"
# (.loc label slices include both endpoints). Target: the "outcome" column.
# NOTE(review): the slice starts at "bbl", so the building identifier itself is fed
# to the model as a feature — confirm that this is intended.
X = df.loc[:, "bbl":"building_class_Z9"]
y = df["outcome"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=999)

# Depth-capped decision tree (max_depth=10), seeded so repeated fits give the same tree.
dt = DecisionTreeClassifier(max_depth=10, random_state=99)
dt.fit(X_train, y_train)
y_predict = dt.predict(X_test)

# Test-set accuracy = fraction of held-out rows predicted correctly. Taking the mean of
# the boolean match vector also works when no prediction is correct (it yields 0.0),
# whereas looking up `[True]` in value_counts() would raise a KeyError in that case.
# Read this number against the class balance: the mean of `outcome` over all rows is
# about 0.126, so (assuming a 0/1 label) predicting the majority class for every
# building would already be right roughly 87% of the time.
accuracy = (y_predict == y_test).mean()
accuracy

0.9549266689143627

In [13]:
# Score every building with the fitted decision tree and keep column 1 of
# predict_proba (the second class in dt.classes_; outcome == 1 assuming a 0/1 label)
# next to the building's BBL.
# .assign() attaches the scores by position, so the result does not depend on df's
# index labels (pd.concat(axis=1) aligns on index labels instead).
# Note: X holds all rows of df, including the rows the tree was trained on, so the
# scores for those rows are in-sample.
pred_prob_all_bld = df.loc[:, ["bbl"]].assign(
    prob_vio_dt=dt.predict_proba(X)[:, 1]
)
pred_prob_all_bld

Unnamed: 0,bbl,prob_vio_dt
0,3047940001,0.060268
1,3047940017,0.001839
2,3047940078,0.001839
3,2032790058,0.157187
4,1011980126,0.011708
...,...,...
158180,1011980055,0.001839
158181,2039340058,0.001839
158182,3087140010,0.947761
158183,2027860012,0.800000


### 1b) Random Forest

In [14]:
# Random forest: 30 trees, each capped at depth 6, fit on the same training split as
# the decision tree. n_jobs=-1 uses all available cores; random_state fixes the seed.
rf = RandomForestClassifier(n_estimators=30, n_jobs=-1, max_depth=6, random_state=99)
rf.fit(X_train, y_train)

# Column 1 of predict_proba for each held-out building (the second class in
# rf.classes_; outcome == 1 assuming a 0/1 label).
pred = rf.predict_proba(X_test)[:, 1]

# This is ROC AUC on the test split, not accuracy, so it is not directly comparable
# to the accuracy figure reported for the decision tree.
print(roc_auc_score(y_test, pred))

0.9723532385313027


In [15]:
# Add the random-forest score for every building as a new column, `prob_vio_rf`
# (column 1 of predict_proba; outcome == 1 assuming a 0/1 label).
# .assign() overwrites `prob_vio_rf` if it already exists, so re-running this cell
# does not append a duplicate column the way a self-referential pd.concat would.
# Note: X holds all rows of df, including the rows the forest was trained on.
pred_prob_all_bld = pred_prob_all_bld.assign(
    prob_vio_rf=rf.predict_proba(X)[:, 1]
)

pred_prob_all_bld

Unnamed: 0,bbl,prob_vio_dt,prob_vio_rf
0,3047940001,0.060268,0.171337
1,3047940017,0.001839,0.009545
2,3047940078,0.001839,0.010462
3,2032790058,0.157187,0.181260
4,1011980126,0.011708,0.057772
...,...,...,...
158180,1011980055,0.001839,0.010103
158181,2039340058,0.001839,0.012765
158182,3087140010,0.947761,0.741642
158183,2027860012,0.800000,0.413464


In [16]:
# Write the per-building decision-tree and random-forest probabilities to a CSV file
# in the current working directory; index=False leaves the row index out of the file.

pred_prob_all_bld.to_csv("pred_prob_dt_rf.csv",index=False)