In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv('/projects/18_property_vtmfid/shared/structured_data.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,Address_lookup,GT_Bldg_Class,GT_Style,GT_story,GT_wall,GT_cond,GT_frontage,GT_grade,GT_proximity,GT_basement
0,0,"193 MANHATTAN STREET, Staten Island, New York,...",A2,9.0,1.0,4.0,4,24,C,Detached,3
1,1,"189 MANHATTAN STREET, Staten Island, New York,...",A2,9.0,1.0,4.0,4,24,C,Detached,3
2,2,"181 MANHATTAN STREET, Staten Island, New York,...",A3,4.0,2.0,4.0,2,24,B,Detached,3
3,3,"179 MANHATTAN STREET, Staten Island, New York,...",B2,4.0,2.0,3.0,3,21,B,Detached,3
4,4,"175 MANHATTAN STREET, Staten Island, New York,...",B2,4.0,2.0,3.0,3,30,B,Detached,3


In [None]:
df.GT_proximity.value_counts()

Prepare the dataframe for Random Forest Classifier

In [3]:
df['GT_basement'].loc[ df['GT_basement'] == 'N'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [4]:
# dropping the one entry with '.' as its value for frontage
df.dropna(inplace=True)
print(df.shape)
df.drop(labels=2754, axis=0, inplace=True)
print(df.shape)

(2774, 11)
(2773, 11)


In [5]:
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler

class DummyEncoder(TransformerMixin):
    '''
    Custom scikit-learn transformer for data that converts categorical features with 
    k classes to (k-1) dummy features.
    '''
    def fit(self, X=None, Y=None):
        # Do nothing
        return self
    
    def transform(self, X):
        categories = ['GT_Bldg_Class', 'GT_Style', 'GT_wall','GT_cond', 
                      'GT_grade', 'GT_basement']
        categories = [c for c in categories if c in X.columns.values]
        X = pd.get_dummies(X, drop_first=True, columns=categories)
        return X
    
class CustomScaler(TransformerMixin):
    '''
    Custom scikit-learn transformer for data that standardizes numerical features 
    to be zero mean and unit variance.
    '''
    def fit(self, X=None, Y=None):
        # Do nothing
        # Fit is implemented in transform method instead
        return self
    
    def transform(self, X):                
        numerical = ['GT_story','GT_frontage']
        numerical = [c for c in numerical if c in X.columns.values]
        scaler = StandardScaler().fit(X.loc[:,numerical])
        X.loc[:,numerical] = scaler.transform(X.loc[:,numerical])
        return X

In [None]:
enc = OneHotEncoder()
df['GT_Bldg_Class'] = enc.fit_transform(df['GT_Bldg_Class'])

In [6]:
scaler = CustomScaler()
df = scaler.fit(df).transform(df)
# Encode categorical features
encoder = DummyEncoder()
df = encoder.fit(df).transform(df)
display(df)

Unnamed: 0.1,Unnamed: 0,Address_lookup,GT_story,GT_frontage,GT_proximity,GT_Bldg_Class_A1,GT_Bldg_Class_A2,GT_Bldg_Class_A3,GT_Bldg_Class_A5,GT_Bldg_Class_A6,...,GT_grade_B+,GT_grade_B-,GT_grade_C,GT_grade_C+,GT_grade_C-,GT_grade_D,GT_grade_E,GT_grade_X,GT_basement_2,GT_basement_3
0,0,"193 MANHATTAN STREET, Staten Island, New York,...",-2.220044,-0.325663,Detached,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,1,"189 MANHATTAN STREET, Staten Island, New York,...",-2.220044,-0.325663,Detached,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,2,"181 MANHATTAN STREET, Staten Island, New York,...",-0.467589,-0.325663,Detached,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,"179 MANHATTAN STREET, Staten Island, New York,...",-0.467589,-0.556764,Detached,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4,"175 MANHATTAN STREET, Staten Island, New York,...",-0.467589,0.136538,Detached,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,5,"171 MANHATTAN STREET, Staten Island, New York,...",-0.467589,0.136538,Detached,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,6,"167 MANHATTAN STREET, Staten Island, New York,...",-0.467589,-0.633798,Detached,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
7,7,"165 MANHATTAN STREET, Staten Island, New York,...",-0.467589,-0.633798,Detached,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,8,"155 MANHATTAN STREET, Staten Island, New York,...",-0.467589,0.521706,Detached,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9,"149 MANHATTAN STREET, Staten Island, New York,...",-0.467589,-0.787865,Detached,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [7]:
df.columns

Index([u'Unnamed: 0', u'Address_lookup', u'GT_story', u'GT_frontage',
       u'GT_proximity', u'GT_Bldg_Class_A1', u'GT_Bldg_Class_A2',
       u'GT_Bldg_Class_A3', u'GT_Bldg_Class_A5', u'GT_Bldg_Class_A6',
       u'GT_Bldg_Class_A9', u'GT_Bldg_Class_B1', u'GT_Bldg_Class_B2',
       u'GT_Bldg_Class_B3', u'GT_Bldg_Class_B9', u'GT_Bldg_Class_C0',
       u'GT_Bldg_Class_S1(OFFICE)', u'GT_Style_2.0', u'GT_Style_3.0',
       u'GT_Style_4.0', u'GT_Style_6.0', u'GT_Style_7.0', u'GT_Style_8.0',
       u'GT_Style_9.0', u'GT_Style_10.0', u'GT_Style_11.0', u'GT_Style_13.0',
       u'GT_Style_14.0', u'GT_wall_2.0', u'GT_wall_3.0', u'GT_wall_4.0',
       u'GT_wall_5.0', u'GT_wall_6.0', u'GT_wall_7.0', u'GT_wall_8.0',
       u'GT_wall_9.0', u'GT_cond_1', u'GT_cond_2', u'GT_cond_3', u'GT_cond_4',
       u'GT_cond_5', u'GT_cond_6', u'GT_cond_N', u'GT_grade_A', u'GT_grade_A+',
       u'GT_grade_A-', u'GT_grade_B', u'GT_grade_B+', u'GT_grade_B-',
       u'GT_grade_C', u'GT_grade_C+', u'GT_grade_C-', u'GT

In [8]:
list(df.columns.values)

['Unnamed: 0',
 'Address_lookup',
 'GT_story',
 'GT_frontage',
 'GT_proximity',
 'GT_Bldg_Class_A1',
 'GT_Bldg_Class_A2',
 'GT_Bldg_Class_A3',
 'GT_Bldg_Class_A5',
 'GT_Bldg_Class_A6',
 'GT_Bldg_Class_A9',
 'GT_Bldg_Class_B1',
 'GT_Bldg_Class_B2',
 'GT_Bldg_Class_B3',
 'GT_Bldg_Class_B9',
 'GT_Bldg_Class_C0',
 'GT_Bldg_Class_S1(OFFICE)',
 'GT_Style_2.0',
 'GT_Style_3.0',
 'GT_Style_4.0',
 'GT_Style_6.0',
 'GT_Style_7.0',
 'GT_Style_8.0',
 'GT_Style_9.0',
 'GT_Style_10.0',
 'GT_Style_11.0',
 'GT_Style_13.0',
 'GT_Style_14.0',
 'GT_wall_2.0',
 'GT_wall_3.0',
 'GT_wall_4.0',
 'GT_wall_5.0',
 'GT_wall_6.0',
 'GT_wall_7.0',
 'GT_wall_8.0',
 'GT_wall_9.0',
 'GT_cond_1',
 'GT_cond_2',
 'GT_cond_3',
 'GT_cond_4',
 'GT_cond_5',
 'GT_cond_6',
 'GT_cond_N',
 'GT_grade_A',
 'GT_grade_A+',
 'GT_grade_A-',
 'GT_grade_B',
 'GT_grade_B+',
 'GT_grade_B-',
 'GT_grade_C',
 'GT_grade_C+',
 'GT_grade_C-',
 'GT_grade_D',
 'GT_grade_E',
 'GT_grade_X',
 'GT_basement_2',
 'GT_basement_3']

In [10]:
df = df[['Unnamed: 0',
 'Address_lookup',
 'GT_story',
 'GT_frontage',
 'GT_Bldg_Class_A1',
 'GT_Bldg_Class_A2',
 'GT_Bldg_Class_A3',
 'GT_Bldg_Class_A5',
 'GT_Bldg_Class_A6',
 'GT_Bldg_Class_A9',
 'GT_Bldg_Class_B1',
 'GT_Bldg_Class_B2',
 'GT_Bldg_Class_B3',
 'GT_Bldg_Class_B9',
 'GT_Bldg_Class_C0',
 'GT_Bldg_Class_S1(OFFICE)',
 'GT_Style_2.0',
 'GT_Style_3.0',
 'GT_Style_4.0',
 'GT_Style_6.0',
 'GT_Style_7.0',
 'GT_Style_8.0',
 'GT_Style_9.0',
 'GT_Style_10.0',
 'GT_Style_11.0',
 'GT_Style_13.0',
 'GT_Style_14.0',
 'GT_wall_2.0',
 'GT_wall_3.0',
 'GT_wall_4.0',
 'GT_wall_5.0',
 'GT_wall_6.0',
 'GT_wall_7.0',
 'GT_wall_8.0',
 'GT_wall_9.0',
 'GT_cond_1',
 'GT_cond_2',
 'GT_cond_3',
 'GT_cond_4',
 'GT_cond_5',
 'GT_cond_6',
 'GT_cond_N',
 'GT_grade_A',
 'GT_grade_A+',
 'GT_grade_A-',
 'GT_grade_B',
 'GT_grade_B+',
 'GT_grade_B-',
 'GT_grade_C',
 'GT_grade_C+',
 'GT_grade_C-',
 'GT_grade_D',
 'GT_grade_E',
 'GT_grade_X',
 'GT_basement_2',
 'GT_basement_3',
          'GT_proximity']]

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,Address_lookup,GT_story,GT_frontage,GT_Bldg_Class_A1,GT_Bldg_Class_A2,GT_Bldg_Class_A3,GT_Bldg_Class_A5,GT_Bldg_Class_A6,GT_Bldg_Class_A9,...,GT_grade_B-,GT_grade_C,GT_grade_C+,GT_grade_C-,GT_grade_D,GT_grade_E,GT_grade_X,GT_basement_2,GT_basement_3,GT_proximity
0,0,"193 MANHATTAN STREET, Staten Island, New York,...",-2.220044,-0.325663,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,Detached
1,1,"189 MANHATTAN STREET, Staten Island, New York,...",-2.220044,-0.325663,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,Detached
2,2,"181 MANHATTAN STREET, Staten Island, New York,...",-0.467589,-0.325663,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,Detached
3,3,"179 MANHATTAN STREET, Staten Island, New York,...",-0.467589,-0.556764,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Detached
4,4,"175 MANHATTAN STREET, Staten Island, New York,...",-0.467589,0.136538,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Detached


In [None]:
df.dropna(axis=0, how='any').shape

In [None]:
df_clean = df.dropna(axis=0, how='any')

In [None]:
# use all the other codes to predict the ground truth proximity
# 2, 8 are excluded for testing 
X = df_clean.iloc[:, [2, 3, 4, 5, 6, 7, 8, 10]]

y = df_clean.iloc[:, 9]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
X.isnull().any()

In [None]:
X.dropna(axis=0, how='any').shape

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,2:-1], df.iloc[:, -1], test_size = 0.3, random_state = 999)

In [None]:
OS=[]
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,2:-2], df.iloc[:,-2:], test_size = 0.3, random_state = i)    
    dt = DecisionTreeClassifier()
    dt.fit(X_train,y_train)
    OS.append(dt.score(X_test,y_test))
print (np.mean(OS))

In [13]:
param_grid ={'max_depth':range(1,11)}
rf = RandomForestClassifier(n_estimators=100)
gr = GridSearchCV(rf, param_grid=param_grid)
rs = gr.fit(X_train, y_train)
print ('Best parameter value:',rs.best_params_)
y_predict = rs.predict(X_test)
print('Accuracy = ',(pd.Series((y_predict == y_test)).value_counts(normalize=True)[True]))

('Best parameter value:', {'max_depth': 8})
('Accuracy = ', 0.92788461538461542)


In [None]:
print(y_predict)