In [7]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Reading in Data

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
logerror['weeknumber'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').isocalendar()[1])
logerror['month'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#proportion of living area
properties['N-LivingAreaProp'] = properties['calculatedfinishedsquarefeet']/properties['lotsizesquarefeet']

#Ratio of the built structure value to land area
properties['N-ValueProp'] = properties['structuretaxvaluedollarcnt']/properties['landtaxvaluedollarcnt']

#Ratio of tax of property over parcel
properties['N-ValueRatio'] = properties['taxvaluedollarcnt']/properties['taxamount']

# Pool
properties['Pool'] = (properties['pooltypeid2'].fillna(0) + properties['pooltypeid7'].fillna(0)).astype(int)

In [4]:
# join on parcel id
data = pd.merge(properties,logerror[['parcelid','logerror','month']], on='parcelid')
del logerror

### New response variable 

In [5]:
data['overestimation'] = (data['logerror'] >= 0).astype(int)

### Feature Pipeline

In [6]:
# Setup variables considered in the model


# numerical variables
num_atts = ['bathroomcnt','bedroomcnt','buildingqualitytypeid','calculatedbathnbr','finishedfloor1squarefeet',
           'calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet13',
           'finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fireplacecnt',
           'fullbathcnt','garagecarcnt','garagetotalsqft','latitude','longitude','lotsizesquarefeet',
           'poolcnt','poolsizesum','censustractandblock','roomcnt','threequarterbathnbr','unitcnt',
           'yardbuildingsqft17','yardbuildingsqft26','numberofstories',
            'structuretaxvaluedollarcnt','taxvaluedollarcnt','landtaxvaluedollarcnt','taxamount',
           'N-ValueRatio', 'N-LivingAreaProp', 'N-ValueProp']

# categorical varaibles
cat_atts = ['airconditioningtypeid','architecturalstyletypeid',
           'buildingclasstypeid','heatingorsystemtypeid','pooltypeid10','pooltypeid2',
            'pooltypeid7','propertylandusetypeid','regionidcounty',
           'storytypeid','typeconstructiontypeid','yearbuilt','fireplaceflag',
           'taxdelinquencyflag']

# Dictionary of categorical variables and their default levels
cat_dict = {key:value for key,value in {'airconditioningtypeid':[-1] + list(range(1,14)),
           'architecturalstyletypeid':[-1] + list(range(1,28)),
           'buildingclasstypeid':[-1] + list(range(1,6)),
            'heatingorsystemtypeid':[-1] + list(range(1,26)),
            'pooltypeid10': list(range(-1,2)),
            'pooltypeid2': list(range(-1,2)),
            'pooltypeid7': list(range(-1,2)),
            'Pool': [0,1],
            'propertylandusetypeid': [-1, 31,46,47,246,247,248,260,261,262,263,264,265,266,267,268,269,270,271,
                                     273,274,275,276,279,290,291],
            'regionidcounty': [2061,3101,1286],
            'storytypeid':[-1] + list(range(1,36)),
            'typeconstructiontypeid':[-1] + list(range(1,19)),
            'yearbuilt': [-1] + list(range(1885,2018)),
            'fireplaceflag': [-1] + ['True','False'],
            'taxdelinquencyflag': [-1] + ['Y','N']
           }.items() if key in cat_atts}

In [8]:
# Categorical pipeline
cat_pipeline = Pipeline([
        ('select_and_dummify', pipes.DF_Selector_GetDummies(cat_dict)),
    ])

# Numerical pipeline
num_pipeline = Pipeline([
        ('selector', pipes.DataFrameSelector(num_atts)),
        ('imputer', Imputer()),
        ('scaler', StandardScaler())
    ])

# Full pipeline
feature_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline)
    ])

### Splitting data into the 10-Folds

In [9]:
indices = np.arange(data.shape[0])

In [10]:
np.random.seed(19)
np.random.shuffle(indices) # in-place shuffling 
indices

array([ 2199, 86155, 84691, ..., 86952, 82677, 76398])

In [11]:
fold_indices = {(i+1):indices[i::10] for i in range(10)}

In [12]:
fold_indices

{1: array([ 2199, 83721, 29492, ..., 37852, 48911, 39220]),
 2: array([86155, 32252, 81949, ..., 57319, 13479, 33811]),
 3: array([84691, 37597,  3215, ..., 84821, 43372, 86952]),
 4: array([11172, 67082, 58364, ..., 74500, 63830, 82677]),
 5: array([78769, 73075, 17232, ..., 12489,   266, 76398]),
 6: array([53035, 17238, 32604, ..., 14649, 26827, 61025]),
 7: array([58194, 72307,  3380, ..., 57397, 68361, 53125]),
 8: array([66378, 81551, 66156, ..., 73922, 85799, 45218]),
 9: array([70318, 70507, 20646, ...,  7537, 69584, 17218]),
 10: array([42552, 66817, 57336, ..., 88913, 67815, 17738])}

### Training Models on the 10 splits of data \ fold_i for i = 1,...,10 & obtaining level 1 data

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone

import warnings
warnings.filterwarnings("ignore")

In [81]:
feature_pipeline.fit(data)

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(desired_cols=['bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet13', 'fi...22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], 'regionidcounty': [2061, 3101, 1286]}))]))],
       transformer_weights=None)

In [84]:
models = [
    ("network", MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3, 50), random_state=1)),
    ("logistic", LogisticRegression(penalty='l1', tol=0.01)),
    ("rf_1",RandomForestClassifier(max_depth=8, random_state=9,class_weight={0:0.52,1:0.48})),
    ("rf_2", RandomForestClassifier(max_depth=12, random_state=9)),
    ("rf_3", RandomForestClassifier(max_depth=12, random_state=9, class_weight={0:0.55,1:0.45})),
]

In [85]:
level_one_data = data[['parcelid']].copy()

for pair in models:
    current_model_name,current_model = pair
    print("Current model: %s" % current_model_name)
    
    # initialize an NoneObject to be a placeholder for level-one data for current model
    model_preds = None 
    
    for fold_nbr in range(1,11):
        print("...working on fold %d" % fold_nbr)

        # set training data X \ fold
        current_traindata = data.iloc[np.setdiff1d(indices,fold_indices[fold_nbr]),]

        # get a clone of the model and fit the current training data
        print('......training model')
        clf = clone(current_model)
        clf.fit(feature_pipeline.transform(current_traindata), current_traindata['overestimation'])

        # level-one data (i.e., predict observations on current fold using reg)
        print('......obtaining level-one data')
        fold_data = data.iloc[fold_indices[fold_nbr]]
        fold_preds = Series(clf.predict_proba(feature_pipeline.transform(fold_data))[:,1], 
                            index=fold_indices[fold_nbr], name = current_model_name)

        # adding to the placeholder for level-one data
        if model_preds is not None:
            model_preds = pd.concat([model_preds, fold_preds])
        else:
            model_preds = fold_preds

        # some housecleaning
        del clf
    
    # add level-one predictions of current model to running dataframe
    level_one_data = pd.concat([level_one_data, model_preds], axis=1)
    print("")
    
print("all done!")

Current model: network
...working on fold 1
......training model
......obtaining level-one data
...working on fold 2
......training model
......obtaining level-one data
...working on fold 3
......training model
......obtaining level-one data
...working on fold 4
......training model
......obtaining level-one data
...working on fold 5
......training model
......obtaining level-one data
...working on fold 6
......training model
......obtaining level-one data
...working on fold 7
......training model
......obtaining level-one data
...working on fold 8
......training model
......obtaining level-one data
...working on fold 9
......training model
......obtaining level-one data
...working on fold 10
......training model
......obtaining level-one data

Current model: logistic
...working on fold 1
......training model
......obtaining level-one data
...working on fold 2
......training model
......obtaining level-one data
...working on fold 3
......training model
......obtaining level-one data
..

In [86]:
level_one_data.head()

Unnamed: 0,parcelid,network,logistic,rf_1,rf_2,rf_3
0,17073783,0.540814,0.58654,0.606695,0.725421,0.570743
1,17088994,0.575728,0.602209,0.520296,0.538926,0.501637
2,17100444,0.61868,0.517599,0.527499,0.53782,0.490139
3,17102429,0.667181,0.589822,0.535683,0.560426,0.524229
4,17109604,0.561772,0.588489,0.619031,0.568314,0.526031


In [25]:
level_one_data.to_csv("/home/anerdi/Desktop/Zillow/levelonedata/stage1_l1data_logistic_rf.csv", index=False)

### The Stacked Model

In [87]:
training_data = pd.merge(level_one_data, data[['overestimation','parcelid']], on='parcelid')
training_data.head()

Unnamed: 0,parcelid,network,logistic,rf_1,rf_2,rf_3,overestimation
0,17073783,0.540814,0.58654,0.606695,0.725421,0.570743,1
1,17088994,0.575728,0.602209,0.520296,0.538926,0.501637,1
2,17100444,0.61868,0.517599,0.527499,0.53782,0.490139,1
3,17102429,0.667181,0.589822,0.535683,0.560426,0.524229,0
4,17109604,0.561772,0.588489,0.619031,0.568314,0.526031,1


In [88]:
stacked = LogisticRegression(penalty='l1', tol=0.01)

In [89]:
stacked.fit(training_data[['network','rf_2','rf_3']], training_data['overestimation'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.01,
          verbose=0, warm_start=False)

In [90]:
stacked.coef_[0]

array([ 2.28956589,  1.07382395,  1.18851069])

In [91]:
stacked.intercept_

array([-2.24765693])

In [92]:
stacked.score(training_data[['network','rf_2','rf_3']], training_data['overestimation'])

0.57992643078860451

In [93]:
pred = stacked.predict(training_data[['network','rf_2','rf_3']])
true = training_data['overestimation']

In [94]:
print(recall_score(true,pred))
print(precision_score(true,pred))
print(f1_score(true,pred))

0.875475641253
0.583378220371
0.700184487299


In [69]:
print(recall_score(true,pred))
print(precision_score(true,pred))
print(f1_score(true,pred))

0.873228051497
0.582186468972
0.698607233553


In [95]:
confusion_matrix(true, pred, labels=[0,1])

array([[ 8094, 31712],
       [ 6316, 44405]])

In [70]:
confusion_matrix(true, pred, labels=[0,1])

array([[ 8020, 31786],
       [ 6430, 44291]])

### Training on full dataset

In [96]:
clf_ann = clone(models[0][1])
clf_rf2 = clone(models[3][1])
clf_rf3 = clone(models[4][1])

In [97]:
clf_ann.fit(feature_pipeline.transform(data), data['overestimation'])
clf_rf2.fit(feature_pipeline.transform(data), data['overestimation'])
clf_rf3.fit(feature_pipeline.transform(data), data['overestimation'])

RandomForestClassifier(bootstrap=True, class_weight={0: 0.55, 1: 0.45},
            criterion='gini', max_depth=12, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=9,
            verbose=0, warm_start=False)

In [None]:
models = [('ann',clf_ann),('rf2',clf_rf2), ('rf3',clf_rf3)]
overestimate_probs = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])
for pair in models:
    model_name, model = pair
    probabilities = None
    for i in range(int(properties.shape[0] / 100000)):   
        # get current test features
        current_test_feats = feature_pipeline.transform(properties.iloc[i*100000:(i+1)*100000])

        # predict on current test obs
        current_probs = Series(model.predict_proba(current_test_feats)[:,1], name='%s_overestimate_prob'%model_name,
                              index = np.arange(i*100000,(i+1)*100000))

        if probabilities is not None:
            probabilities = pd.concat([probabilities, current_probs])
        else:
            probabilities = current_probs

    #  fencepost problem
    current_test_feats = feature_pipeline.transform(properties.iloc[2900000:])
    current_probs = Series(model.predict_proba(current_test_feats)[:,1], name='%s_overestimate_prob'%model_name,
                          index = np.arange(2900000,2985217))
    probabilities = pd.concat([probabilities, current_probs])
    overestimate_probs = pd.concat([overestimate_probs, probabilities], axis=1)

In [76]:
overestimate_probs['stacked_pred'] = 1 / (1 + np.exp(-stacked.intercept_[0]
        - overestimate_probs['rf2_overestimate_prob']*stacked.coef_[0][0]
                                            - overestimate_probs['rf3_overestimate_prob']*stacked.coef_[0][1]))

In [77]:
overestimate_probs.head()

Unnamed: 0,parcelid,rf2_overestimate_prob,rf3_overestimate_prob,stacked_pred
0,10754147,0.466012,0.495383,0.492836
1,10759547,0.555562,0.524675,0.564805
2,10843547,0.548264,0.548752,0.574198
3,10859147,0.663067,0.543329,0.638632
4,10879947,0.519636,0.485341,0.520014


In [78]:
overestimate_probs.to_csv("/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_rfs.csv.gz", 
                          index=False, compression="gzip")