In [7]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
import feature_pipelines as pipes

### Reading in Data

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
logerror['weeknumber'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').isocalendar()[1])
logerror['month'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#proportion of living area
properties['N-LivingAreaProp'] = properties['calculatedfinishedsquarefeet']/properties['lotsizesquarefeet']

#Ratio of the built structure value to land area
properties['N-ValueProp'] = properties['structuretaxvaluedollarcnt']/properties['landtaxvaluedollarcnt']

#Ratio of tax of property over parcel
properties['N-ValueRatio'] = properties['taxvaluedollarcnt']/properties['taxamount']

# Pool
properties['Pool'] = (properties['pooltypeid2'].fillna(0) + properties['pooltypeid7'].fillna(0)).astype(int)

In [4]:
# join on parcel id
data = pd.merge(properties,logerror[['parcelid','logerror','month']], on='parcelid')
data['wts_oct'] = np.where(data['month'] == 10, 1.5, 1)
data['wts_nov'] = np.where(data['month'] == 11, 1.5, 1)
data['wts_dec'] = np.where(data['month'] == 12, 1.5, 1)

### Feature Pipeline

In [8]:
# Setup variables considered in the model

# numerical variables
# num_atts = ['calculatedfinishedsquarefeet','bathroomcnt','structuretaxvaluedollarcnt',
#              'bedroomcnt','calculatedbathnbr','roomcnt','longitude','threequarterbathnbr', 'yardbuildingsqft17',
#              'numberofstories','N-ValueRatio','N-ValueProp','N-LivingAreaProp']

num_atts = ['calculatedfinishedsquarefeet','bathroomcnt','structuretaxvaluedollarcnt',
             'bedroomcnt','calculatedbathnbr','N-ValueRatio','N-ValueProp','N-LivingAreaProp']

# categorical varaibles
cat_atts = ['airconditioningtypeid','heatingorsystemtypeid','Pool','propertylandusetypeid','taxdelinquencyflag',
           'regionidcounty']

# Dictionary of categorical variables and their default levels
cat_dict = {key:value for key,value in {'airconditioningtypeid':[-1] + list(range(1,14)),
           'architecturalstyletypeid':[-1] + list(range(1,28)),
           'buildingclasstypeid':[-1] + list(range(1,6)),
            'heatingorsystemtypeid':[-1] + list(range(1,26)),
            'pooltypeid10': list(range(-1,2)),
            'pooltypeid2': list(range(-1,2)),
            'pooltypeid7': list(range(-1,2)),
            'Pool': [0,1],
            'propertylandusetypeid': [-1, 31,46,47,246,247,248,260,261,262,263,264,265,266,267,268,269,270,271,
                                     273,274,275,276,279,290,291],
            'regionidcounty': [2061,3101,1286],
            'storytypeid':[-1] + list(range(1,36)),
            'typeconstructiontypeid':[-1] + list(range(1,19)),
            'yearbuilt': [-1] + list(range(1885,2018)),
            'fireplaceflag': [-1] + ['True','False'],
            'taxdelinquencyflag': [-1] + ['Y','N']
           }.items() if key in cat_atts}

# pairs to interact (x1,x2) where x1 is categorical and x2 is continuous
interact_pairs = [('regionidcounty','bathroomcnt'),('regionidcounty','bedroomcnt')
                 ,('regionidcounty','structuretaxvaluedollarcnt')]

In [9]:
# Categorical pipeline
cat_pipeline = Pipeline([
        ('select_and_dummify', pipes.DF_Selector_GetDummies(cat_dict)),
    ])

# Numerical pipeline
num_pipeline = Pipeline([
        ('selector', pipes.DataFrameSelector(num_atts)),
        ('imputer', Imputer()),
    ])

# interaction pipeline
interact_pipeline = Pipeline([
        ('dummify_and_interact',pipes.Dummify_and_Interact(interact_pairs,cat_dict)),
    ])

# Full pipeline
feature_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
        ("interact_pipeline", interact_pipeline)
    ])

### Splitting data into the 10-Folds

In [10]:
indices = np.arange(data.shape[0])

In [11]:
np.random.seed(19)
np.random.shuffle(indices) # in-place shuffling 
indices

array([ 2199, 86155, 84691, ..., 86952, 82677, 76398])

In [12]:
fold_indices = {(i+1):indices[i::10] for i in range(10)}

In [13]:
fold_indices

{1: array([ 2199, 83721, 29492, ..., 37852, 48911, 39220]),
 2: array([86155, 32252, 81949, ..., 57319, 13479, 33811]),
 3: array([84691, 37597,  3215, ..., 84821, 43372, 86952]),
 4: array([11172, 67082, 58364, ..., 74500, 63830, 82677]),
 5: array([78769, 73075, 17232, ..., 12489,   266, 76398]),
 6: array([53035, 17238, 32604, ..., 14649, 26827, 61025]),
 7: array([58194, 72307,  3380, ..., 57397, 68361, 53125]),
 8: array([66378, 81551, 66156, ..., 73922, 85799, 45218]),
 9: array([70318, 70507, 20646, ...,  7537, 69584, 17218]),
 10: array([42552, 66817, 57336, ..., 88913, 67815, 17738])}

### Loading Stage 1 estimated probabilities

In [59]:
stacked_rfs_probabilities = pd.read_csv("/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_rfs.csv.gz")
stacked_rfs_probabilities.rename(columns={'stacked_pred':"overestimate_prob"}, inplace=True)
stacked_rfs_probabilities = pd.merge(data[['parcelid']], stacked_rfs_probabilities, on='parcelid')

stacked_annrfs_probabilities = pd.read_csv("/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_ann_rfs.csv.gz")
stacked_annrfs_probabilities.rename(columns={'stacked_pred':"overestimate_prob"}, inplace=True)
stacked_annrfs_probabilities = pd.merge(data[['parcelid']], stacked_annrfs_probabilities, on='parcelid')

logistic_probabiliies = pd.read_csv("/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs.csv.gz")
logistic_probabiliies = pd.merge(data[['parcelid']], logistic_probabiliies, on='parcelid')

In [61]:
assert (stacked_rfs_probabilities.parcelid == data.parcelid).all()
assert (stacked_annrfs_probabilities.parcelid == data.parcelid).all()
assert (logistic_probabiliies.parcelid == data.parcelid).all()

In [63]:
stage1_models = [
    ('stacked_rfs', stacked_rfs_probabilities),
    ('stacked_annrfs', stacked_annrfs_probabilities),
    ('logistic', logistic_probabiliies)
]

### Training Models on the 10 splits of data \ fold_i for i = 1,...,10 & obtaining level 1 data

In [64]:
from sklearn.linear_model import ElasticNet, Lars, HuberRegressor
from sklearn.base import clone

import warnings
warnings.filterwarnings("ignore")

In [65]:
feature_pipeline.fit(properties) #fitting the pipeline to the entire properties dataframe

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(desired_cols=['calculatedfinishedsquarefeet', 'bathroomcnt', 'structuretaxvaluedollarcnt', 'bedroomcnt', 'calculatedbathnbr', 'N-ValueRatio', 'N-ValueProp', 'N-LivingAreaProp'])), ('imputer', Imputer(...roomcnt'), ('regionidcounty', 'bedroomcnt'), ('regionidcounty', 'structuretaxvaluedollarcnt')]))]))],
       transformer_weights=None)

In [108]:
stage2_models = [
    ("ridge",ElasticNet(alpha=1.25, l1_ratio = 0, max_iter=1000)),
    ("enet", ElasticNet(alpha=1.25, l1_ratio = 0.5, max_iter=1000)),
    ("lasso", ElasticNet(alpha=1.25, l1_ratio = 1, max_iter=1000)),
    ("larm", Lars(n_nonzero_coefs = 1)),
    ("huber", HuberRegressor())
]

In [107]:
# split training data into over/under subsets
ix_overestimated = np.where(data['logerror'] >= 0)[0]
ix_underestimated = np.where(data['logerror'] < 0)[0]
data_indices = {"over": ix_overestimated, "under": ix_underestimated}

In [109]:
level_one_data = data[['parcelid']].copy()

for stage1_pair in stage1_models:
    stage1_name, stage1_probs = stage1_pair
    print("Stage 1: %s\t " % (stage1_name))
    
    for pair in stage2_models:
        current_model_name,current_model = pair
        print("...Current Model: %s" % (current_model_name))

        # initialize an NoneObject to be a placeholder for level-one data for current model
        model_preds = None 

        for fold_nbr in range(1,11):
            print("...working on fold %d" % fold_nbr)

            # set training data X \ fold
            fold_trainindices = np.setdiff1d(indices,fold_indices[fold_nbr])
            fold_traindata = data.iloc[fold_trainindices,]

            # training the over/under models on their respective training data
            fold_preds_dict = {'over': None, 'under':None}
            for key,val in data_indices.items():
                type_of_zestimate, ix = key, val

                # preprocess current training data
                current_traindata = data.iloc[np.intersect1d(ix, fold_trainindices),]

                # get a clone of the model and fit the current training data
                print('......training model')
                reg = clone(current_model)
                reg.fit(feature_pipeline.transform(current_traindata), current_traindata['logerror'])

                # level-one data (i.e., predict observations on current fold using reg)
                print('......obtaining level-one data')
                fold_data = data.iloc[fold_indices[fold_nbr]]
                fold_preds_overunder = Series(reg.predict(feature_pipeline.transform(fold_data)), 
                                    index=fold_indices[fold_nbr], name = current_model_name)
                fold_preds_dict[type_of_zestimate] = fold_preds_overunder

            # combine over/under fold preds to get a single prediction
            fold_stage1_overestimate_probs = stage1_probs.iloc[fold_indices[fold_nbr]]['overestimate_prob'] 
            fold_preds = (fold_preds_dict['over']*fold_stage1_overestimate_probs
                              + fold_preds_dict['under']*(1-fold_stage1_overestimate_probs))
            fold_preds.name = stage1_name + '_' + current_model_name

            # adding to the placeholder for level-one data
            if model_preds is not None:
                model_preds = pd.concat([model_preds, fold_preds])
            else:
                model_preds = fold_preds

            # some housecleaning
            del reg

        # add level-one predictions of current model to running dataframe
        level_one_data = pd.concat([level_one_data, model_preds], axis=1)
        print("")
    
print("all done!")

Stage 1: stacked_rfs	 
...Current Model: ridge
...working on fold 1
......training model
......obtaining level-one data
......training model
......obtaining level-one data
...working on fold 2
......training model
......obtaining level-one data
......training model
......obtaining level-one data
...working on fold 3
......training model
......obtaining level-one data
......training model
......obtaining level-one data
...working on fold 4
......training model
......obtaining level-one data
......training model
......obtaining level-one data
...working on fold 5
......training model
......obtaining level-one data
......training model
......obtaining level-one data
...working on fold 6
......training model
......obtaining level-one data
......training model
......obtaining level-one data
...working on fold 7
......training model
......obtaining level-one data
......training model
......obtaining level-one data
...working on fold 8
......training model
......obtaining level-one data
.....

In [110]:
level_one_data.head()

Unnamed: 0,parcelid,stacked_rfs_ridge,stacked_rfs_enet,stacked_rfs_lasso,stacked_rfs_larm,stacked_rfs_huber,stacked_annrfs_ridge,stacked_annrfs_enet,stacked_annrfs_lasso,stacked_annrfs_larm,stacked_annrfs_huber,logistic_ridge,logistic_enet,logistic_lasso,logistic_larm,logistic_huber
0,17073783,0.024176,0.025774,0.02602,0.027547,0.014576,0.01877,0.019923,0.02011,0.021019,0.011123,0.018028,0.01912,0.019298,0.020122,0.010649
1,17088994,0.004663,0.003873,0.003904,0.004764,0.000855,0.006679,0.005929,0.006016,0.007209,0.001368,0.009354,0.008658,0.008818,0.010454,0.002048
2,17100444,0.012671,0.011883,0.01194,0.012623,0.007172,0.016844,0.016133,0.016214,0.017172,0.009779,0.005232,0.004307,0.004321,0.004514,0.002524
3,17102429,0.012099,0.010563,0.010574,0.010785,0.008539,0.020774,0.018797,0.018773,0.019321,0.015419,0.012408,0.010856,0.010865,0.011089,0.008784
4,17109604,0.018363,0.017627,0.017683,0.015224,0.014375,0.016069,0.015316,0.015377,0.01267,0.012031,0.020961,0.020245,0.020294,0.018116,0.01703


In [111]:
level_one_data.to_csv("~/home/an/levelonedata/l1data_twostage_linear_models.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'levelonedata/l1data_twostage_linear_models.csv'