In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

### Reading in Data

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
logerror['weeknumber'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').isocalendar()[1])
logerror['month'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#proportion of living area
properties['N-LivingAreaProp'] = properties['calculatedfinishedsquarefeet']/properties['lotsizesquarefeet']

#Ratio of the built structure value to land area
properties['N-ValueProp'] = properties['structuretaxvaluedollarcnt']/properties['landtaxvaluedollarcnt']

#Ratio of tax of property over parcel
properties['N-ValueRatio'] = properties['taxvaluedollarcnt']/properties['taxamount']

# Pool
properties['Pool'] = (properties['pooltypeid2'].fillna(0) + properties['pooltypeid7'].fillna(0)).astype(int)

In [4]:
# join on parcel id
data = pd.merge(properties,logerror[['parcelid','logerror','month']], on='parcelid')
data['wts_oct'] = np.where(data['month'] == 10, 1.5, 1)
data['wts_nov'] = np.where(data['month'] == 11, 1.5, 1)
data['wts_dec'] = np.where(data['month'] == 12, 1.5, 1)

### Feature Pipeline

In [5]:
# Setup variables considered in the model

# numerical variables
num_atts = ['calculatedfinishedsquarefeet','bathroomcnt','structuretaxvaluedollarcnt',
             'bedroomcnt','calculatedbathnbr','roomcnt','longitude','threequarterbathnbr', 'yardbuildingsqft17',
             'numberofstories','N-ValueRatio','N-ValueProp','N-LivingAreaProp']

# num_atts = ['calculatedfinishedsquarefeet','bathroomcnt','structuretaxvaluedollarcnt',
#              'bedroomcnt','calculatedbathnbr','N-ValueRatio','N-ValueProp','N-LivingAreaProp']

# categorical varaibles
cat_atts = ['airconditioningtypeid','heatingorsystemtypeid','Pool','propertylandusetypeid','taxdelinquencyflag']

# Dictionary of categorical variables and their default levels
cat_dict = {key:value for key,value in {'airconditioningtypeid':[-1] + list(range(1,14)),
           'architecturalstyletypeid':[-1] + list(range(1,28)),
           'buildingclasstypeid':[-1] + list(range(1,6)),
            'heatingorsystemtypeid':[-1] + list(range(1,26)),
            'pooltypeid10': list(range(-1,2)),
            'pooltypeid2': list(range(-1,2)),
            'pooltypeid7': list(range(-1,2)),
            'Pool': [0,1],
            'propertylandusetypeid': [-1, 31,46,47,246,247,248,260,261,262,263,264,265,266,267,268,269,270,271,
                                     273,274,275,276,279,290,291],
            'regionidcounty': [2061,3101,1286],
            'storytypeid':[-1] + list(range(1,36)),
            'typeconstructiontypeid':[-1] + list(range(1,19)),
            'yearbuilt': [-1] + list(range(1885,2018)),
            'fireplaceflag': [-1] + ['True','False'],
            'taxdelinquencyflag': [-1] + ['Y','N']
           }.items() if key in cat_atts}

# pairs to interact (x1,x2) where x1 is categorical and x2 is continuous
interact_pairs = [('regionidcounty','bathroomcnt'),('regionidcounty','bedroomcnt')
                 ,('regionidcounty','structuretaxvaluedollarcnt')]

In [6]:
# A custom transformer, which selects certain variables
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, desired_cols):
        self.desired_cols = desired_cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.desired_cols].values

# A custom transformer, which first selects the categorical variables
# from the DataFrame and then performs the dummification
class DF_Selector_GetDummies(BaseEstimator, TransformerMixin):
    def __init__(self, cat_dict):
        self.cat_dict = cat_dict
        self.ndummies = sum(len(c) - 1  for c in cat_dict.values()) 
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.fillna(-1) # missing values are given -1 missing label
        foo = np.zeros((X.shape[0],self.ndummies))
        start = 0
        end = 0
        for c in sorted(self.cat_dict.keys()):
            end += len(self.cat_dict[c]) - 1
            foo[:, start:end] = pd.get_dummies(X[c].astype('category', categories=self.cat_dict[c]))[self.cat_dict[c][1:]]
            start += len(self.cat_dict[c]) - 1
        return foo

class Dummify_and_Interact(BaseEstimator, TransformerMixin):
    def __init__(self, interact_pairs, cat_dict):
        self.interact_pairs = interact_pairs
        self.cat_dict = cat_dict
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        features = None
        for pair in self.interact_pairs:
            x1,x2 = pair
            # impute x2 if missing
            imputer = Imputer()
            if np.isnan(X[x2]).any():
                x2vals = imputer.fit_transform(X[[x2]])
            else:
                x2vals = X[[x2]].as_matrix()
            # dummify x1 and multiply by x2vals
#             bar = ((pd.get_dummies(X[x1].astype('category', 
#                     categories=self.cat_dict[x1]))[self.cat_dict[x1][1:]]).as_matrix() * x2vals)
            bar = pd.get_dummies(X[x1].astype('category'), drop_first=True)
            if features is not None:
                features = np.concatenate((features,bar),axis=1)
            else:
                features = bar
        return features

In [7]:
# Categorical pipeline
cat_pipeline = Pipeline([
        ('select_and_dummify', DF_Selector_GetDummies(cat_dict)),
    ])

# Numerical pipeline
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_atts)),
        ('imputer', Imputer()),
    ])

# interaction pipeline
interact_pipeline = Pipeline([
        ('dummify_and_interact',Dummify_and_Interact(interact_pairs, cat_dict)),
    ])

# Full pipeline
feature_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
        ("interact_pipeline", interact_pipeline)
    ])

### Splitting data into the 10-Folds

In [8]:
indices = np.arange(data.shape[0])

In [9]:
np.random.seed(9)
np.random.shuffle(indices) # in-place shuffling 
indices

array([24283, 53623, 10038, ..., 66037, 55934, 86364])

In [10]:
fold_indices = {(i+1):indices[i::10] for i in range(10)}

In [11]:
fold_indices

{1: array([24283, 81578, 55185, ..., 88696, 19420, 22584]),
 2: array([53623, 89226, 14417, ..., 80106, 39259, 65462]),
 3: array([10038, 42930, 25710, ..., 25406, 50805, 66037]),
 4: array([ 6560, 52504, 69020, ..., 44509, 61176, 55934]),
 5: array([72110, 88930, 66005, ..., 46008, 10481, 86364]),
 6: array([85819, 15175, 52495, ..., 36792, 36810, 62760]),
 7: array([12329, 52468, 49812, ..., 18561, 36027, 88635]),
 8: array([64150, 73168, 74499, ..., 69304, 24814, 70209]),
 9: array([87434, 67401, 85581, ..., 71894, 82876,  5014]),
 10: array([31642, 81045, 14458, ...,  6418, 81792, 42747])}

### Training Models on the 10 splits of data \ fold_i for i = 1,...,10 & obtaining level 1 data

In [12]:
from sklearn.linear_model import ElasticNet, Lars, HuberRegressor
from sklearn.base import clone

import warnings
warnings.filterwarnings("ignore")

In [13]:
feature_pipeline.fit(properties) #fitting the pipeline to the entire properties dataframe

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(desired_cols=['calculatedfinishedsquarefeet', 'bathroomcnt', 'structuretaxvaluedollarcnt', 'bedroomcnt', 'calculatedbathnbr', 'roomcnt', 'longitude', 'threequarterbathnbr', 'yardbuildingsqft17', 'numb...roomcnt'), ('regionidcounty', 'bedroomcnt'), ('regionidcounty', 'structuretaxvaluedollarcnt')]))]))],
       transformer_weights=None)

In [14]:
models = [
    ("ridge",ElasticNet(alpha=1.25, l1_ratio = 0, max_iter=1000)),
    ("enet", ElasticNet(alpha=1.25, l1_ratio = 0.5, max_iter=1000)),
    ("lasso", ElasticNet(alpha=1.25, l1_ratio = 1, max_iter=1000)),
    ("larm", Lars(n_nonzero_coefs = 1)),
    ("huber", HuberRegressor())
]

In [15]:
level_one_data = data[['parcelid']].copy()

for pair in models:
    current_model_name,current_model = pair
    print("Current model: %s" % current_model_name)
    
    # initialize an NoneObject to be a placeholder for level-one data for current model
    model_preds = None 
    
    for fold_nbr in range(1,11):
        print("...working on fold %d" % fold_nbr)

        # set training data X \ fold
        current_traindata = data.iloc[np.setdiff1d(indices,fold_indices[fold_nbr]),]

        # get a clone of the model and fit the current training data
        print('......training model')
        reg = clone(current_model)
        reg.fit(feature_pipeline.transform(current_traindata), current_traindata['logerror'])

        # level-one data (i.e., predict observations on current fold using reg)
        print('......obtaining level-one data')
        fold_data = data.iloc[fold_indices[fold_nbr]]
        fold_preds = Series(reg.predict(feature_pipeline.transform(fold_data)), 
                            index=fold_indices[fold_nbr], name = current_model_name)

        # adding to the placeholder for level-one data
        if model_preds is not None:
            model_preds = pd.concat([model_preds, fold_preds])
        else:
            model_preds = fold_preds

        # some housecleaning
        del reg
    
    # add level-one predictions of current model to running dataframe
    level_one_data = pd.concat([level_one_data, model_preds], axis=1)
    print("")
    
print("all done!")

Current model: ridge
...working on fold 1
......training model
......obtaining level-one data
...working on fold 2
......training model
......obtaining level-one data
...working on fold 3
......training model
......obtaining level-one data
...working on fold 4
......training model
......obtaining level-one data
...working on fold 5
......training model
......obtaining level-one data
...working on fold 6
......training model
......obtaining level-one data
...working on fold 7
......training model
......obtaining level-one data
...working on fold 8
......training model
......obtaining level-one data
...working on fold 9
......training model
......obtaining level-one data
...working on fold 10
......training model
......obtaining level-one data

Current model: enet
...working on fold 1
......training model
......obtaining level-one data
...working on fold 2
......training model
......obtaining level-one data
...working on fold 3
......training model
......obtaining level-one data
...worki

In [16]:
level_one_data.head()

Unnamed: 0,parcelid,ridge,enet,lasso,larm,huber
0,17073783,0.017901,0.010024,0.010388,0.010857,0.011198
1,17088994,0.010651,0.006399,0.007616,0.009479,0.011144
2,17100444,0.008782,0.00861,0.008853,0.010079,0.010893
3,17102429,0.014618,0.01237,0.01201,0.01132,0.011408
4,17109604,0.01805,0.016426,0.015855,0.012941,0.01118


In [17]:
level_one_data.to_csv("levelonedata/l1data_linear_models.csv", index=False)