In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

### Submission Functions 

In [2]:
def generate_submissions(oct_model,nov_model,dec_model,name='new_submission',logy=True):
    """
    This function creates the submission file for the public leaderboard predictions.
    Three already fitted models, one for each of the predicting time points, is required.
    """
    submission_df = DataFrame()
    for i in range(int(properties.shape[0] / 100000)):
        all_feats = full_pipeline.transform(properties.iloc[i*100000:(i+1)*100000])
        foo = properties.iloc[i*100000:(i+1)*100000][['parcelid']].reset_index(drop=True)
        if logy:
            foo = pd.concat([foo, DataFrame({'201610': oct_model.predict(all_feats),
                                                            '201611': nov_model.predict(all_feats),
                                                            '201612': dec_model.predict(all_feats)})], axis=1)
        else:
            foo = pd.concat([foo, DataFrame({'201610': np.log(oct_model.predict(all_feats)),
                                                            '201611': np.log(nov_model.predict(all_feats)),
                                                            '201612': np.log(dec_model.predict(all_feats))})], axis=1)
        submission_df = pd.concat([submission_df, foo], ignore_index=True)

    #  fencepost problem
    all_feats = full_pipeline.transform(properties.iloc[2900000:])
    foo = properties.iloc[2900000:][['parcelid']].reset_index(drop=True)
    foo = pd.concat([foo, DataFrame({'201610': oct_model.predict(all_feats),
                                                    '201611': nov_model.predict(all_feats),
                                                    '201612': dec_model.predict(all_feats)})], axis=1)
    submission_df = pd.concat([submission_df, foo], ignore_index=True)
    
    submission_df['201710'] = 0
    submission_df['201711'] = 0
    submission_df['201712'] = 0
    
    submission_df.rename(columns={'parcelid':'ParcelId'}, inplace=True)    
#     submission_df[['201610','201611','201612','201710','201711','201712']]= submission_df[['201610','201611','201612',
#                                                                                            '201710','201711','201712']].round(4)
    # unit test
    submission_df.drop_duplicates(inplace=True)
    assert submission_df.shape[0] == properties.shape[0]
    # write to .csv
    submission_df[['ParcelId','201610','201611','201612',
                  '201710','201711','201712']].to_csv(name + ".gz", index=False, float_format='%.4g', compression='gzip')
    return submission_df

In [3]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    This function takes a submission entry for public leaderboard, and returns
    the training error for each month.
    """
    # training error
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']], comparison_df[['parcelid','logerror','month']],
                           left_on='ParcelId', right_on='parcelid')
    oct_error = abs(trainresults[trainresults['month'] == 10]['201610'] 
                    - trainresults[trainresults['month'] == 10]['logerror']).mean()
    nov_error = abs(trainresults[trainresults['month'] == 11]['201611'] 
                    - trainresults[trainresults['month'] == 11]['logerror']).mean()
    dec_error = abs(trainresults[trainresults['month'] == 12]['201612'] 
                    - trainresults[trainresults['month'] == 12]['logerror']).mean()
    overall_mae = (oct_error*(trainresults['month'] == 10).sum() + nov_error*(trainresults['month'] == 11).sum() 
                        + dec_error*(trainresults['month'] == 12).sum()) / (trainresults['month'].isin([10,11,12])).sum()
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [5]:
maindir = "/home/anerdi/Desktop/Zillow"

logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
logerror['weeknumber'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').isocalendar()[1])
logerror['month'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")
# test_parcels = pd.read_csv(maindir + "/data/sample_submission.csv", usecols = ['ParcelId'])
# test_parcels.rename(columns={'ParcelId':'parcelid'}, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


### Additional Features

In [6]:
#proportion of living area
properties['N-LivingAreaProp'] = properties['calculatedfinishedsquarefeet']/properties['lotsizesquarefeet']

#Ratio of the built structure value to land area
properties['N-ValueProp'] = properties['structuretaxvaluedollarcnt']/properties['landtaxvaluedollarcnt']

#Ratio of tax of property over parcel
properties['N-ValueRatio'] = properties['taxvaluedollarcnt']/properties['taxamount']

In [7]:
# join on parcel id
data = pd.merge(properties,logerror[['parcelid','logerror','month']], on='parcelid')

### Data  Preprocessing Pipeline

In [8]:
num_atts = ['calculatedfinishedsquarefeet','bathroomcnt','structuretaxvaluedollarcnt',
             'bedroomcnt','calculatedbathnbr','roomcnt','longitude','threequarterbathnbr', 'yardbuildingsqft17',
             'numberofstories','N-ValueRatio','N-ValueProp','N-LivingAreaProp']

# categorical varaibles
cat_atts = ['airconditioningtypeid','heatingorsystemtypeid','pooltypeid10','pooltypeid2',
            'pooltypeid7','propertylandusetypeid','taxdelinquencyflag']

# Dictionary of categorical variables and their default levels
cat_dict = {key:value for key,value in {'airconditioningtypeid':[-1] + list(range(1,14)),
           'architecturalstyletypeid':[-1] + list(range(1,28)),
           'buildingclasstypeid':[-1] + list(range(1,6)),
            'heatingorsystemtypeid':[-1] + list(range(1,26)),
            'pooltypeid10': list(range(-1,2)),
            'pooltypeid2': list(range(-1,2)),
            'pooltypeid7': list(range(-1,2)),
            'propertylandusetypeid': [-1, 31,46,47,246,247,248,260,261,262,263,264,265,266,267,268,269,270,271,
                                     273,274,275,276,279,290,291],
            'regionidcounty': [2061,3101,1286],
            'storytypeid':[-1] + list(range(1,36)),
            'typeconstructiontypeid':[-1] + list(range(1,19)),
            'yearbuilt': [-1] + list(range(1885,2018)),
            'fireplaceflag': [-1] + ['True','False'],
            'taxdelinquencyflag': [-1] + ['Y','N']
           }.items() if key in cat_atts}

# pairs to interact (x1,x2) where x1 is categorical and x2 is continuous
interact_pairs = [('regionidcounty','bathroomcnt'),('regionidcounty','bedroomcnt')
                 ,('regionidcounty','structuretaxvaluedollarcnt')]

In [9]:
# A custom transformer, which selects certain variables
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, desired_cols):
        self.desired_cols = desired_cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.desired_cols].values

# A custom transformer, which first selects the categorical variables
# from the DataFrame and then performs the dummification
class DF_Selector_GetDummies(BaseEstimator, TransformerMixin):
    def __init__(self, cat_dict):
        self.cat_dict = cat_dict
        self.ndummies = sum(len(c) - 1  for c in cat_dict.values()) 
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.fillna(-1) # missing values are given -1 missing label
        foo = np.zeros((X.shape[0],self.ndummies))
        start = 0
        end = 0
        for c in sorted(self.cat_dict.keys()):
            end += len(self.cat_dict[c]) - 1
            foo[:, start:end] = pd.get_dummies(X[c].astype('category', categories=self.cat_dict[c]))[self.cat_dict[c][1:]]
            start += len(self.cat_dict[c]) - 1
        return foo

class Dummify_and_Interact(BaseEstimator, TransformerMixin):
    def __init__(self, interact_pairs, cat_dict):
        self.interact_pairs = interact_pairs
        self.cat_dict = cat_dict
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        features = None
        for pair in self.interact_pairs:
            x1,x2 = pair
            # impute x2 if missing
            imputer = Imputer()
            if np.isnan(X[x2]).any():
                x2vals = imputer.fit_transform(X[[x2]])
            else:
                x2vals = X[[x2]].as_matrix()
            # dummify x1 and multiply by x2vals
#             bar = ((pd.get_dummies(X[x1].astype('category', 
#                     categories=self.cat_dict[x1]))[self.cat_dict[x1][1:]]).as_matrix() * x2vals)
            bar = pd.get_dummies(X[x1].astype('category'), drop_first=True)
            if features is not None:
                features = np.concatenate((features,bar),axis=1)
            else:
                features = bar
        return features

In [10]:
# Categorical pipeline
cat_pipeline = Pipeline([
        ('select_and_dummify', DF_Selector_GetDummies(cat_dict)),
    ])

# Numerical pipeline
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_atts)),
        ('imputer', Imputer()),
    ])

# interaction pipeline
interact_pipeline = Pipeline([
        ('dummify_and_interact',Dummify_and_Interact(interact_pairs, cat_dict)),
    ])

# Full pipeline
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
        ("interact_pipeline", interact_pipeline)
    ])

In [11]:
import gc

In [12]:
gc.collect()

0

## Training Elastic Net

In [13]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [14]:
data['wts_oct'] = np.where(data['month'] == 10, 1.5, 1)
data['wts_nov'] = np.where(data['month'] == 11, 1.5, 1)
data['wts_dec'] = np.where(data['month'] == 12, 1.5, 1)

## Creating a train/test set

In [15]:
full_pipeline.fit(properties)
X_train = full_pipeline.transform(data)
X_train.shape

(90275, 90)

In [16]:
Y_train = data['logerror'].values

In [17]:
ridge = ElasticNet(alpha=1, l1_ratio = 0, max_iter=1000)

In [18]:
ridge.fit(X_train, Y_train)



ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=0,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [19]:
coef = ridge.coef_

In [20]:
n_samples, n_features = X_train.shape

In [21]:
weight = np.ones(n_features)

In [22]:
# when gamma=0.5, alpha should be between 0.1 and 0.001
gamma = 0.5
alpha = 0.01

In [23]:
weight[coef != 0] = 1/(abs(coef[coef != 0])**gamma)

In [24]:
X_w = X_train/weight[np.newaxis, :]

In [25]:
clf = Lasso(alpha=alpha, fit_intercept=True)

In [26]:
clf.fit(X_w, Y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [27]:
clf.coef_ = clf.coef_/weight

In [28]:
clf.coef_

array([  6.18173785e-07,   0.00000000e+00,   1.27821281e-08,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,  -0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,

In [29]:
coef

array([  8.51507357e-06,  -1.44497447e-04,  -1.03769905e-08,
         1.21071386e-04,  -2.23147094e-04,  -2.56125534e-05,
        -7.40405402e-10,   3.86200155e-05,  -2.99226541e-05,
        -1.12168583e-05,   1.02077246e-04,   2.91585228e-04,
        -2.67074523e-04,   1.15114203e-04,   0.00000000e+00,
         9.57517013e-07,   0.00000000e+00,   1.09433580e-05,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         9.85054254e-09,   0.00000000e+00,   1.37441781e-05,
         0.00000000e+00,   1.31568127e-05,   2.46301444e-06,
         4.36800453e-04,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,  -1.09475300e-05,   6.43625402e-05,
         0.00000000e+00,   0.00000000e+00,  -7.74626377e-07,
        -3.18639410e-07,  -1.86020726e-07,  -8.61481672e-06,
        -3.91619804e-07,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.41607273e-06,   0.00000000e+00,
        -1.62994267e-05,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,

In [30]:
print("predicting")
print("prediction using adaptive lass model...")

submission_df = generate_submissions(clf, clf, clf, name="Adp-lasso-af")

predicting
prediction using adaptive lass model...


In [31]:
mean_absolute_errors(submission_df, data)

(0.06346126018645491,
 0.06249241099265744,
 0.07486079769105185,
 0.06557489599687634)

In [35]:
Ridge = generate_submissions(oct_enet, oct_enet, oct_enet, name="Ridge")
mean_absolute_errors(submission_df, data)

(0.06357440087254919,
 0.06264618278176387,
 0.07537971498141846,
 0.065779331216912304)

In [36]:
Enet = generate_submissions(nov_enet, nov_enet, nov_enet, name="ElasticNet")
mean_absolute_errors(submission_df, data)

(0.06357440087254919,
 0.06264618278176387,
 0.07537971498141846,
 0.065779331216912304)

In [37]:
Lasso = generate_submissions(dec_enet, dec_enet, dec_enet, name="Lasso")
mean_absolute_errors(submission_df, data)

(0.06357440087254919,
 0.06264618278176387,
 0.07537971498141846,
 0.065779331216912304)

In [43]:
amyhighscore = pd.read_csv("9010_XGB_3000_RF.gz", compression="gzip")
mean_absolute_errors(amyhighscore, data)

(0.06221434396363281,
 0.06144365500547652,
 0.07391738266532494,
 0.064432127417700866)

In [25]:
submission_df.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,-0.139763,-0.139763,-0.139763,0,0,0
1,10759547,-0.139977,-0.139977,-0.139977,0,0,0
2,10843547,0.214607,0.214607,0.214607,0,0,0
3,10859147,-0.00525,-0.00525,-0.00525,0,0,0
4,10879947,0.003303,0.003303,0.003303,0,0,0


In [40]:
Enet.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,-0.019564,-0.019564,-0.019564,0,0,0
1,10759547,-0.019629,-0.019629,-0.019629,0,0,0
2,10843547,0.212738,0.212738,0.212738,0,0,0
3,10859147,0.016817,0.016817,0.016817,0,0,0
4,10879947,0.010969,0.010969,0.010969,0,0,0


In [38]:
Lasso.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,-0.018543,-0.018543,-0.018543,0,0,0
1,10759547,-0.018589,-0.018589,-0.018589,0,0,0
2,10843547,0.203752,0.203752,0.203752,0,0,0
3,10859147,0.018183,0.018183,0.018183,0,0,0
4,10879947,0.010949,0.010949,0.010949,0,0,0


## Model Averaging 

In [42]:
RF = pd.read_csv("RF_n100_maxfeat5_maxdepth8.gz", compression="gzip")
mean_absolute_errors(RF, data)

(0.0627264215390796,
 0.06182163198247547,
 0.07473668775158138,
 0.064978084757667934)

In [50]:
RF_2 = pd.read_csv("RF_n100_maxfeat10_maxdepth20_extreme.gz", compression="gzip")
mean_absolute_errors(RF_2, data)

(0.06195625879043588,
 0.06436095290251921,
 0.07321730879815982,
 0.064762854132521586)

In [43]:
XGB = pd.read_csv("XGB_600.gz", compression="gzip")
mean_absolute_errors(XGB, data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

In [47]:
XGB3000 = pd.read_csv("XGB_3000.csv")
mean_absolute_errors(XGB3000, data)

(0.05998706659714675,
 0.05918412810186208,
 0.07170160296434737,
 0.06220029687696086)

In [51]:
blend3 = submission_df[['ParcelId']].copy()
cols = ['201610','201611','201612','201710','201711','201712']
blend3 = pd.concat([blend3,(20/100)*Ridge[cols] + (20/100)*Enet[cols] + (20/100)*Lasso[cols] +
                    (18/100)*XGB[cols] + (18/100)*RF[cols] + (2/100)*XGB3000[cols]
                   +(2/100)*RF_2[cols]], axis=1)
blend3['ParcelId'] = blend3['ParcelId'].astype(int)
assert all(blend3.ParcelId.unique() == submission_df.ParcelId.unique())
mean_absolute_errors(blend3, data)

(0.06268473765862069,
 0.061797918439336595,
 0.07437674098281555,
 0.064875449656555853)

In [52]:
blend3.to_csv("new_ensemble_2.gz", index=False, float_format='%.4g', compression='gzip')

In [27]:
submission_df.to_csv("theilson.gz", index=False, float_format='%.4g', compression='gzip')