In [2]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Submission Functions 

In [3]:
def generate_regression_preds(reg,test_df, model_name='pred_logerror', transactiondate='2016-12-01',
                              chunk_size=100000):
    """
    Predict with a fitted regressor on every row of `test_df`, one chunk at a time.

    Each chunk of rows is copied, stamped with the dummy `transactiondate`, expanded
    into date-part columns by `add_date_features`, transformed by the notebook-level
    `feature_pipeline` and passed to `reg.predict`. Working on per-chunk copies keeps
    the caller's frame untouched and bounds the size of the transformed feature block.

    Parameters
    ----------
    reg : fitted estimator exposing `predict`.
    test_df : DataFrame holding the columns `feature_pipeline` selects. Not modified.
    model_name : str, name given to the returned Series.
    transactiondate : value accepted by `pd.Timestamp`; used as the transaction date
        of every row.
    chunk_size : int, number of rows transformed and predicted per batch.

    Returns
    -------
    Series named `model_name` with one prediction per row of `test_df`, indexed by
    row position (0 .. len(test_df) - 1).

    Raises
    ------
    ValueError if `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    n_rows = test_df.shape[0]
    stamp = pd.Timestamp(transactiondate)

    chunk_preds = []
    # range() with a step also yields the final, shorter chunk, so no separate
    # "fencepost" pass with hard-coded row numbers is needed.
    for start in range(0, n_rows, chunk_size):
        chunk = test_df.iloc[start:start + chunk_size].copy()
        chunk['transactiondate'] = stamp  # Dummy
        chunk = add_date_features(chunk)

        current_test_feats = feature_pipeline.transform(chunk)
        chunk_preds.append(np.asarray(reg.predict(current_test_feats)))

    # Assemble once instead of growing a Series with repeated pd.concat calls.
    if chunk_preds:
        values = np.concatenate(chunk_preds)
    else:
        values = np.array([], dtype=float)
    return Series(values, name=model_name, index=np.arange(n_rows))

In [4]:
def generate_submissions(oct_model,nov_model,dec_model,name='new_submission',logy=True,
                         chunk_size=100000):
    """
    Create the submission file for the public leaderboard predictions.

    Three already fitted models, one for each of the predicting time points
    (201610, 201611, 201612), are required. The notebook-level `properties` frame
    is transformed with the notebook-level `full_pipeline` in chunks of
    `chunk_size` rows and each model predicts on every chunk.
    NOTE(review): `full_pipeline` is not defined in the visible part of this
    notebook (only `feature_pipeline` is) -- confirm where it comes from.

    Parameters
    ----------
    oct_model, nov_model, dec_model : fitted estimators exposing `predict`.
    name : str, output path without extension; the file is written to name + ".gz".
    logy : bool. When True the model outputs are written as they are; when False
        `np.log` is applied to them first. This is applied to every chunk,
        including the last partial one.
    chunk_size : int, number of property rows processed per batch.

    Returns
    -------
    DataFrame with columns ParcelId, 201610, 201611, 201612 and the zero-filled
    201710, 201711, 201712.

    Raises
    ------
    ValueError if `chunk_size` is smaller than 1.
    AssertionError if, after dropping duplicate rows, the number of rows differs
    from the number of rows in `properties`.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    month_models = (('201610', oct_model), ('201611', nov_model), ('201612', dec_model))
    n_rows = properties.shape[0]

    chunks = []
    # Stepping by chunk_size covers the trailing partial chunk as well, so every
    # row goes through exactly the same code path.
    for start in range(0, n_rows, chunk_size):
        rows = properties.iloc[start:start + chunk_size]
        all_feats = full_pipeline.transform(rows)
        chunk = rows[['parcelid']].reset_index(drop=True)
        for month, model in month_models:
            preds = model.predict(all_feats)
            chunk[month] = preds if logy else np.log(preds)
        chunks.append(chunk)

    # Concatenate once at the end rather than re-copying the frame per chunk.
    if chunks:
        submission_df = pd.concat(chunks, ignore_index=True)
    else:
        submission_df = DataFrame(columns=['parcelid', '201610', '201611', '201612'])

    # The 2017 columns are filled with a constant zero.
    submission_df['201710'] = 0
    submission_df['201711'] = 0
    submission_df['201712'] = 0

    submission_df.rename(columns={'parcelid':'ParcelId'}, inplace=True)

    # unit test: one distinct row per property
    submission_df.drop_duplicates(inplace=True)
    assert submission_df.shape[0] == properties.shape[0]

    # write to .csv (values are formatted to 4 significant digits at write time)
    submission_df[['ParcelId','201610','201611','201612',
                  '201710','201711','201712']].to_csv(name + ".gz", index=False, float_format='%.4g', compression='gzip')
    return submission_df

In [5]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Return the mean absolute error of a submission for October, November and December.

    `submission_df` (columns ParcelId, 201610, 201611, 201612) is inner-joined to
    `comparison_df` (columns parcelid, logerror, month) on the parcel id. For each
    month m in 10, 11, 12 the prediction column for that month is compared with
    `logerror` on the joined rows whose `month` equals m.

    Returns
    -------
    (oct_error, nov_error, dec_error, overall_mae). A month without any joined rows
    yields NaN for its own entry but is left out of `overall_mae`, which is the
    row-count-weighted average of the available monthly errors. `overall_mae` is
    NaN only when no month has any rows.
    """
    month_columns = ((10, '201610'), (11, '201611'), (12, '201612'))

    # training error
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']],
                            comparison_df[['parcelid','logerror','month']],
                            left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    weighted_sum = 0.0
    n_weighted = 0
    for month, column in month_columns:
        rows = trainresults[trainresults['month'] == month]
        month_error = (rows[column] - rows['logerror']).abs().mean()
        monthly_errors.append(month_error)
        # An empty month has a NaN mean; multiplying it by its zero row count would
        # still poison the weighted sum, so such months are skipped explicitly.
        if len(rows) and not np.isnan(month_error):
            weighted_sum += month_error * len(rows)
            n_weighted += len(rows)

    overall_mae = weighted_sum / n_weighted if n_weighted else float('nan')
    return (monthly_errors[0], monthly_errors[1], monthly_errors[2], overall_mae)

### Reading in data 

In [6]:
# Root folder of the competition files; every input path below is built from it
# so the data location only has to be changed in one place.
maindir = "/home/anerdi/Desktop/Zillow"

# Alternative training file, kept for reference:
# train_df = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv", parse_dates=['transactiondate'], low_memory=False)
train_df = pd.read_csv(maindir + "/data/traindata20162017.csv.gz", parse_dates=['transactiondate'], low_memory=False)
test_df = pd.read_csv(maindir + "/data/sample_submission.csv", low_memory=False)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", low_memory=False)
# field is named differently in submission: duplicate it under the lower-case
# name so the sample submission can be joined to `properties` on 'parcelid'
test_df['parcelid'] = test_df['ParcelId']

In [7]:
# similar to the1owl
def add_date_features(df):
    """
    Replace the datetime column `transactiondate` with its year, month, day and
    quarter as `transaction_*` columns.

    The frame is modified in place and the same object is returned.
    """
    stamp = df["transactiondate"].dt
    derived = (
        ("transaction_year", stamp.year),
        ("transaction_month", stamp.month),
        ("transaction_day", stamp.day),
        ("transaction_quarter", stamp.quarter),
    )
    for column, values in derived:
        df[column] = values
    # the raw timestamp is no longer needed once its parts are extracted
    df.drop("transactiondate", axis=1, inplace=True)
    return df

In [8]:
# Expand the training transaction dates into year/month/day/quarter columns.
train_df = add_date_features(train_df)
# The training frame is not joined to `properties` here.
# NOTE(review): presumably the training file already carries the property columns -- confirm.
# train_df = train_df.merge(properties, how='left', on='parcelid')
# Attach the property attributes to every parcel listed in the sample submission.
test_df = test_df.merge(properties, how='left', on='parcelid')
print("Train: ", train_df.shape)
print("Test: ", test_df.shape)

Train:  (167888, 66)
Test:  (2985217, 65)


In [9]:
# Integer-encode the listed columns. The encoder is fitted on the test frame and
# then applied to the training frame, so both share one code per level.
# Values are cast to str first, so missing values become a level of their own.
# NOTE(review): transform() raises on levels not seen during fit -- this assumes
# every training level also occurs in test_df; confirm.
for c in ['propertycountylandusecode']:
    label_enc = LabelEncoder()
    test_df[c] = label_enc.fit_transform(test_df[c].astype(str))
    train_df[c] = label_enc.transform(train_df[c].astype(str))

In [10]:
# Replace every missing value in both frames with the sentinel -999
# (some out of range int is a good choice).
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)
# The test rows have no real transaction date: stamp a placeholder so the
# transaction_* columns exist in test_df as well.
test_df['transactiondate'] = pd.Timestamp('2016-12-01')  # Dummy
test_df = add_date_features(test_df)

In [11]:
# Building age relative to 2017.
# This runs after the -999 fill, so a missing yearbuilt ends up as age 3016.
test_df['age'] = 2017 - test_df['yearbuilt']
train_df['age'] = 2017 - train_df['yearbuilt']

In [12]:
# Number of distinct heating-system codes in the test frame (counted after the -999 fill).
test_df['heatingorsystemtypeid'].nunique()

15

### Data Preprocessing Pipeline

In [13]:
# Setup variables considered in the model

# train_feats = ['airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr',
#                'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
#                'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 
#                'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 
#                'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock',
#                'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 
#                'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 
#                'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 
#                'taxamount', 'taxdelinquencyyear', 'censustractandblock', 'transaction_year', 'transaction_month', 
#                'transaction_day', 'transaction_quarter']

# # categorical variables
# cat_atts = ['airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 
#             'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty',
#             'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'taxdelinquencyyear', 
#             'transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter']

# numerical variables: handed to the model through a plain column selector
num_atts = [
    'bedroomcnt', 'calculatedbathnbr', 'age',
    'calculatedfinishedsquarefeet', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft',
    'latitude', 'longitude', 'lotsizesquarefeet', 'roomcnt',
    'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
    'landtaxvaluedollarcnt', 'taxamount',
]

# categorical variables: dummy-coded by the categorical pipeline
cat_atts = [
    'propertycountylandusecode',
    'propertylandusetypeid',
    'regionidzip',
    'buildingqualitytypeid',
    'heatingorsystemtypeid',
    'transaction_quarter',
    'transaction_year',
    'transaction_month',
]

# every variable considered in the model: numerical first, then categorical
train_feats = num_atts + cat_atts

# Dictionary of categorical variables and their default levels: for each
# categorical column, the sorted union of the values found in test and train
cat_dict = dict(
    (att, np.union1d(test_df[att].unique(), train_df[att].unique()))
    for att in cat_atts
)

In [14]:
# Categorical pipeline: a single step from the local `feature_pipelines` module,
# configured with the per-column level dictionary `cat_dict`.
# NOTE(review): judging by its name it selects the categorical columns and
# dummy-codes them against those levels -- confirm in feature_pipelines.py.
cat_pipeline = Pipeline([
        ('select_and_dummify', pipes.DF_Selector_GetDummies(cat_dict)),
    ])

# Numerical pipeline: only selects the numerical columns; imputation is disabled
# (missing values were already replaced by -999 in both frames).
num_pipeline = Pipeline([
        ('selector', pipes.DataFrameSelector(num_atts)),
#         ('imputer', Imputer()),
    ])

# Full pipeline: output of the numerical pipeline followed by the output of the
# categorical pipeline, concatenated column-wise by FeatureUnion.
feature_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline)
    ])

In [15]:
# Fit the pipeline on test_df, i.e. on every parcel of the sample submission
# joined with its property attributes.
feature_pipeline.fit(test_df)

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(desired_cols=['bedroomcnt', 'calculatedbathnbr', 'age', 'calculatedfinishedsquarefeet', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 'longitude', 'lotsizesquarefeet', 'roomcnt', 'numb...   7.,    8.,
          9.,   10.,   11.,   12.]), 'transaction_quarter': array([1, 2, 3, 4])}))]))],
       transformer_weights=None)

In [16]:
# Training design matrix from the fitted pipeline; the regression target is logerror.
X_train = feature_pipeline.transform(train_df)
y_train = train_df.logerror
# Sanity check: both must have the same number of rows.
print(X_train.shape, y_train.shape)

(167888, 717) (167888,)


In [17]:
import gc

In [18]:
# Force a garbage-collection pass before training to release unreferenced objects.
gc.collect()

124

## Training XGB

In [19]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.base import clone

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [20]:
# Keyword arguments passed to XGBRegressor.
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.2,
    'max_depth': 5,
    'subsample': 0.6,
    'reg_lambda': 5,
    'gamma': 0.01,
    'colsample_bytree': 0.7,
    'silent': 1,
}

In [21]:
# (name, unfitted estimator) pairs. The name prefixes the prediction columns and
# the estimator is cloned and re-seeded for every ensemble member.
models = [
    ("xgb",XGBRegressor(**xgb_params))
]

In [22]:
# Parcel ids taken from test_df itself: generate_regression_preds returns its
# predictions indexed by test_df row position, so this guarantees ids and
# predictions line up row by row (and avoids re-reading the properties CSV).
test_predictions = test_df[['parcelid']].reset_index(drop=True)
n_test = test_df.shape[0]

# Number of differently seeded fits averaged per model.
num_ensembles = 5

for current_model_name, current_model in models:
    print("Current model: %s" % current_model_name)

    # Running sums of the predictions for each scoring month.
    y_pred_oct = Series(np.zeros(n_test), name = "{0}_201610".format(current_model_name))
    y_pred_nov = Series(np.zeros(n_test), name = "{0}_201611".format(current_model_name))
    y_pred_dec = Series(np.zeros(n_test), name = "{0}_201612".format(current_model_name))
    for i in range(num_ensembles):
        # get a clone of the model and fit the current training data;
        # the loop counter doubles as the random seed of this ensemble member
        print(i+1)
        reg = clone(current_model)
        reg.set_params(random_state=i)

        print("...fitting model")
        reg.fit(X_train, y_train)

        print("...obtaining predictions on test set")
        # obtain predictions on test set, once per dummy transaction date
        y_pred_oct = y_pred_oct + generate_regression_preds(reg, test_df, model_name="{0}_201610".format(current_model_name),
                                                            transactiondate='2016-10-01')
        y_pred_nov = y_pred_nov + generate_regression_preds(reg, test_df, model_name="{0}_201611".format(current_model_name),
                                                            transactiondate='2016-11-01')
        y_pred_dec = y_pred_dec + generate_regression_preds(reg, test_df, model_name="{0}_201612".format(current_model_name),
                                                            transactiondate='2016-12-01')

        # release the fitted model before the next one is trained
        del reg
        gc.collect()

    # model averaging
    y_pred_oct = y_pred_oct / num_ensembles
    y_pred_nov = y_pred_nov / num_ensembles
    y_pred_dec = y_pred_dec / num_ensembles
    test_predictions = pd.concat([test_predictions, y_pred_oct, y_pred_nov, y_pred_dec], axis=1)

Current model: xgb
1
...fitting model
...obtaining predictions on test set


MemoryError: 

In [81]:
# Preview the parcel ids next to the averaged per-month predictions.
test_predictions.head()

Unnamed: 0,parcelid,xgb_201610,xgb_201611,xgb_201612
0,10754147,0.361151,0.361151,0.361019
1,10759547,0.888566,0.910082,0.909951
2,10843547,0.129308,0.15806,0.130208
3,10859147,0.133904,0.133904,0.134803
4,10879947,0.014055,0.014055,0.014955


In [82]:
# Model whose averaged predictions go into the submission.
model_name = 'xgb'

# One column per 2016 scoring month, keyed by parcel id.
submission_columns = {
    'ParcelId': test_predictions['parcelid'],
    '201610': test_predictions['%s_201610' % model_name],
    '201611': test_predictions['%s_201611' % model_name],
    '201612': test_predictions['%s_201612' % model_name],
}
new_submission_one_stage = DataFrame(submission_columns)

# The 2017 months are filled with a constant zero.
for placeholder_month in ('201710', '201711', '201712'):
    new_submission_one_stage[placeholder_month] = 0

In [83]:
# Monthly and overall mean absolute error of the submission, rounded to four
# decimals, against the training frame.
# mean_absolute_errors reads a 'month' column from train_df; add_date_features
# only creates 'transaction_month'.
# NOTE(review): presumably 'month' is already present in the training file -- confirm.
# current best 0.0653659
mean_absolute_errors(new_submission_one_stage.round(4), train_df)

(0.06258292143861763,
 0.061867634173055894,
 0.0741556066705004,
 0.064786010302037025)

In [84]:
# Write the submission, rounded to four decimals, as a gzip-compressed CSV under
# the project root so the output location follows `maindir`.
new_submission_one_stage.round(4).to_csv(maindir + "/submissions/xgboost.csv.gz",
                     compression='gzip', index=False)