In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Submission Functions 

In [2]:
def generate_regression_preds(reg, model_name='pred_logerror', chunk_size=100000):
    """
    Predict on every row of the notebook-level `properties` frame, chunk by chunk.

    Each chunk of rows is passed through the notebook-level `feature_pipeline`
    and then to `reg.predict`, so only `chunk_size` rows of transformed
    features are materialised at any one time.

    Parameters
    ----------
    reg : object
        Fitted estimator exposing `predict(features)`.
    model_name : str
        Name given to the returned Series.
    chunk_size : int
        Number of `properties` rows transformed and predicted per step.

    Returns
    -------
    pandas.Series
        Predictions named `model_name`, indexed by row position
        (0 .. len(properties) - 1) in `properties`.

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    n_rows = properties.shape[0]
    chunk_preds = []
    # Chunk boundaries come from the actual row count, so the final (possibly
    # shorter) chunk needs no hard-coded start/stop positions.
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        chunk_feats = feature_pipeline.transform(properties.iloc[start:stop])
        chunk_preds.append(Series(reg.predict(chunk_feats), name=model_name,
                                  index=np.arange(start, stop)))

    if not chunk_preds:
        # Empty `properties`: return an empty, correctly named Series.
        return Series([], name=model_name, dtype=float)

    # Concatenate once instead of re-copying the accumulated result every step.
    return pd.concat(chunk_preds)

In [3]:
def generate_submissions(oct_model, nov_model, dec_model, name='new_submission', logy=True,
                         chunk_size=100000):
    """
    Create the submission file for the public leaderboard predictions.

    Three already fitted models, one for each of the predicted time points
    (October, November, December 2016), are required. Every row of the
    notebook-level `properties` frame is transformed with the notebook-level
    `feature_pipeline` and predicted in chunks of `chunk_size` rows.
    (The original body referenced `full_pipeline`, a name this notebook never
    defines; `feature_pipeline` is the pipeline that is actually built here.)

    Parameters
    ----------
    oct_model, nov_model, dec_model : object
        Fitted estimators exposing `predict(features)`.
    name : str
        Output path without extension; the file is written to `name + ".gz"`.
    logy : bool
        If True the model outputs are used as-is; if False `np.log` is applied
        to them. This is applied to every chunk, including the last one.
        NOTE(review): presumably False is meant for models trained on an
        exponentiated target - confirm against the training code.
    chunk_size : int
        Number of `properties` rows transformed and predicted per step.

    Returns
    -------
    pandas.DataFrame
        Columns ParcelId, 201610, 201611, 201612, 201710, 201711, 201712
        (the 2017 columns are filled with 0).

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1.
    AssertionError
        If, after dropping duplicate rows, the row count differs from
        `properties`.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    def _predict(model, feats):
        # Single place where the `logy` switch is honoured.
        preds = model.predict(feats)
        return preds if logy else np.log(preds)

    n_rows = properties.shape[0]
    chunks = []
    for start in range(0, n_rows, chunk_size):
        rows = properties.iloc[start:start + chunk_size]
        all_feats = feature_pipeline.transform(rows)
        chunk = rows[['parcelid']].reset_index(drop=True)
        chunk['201610'] = _predict(oct_model, all_feats)
        chunk['201611'] = _predict(nov_model, all_feats)
        chunk['201612'] = _predict(dec_model, all_feats)
        chunks.append(chunk)

    if chunks:
        # One concat at the end instead of growing the frame inside the loop.
        submission_df = pd.concat(chunks, ignore_index=True)
    else:
        submission_df = DataFrame(columns=['parcelid', '201610', '201611', '201612'])

    # The 2017 time points are not modelled; they are submitted as zeros.
    for col_2017 in ('201710', '201711', '201712'):
        submission_df[col_2017] = 0

    submission_df = submission_df.rename(columns={'parcelid': 'ParcelId'})

    # unit test: after de-duplication there must be exactly one row per properties row
    submission_df = submission_df.drop_duplicates()
    assert submission_df.shape[0] == properties.shape[0], \
        "submission row count does not match properties"

    # write to .csv
    submission_df[['ParcelId', '201610', '201611', '201612',
                   '201710', '201711', '201712']].to_csv(name + ".gz", index=False, float_format='%.4g',
                                                         compression='gzip')
    return submission_df

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Compute the per-month and overall mean absolute error of a submission.

    `submission_df` (columns ParcelId, 201610, 201611, 201612) is joined to
    `comparison_df` (columns parcelid, logerror, month) on the parcel id.
    For each of the months 10, 11 and 12 the matching prediction column is
    compared with `logerror` on the rows of that month.

    Returns a tuple (oct_error, nov_error, dec_error, overall_mae), where
    overall_mae is the row-count-weighted average of the three monthly errors.
    """
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]

    # training error
    joined = pd.merge(submission_df[['ParcelId', '201610', '201611', '201612']],
                      comparison_df[['parcelid', 'logerror', 'month']],
                      left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    monthly_counts = []
    for month_number, pred_col in month_columns:
        in_month = joined['month'] == month_number
        month_rows = joined[in_month]
        monthly_errors.append(abs(month_rows[pred_col] - month_rows['logerror']).mean())
        monthly_counts.append(in_month.sum())

    oct_error, nov_error, dec_error = monthly_errors
    n_oct, n_nov, n_dec = monthly_counts

    n_scored = joined['month'].isin([10, 11, 12]).sum()
    overall_mae = (oct_error * n_oct + nov_error * n_nov + dec_error * n_dec) / n_scored
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [5]:
maindir = "/home/anerdi/Desktop/Zillow"

# Training labels; the columns used in this notebook are parcelid, logerror
# and transactiondate.
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse each 'YYYY-MM-DD' transaction date once, then derive both calendar
# features from the parsed value.
parsed_dates = [datetime.datetime.strptime(date_text, '%Y-%m-%d')
                for date_text in logerror['transactiondate']]
logerror['weeknumber'] = [parsed.isocalendar()[1] for parsed in parsed_dates]  # ISO week number
logerror['month'] = [parsed.month for parsed in parsed_dates]

# Property attributes for every parcel (this is the frame predictions are made on).
# NOTE(review): the saved output below this cell looks like the tail of a pandas
# warning emitted by this read - confirm, and consider passing explicit dtypes.
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Engineered ratio features, each defined as numerator column / denominator column:
#   N-LivingAreaProp : finished living area relative to lot size
#   N-ValueProp      : structure tax value relative to land tax value
#   N-ValueRatio     : total tax value relative to the tax amount
for ratio_name, numerator_col, denominator_col in (
        ('N-LivingAreaProp', 'calculatedfinishedsquarefeet', 'lotsizesquarefeet'),
        ('N-ValueProp', 'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt'),
        ('N-ValueRatio', 'taxvaluedollarcnt', 'taxamount'),
):
    properties[ratio_name] = properties[numerator_col] / properties[denominator_col]

# Pool: sum of the two pool-type columns, with missing values counted as 0
pool_type_2 = properties['pooltypeid2'].fillna(0)
pool_type_7 = properties['pooltypeid7'].fillna(0)
properties['Pool'] = (pool_type_2 + pool_type_7).astype(int)

In [7]:
# Attach the training labels to the property attributes (join on parcel id).
data = properties.merge(logerror[['parcelid', 'logerror', 'month']], on='parcelid')

# One sample-weight column per monthly model: rows sold in that model's month
# get weight 1.25, all other rows weight 1.
for target_month, month_tag in ((10, 'oct'), (11, 'nov'), (12, 'dec')):
    data['wts_xgb_%s' % month_tag] = np.where(data['month'] == target_month, 1.25, 1)

### Data Preprocessing Pipeline

In [8]:
# Variables considered in the model

# Numerical variables (the three "N-" entries are the engineered ratio columns)
num_atts = [
    'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr',
    'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
    'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50',
    'finishedsquarefeet6', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt',
    'garagetotalsqft', 'latitude', 'longitude', 'lotsizesquarefeet',
    'poolcnt', 'poolsizesum', 'censustractandblock', 'roomcnt',
    'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yardbuildingsqft26',
    'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
    'landtaxvaluedollarcnt', 'taxamount', 'N-ValueRatio', 'N-LivingAreaProp',
    'N-ValueProp',
]

# Categorical variables
cat_atts = [
    'airconditioningtypeid',
    'architecturalstyletypeid',
    'buildingclasstypeid',
    'heatingorsystemtypeid',
    'pooltypeid10',
    'pooltypeid2',
    'pooltypeid7',
    'propertylandusetypeid',
    'regionidcounty',
    'storytypeid',
    'typeconstructiontypeid',
    'yearbuilt',
    'fireplaceflag',
    'taxdelinquencyflag',
]

# Categorical variables mapped to their default levels.
# Most lists start with -1; regionidcounty is the only one without it.
# NOTE(review): -1 is presumably the level used for missing values - confirm
# against pipes.DF_Selector_GetDummies.
cat_dict = {
    'airconditioningtypeid': [-1, *range(1, 14)],
    'architecturalstyletypeid': [-1, *range(1, 28)],
    'buildingclasstypeid': [-1, *range(1, 6)],
    'heatingorsystemtypeid': [-1, *range(1, 26)],
    'pooltypeid10': [-1, 0, 1],
    'pooltypeid2': [-1, 0, 1],
    'pooltypeid7': [-1, 0, 1],
    'propertylandusetypeid': [
        -1, 31, 46, 47, 246, 247, 248,
        260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271,
        273, 274, 275, 276, 279, 290, 291,
    ],
    'regionidcounty': [2061, 3101, 1286],
    'storytypeid': [-1, *range(1, 36)],
    'typeconstructiontypeid': [-1, *range(1, 19)],
    'yearbuilt': [-1, *range(1885, 2018)],
    'fireplaceflag': [-1, 'True', 'False'],
    'taxdelinquencyflag': [-1, 'Y', 'N'],
}

In [9]:
# Categorical branch: a single step that receives the {column: levels} mapping.
# NOTE(review): DF_Selector_GetDummies lives in the local feature_pipelines
# module; judging by its name it selects the columns and dummy-encodes them - confirm.
cat_pipeline = Pipeline(steps=[
    ('select_and_dummify', pipes.DF_Selector_GetDummies(cat_dict)),
])

# Numerical branch: pick the numerical columns, then impute missing values
# with Imputer's default settings.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in newer
# scikit-learn releases (replaced by sklearn.impute.SimpleImputer) - confirm
# the installed version before re-running.
num_pipeline = Pipeline(steps=[
    ('selector', pipes.DataFrameSelector(num_atts)),
    ('imputer', Imputer()),
])

# Full feature pipeline: numerical and categorical branches combined by a FeatureUnion.
feature_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [10]:
# Fit the pipeline on `data`, i.e. the properties rows that were joined to the
# training labels - not on the entire properties dataframe.
feature_pipeline.fit(data)

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(desired_cols=['bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet13', 'fi...013, 2014, 2015, 2016, 2017], 'pooltypeid7': [-1, 0, 1], 'taxdelinquencyflag': [-1, 'Y', 'N']}))]))],
       transformer_weights=None)

In [11]:
import gc

In [12]:
# Run a garbage-collection pass before training; the number shown as the cell
# output is gc.collect()'s return value.
gc.collect()

20

## Splitting the Training Set

In [13]:
# Row positions (for use with .iloc) of the two training subsets:
# "over" holds rows with logerror >= 0, "under" holds rows with logerror < 0.
ix_overestimated = np.flatnonzero((data['logerror'] >= 0).values)
ix_underestimated = np.flatnonzero((data['logerror'] < 0).values)
data_indices = dict(over=ix_overestimated, under=ix_underestimated)

In [14]:
# Sanity check: the two subsets together must cover every training row.
# A row with a NaN logerror satisfies neither `>= 0` nor `< 0`, so it would
# make this assertion fail.
assert ix_overestimated.shape[0] + ix_underestimated.shape[0] == data.shape[0]

## Training XGB

In [16]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.base import clone

import warnings
warnings.filterwarnings("ignore")

In [21]:
# One (name, estimator) pair per predicted month. All three estimators share
# the same hyper-parameters; the name selects the matching `wts_<name>`
# sample-weight column when the models are trained.
models = [
    (model_name, XGBRegressor(random_state=42, n_estimators=400, max_depth=3, learning_rate=0.02,
                              subsample= 1, colsample_bytree= 1))
    for model_name in ("xgb_oct", "xgb_nov", "xgb_dec")
]

In [22]:
# Start from the parcel ids already held in memory instead of re-reading the
# whole properties CSV just to extract one column. `properties` comes straight
# from read_csv, so its row index matches the positional index of the
# prediction Series returned by generate_regression_preds.
test_predictions = properties[['parcelid']].copy()
prediction_columns = [test_predictions]

for current_model_name, current_model in models:
    print("Current model: %s" % current_model_name)

    # Fit one regressor on the "over" rows and one on the "under" rows.
    for type_of_zestimate, ix in data_indices.items():
        # training rows of the current subset
        current_traindata = data.iloc[ix]

        # fit a fresh clone so the template estimator in `models` stays unfitted
        reg = clone(current_model)
        reg.fit(feature_pipeline.transform(current_traindata), current_traindata['logerror'],
                sample_weight=current_traindata['wts_%s' % current_model_name])

        # predictions for every parcel, named "<model>_<over|under>"
        prediction_columns.append(
            generate_regression_preds(reg, model_name="%s_%s" % (current_model_name, type_of_zestimate)))

# Assemble all prediction columns in a single concat rather than one per fit.
test_predictions = pd.concat(prediction_columns, axis=1)

Current model: xgb_oct
Current model: xgb_nov
Current model: xgb_dec


In [23]:
# Preview: parcelid plus one "<model>_<over|under>" prediction column per fitted regressor.
test_predictions.head()

Unnamed: 0,parcelid,xgb_oct_under,xgb_oct_over,xgb_nov_under,xgb_nov_over,xgb_dec_under,xgb_dec_over
0,10754147,-0.198943,0.087975,-0.209733,0.085679,-0.193651,0.087517
1,10759547,-0.11094,0.081205,-0.114238,0.079867,-0.110725,0.081849
2,10843547,-0.269981,0.624938,-0.287432,0.675193,-0.259808,0.618806
3,10859147,-0.224659,0.487692,-0.223898,0.5115,-0.19966,0.479048
4,10879947,-0.282351,0.271146,-0.28656,0.225149,-0.333289,0.256966


In [25]:
# Persist the per-model over/under predictions. `index=False` is not passed,
# so the row index is written as an extra unnamed first column.
test_predictions.to_csv("/home/anerdi/Desktop/Zillow/twostagemodel/XGB-two-stage-preds.csv")

In [26]:
# Load the per-parcel overestimate probabilities. This file is not produced
# anywhere in this notebook.
# NOTE(review): presumably written by a separate classification notebook - confirm.
overestimate_probabilities = pd.read_csv("/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs.csv")

In [27]:
# Preview the loaded probabilities (parcelid, overestimate_prob).
overestimate_probabilities.head()

Unnamed: 0,parcelid,overestimate_prob
0,10754147,0.509328
1,10759547,0.507312
2,10843547,1.0
3,10859147,0.689618
4,10879947,0.543763


#### Merging Results

In [28]:
# Attach overestimate_prob to the predictions by parcelid. pd.merge defaults to
# an inner join, so parcels missing from either frame are dropped.
# This overwrites `test_predictions`: re-running the cell merges a second time
# and yields suffixed overestimate_prob_x / overestimate_prob_y columns.
test_predictions = pd.merge(test_predictions, overestimate_probabilities, on='parcelid')

In [29]:
# Blend the two regressors of each monthly model into one prediction:
#   prediction = over_pred * P(overestimate) + under_pred * (1 - P(overestimate))
for current_model_name, current_model in models:
    prob_over = test_predictions['overestimate_prob']
    over_pred = test_predictions['%s_over' % current_model_name]
    under_pred = test_predictions['%s_under' % current_model_name]
    test_predictions[current_model_name] = over_pred * prob_over + under_pred * (1 - prob_over)

In [30]:
# Preview the blended xgb_oct / xgb_nov / xgb_dec columns next to their inputs.
test_predictions.head()

Unnamed: 0,parcelid,xgb_oct_under,xgb_oct_over,xgb_nov_under,xgb_nov_over,xgb_dec_under,xgb_dec_over,overestimate_prob,xgb_oct,xgb_nov,xgb_dec
0,10754147,-0.198943,0.087975,-0.209733,0.085679,-0.193651,0.087517,0.509328,-0.052808,-0.059271,-0.050444
1,10759547,-0.11094,0.081205,-0.114238,0.079867,-0.110725,0.081849,0.507312,-0.013462,-0.015767,-0.01303
2,10843547,-0.269981,0.624938,-0.287432,0.675193,-0.259808,0.618806,1.0,0.624938,0.675193,0.618806
3,10859147,-0.224659,0.487692,-0.223898,0.5115,-0.19966,0.479048,0.689618,0.266591,0.283246,0.268389
4,10879947,-0.282351,0.271146,-0.28656,0.225149,-0.333289,0.256966,0.543763,0.01862,-0.008311,-0.01233


In [31]:
# Submission frame: parcel id plus the three blended 2016 predictions.
new_submission = DataFrame({
    'ParcelId': test_predictions['parcelid'],
    '201610': test_predictions['xgb_oct'],
    '201611': test_predictions['xgb_nov'],
    '201612': test_predictions['xgb_dec'],
})

# The 2017 time points are not modelled here; they are filled with 0.
for month_2017 in ('201710', '201711', '201712'):
    new_submission[month_2017] = 0

In [32]:
# Monthly and overall MAE of the two-stage XGB blend on the labelled rows in
# `data`, using predictions rounded to 4 decimals as in the file written below.
# NOTE(review): the original comment read "two-stage with rf" - presumably the
# overestimate probabilities come from a random forest; confirm.
mean_absolute_errors(new_submission.round(4), data)

(0.06357010247136825,
 0.062275520262869734,
 0.07545290396779751,
 0.065712491219854813)

In [33]:
# Baseline for comparison: a previously saved submission.
# NOTE(review): the variable is named `RF` and the original comment read
# "previous RF", but the file loaded is XGB_600.gz - confirm which model this is.
RF = pd.read_csv("/home/anerdi/Desktop/Zillow/submissions/XGB_600.gz", compression="gzip")
mean_absolute_errors(RF, data)

(0.06211727600763514,
 0.06136520946002188,
 0.07414289916043706,
 0.064404712807773379)

In [34]:
# Write the final submission, rounded to 4 decimals, as a gzip-compressed CSV
# without the row index.
new_submission.round(4).to_csv("/home/anerdi/Desktop/Zillow/submissions/two_stage_xgb.csv.gz", index=False,
                     compression='gzip')