In [1]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Submission Functions 

In [2]:
def generate_regression_preds(reg,X_test, model_name='pred_logerror', transactiondate='2016-12-01'):
    X_test['transactiondate'] = pd.Timestamp(transactiondate)  # Dummy
    X_test = add_date_features(X_test)
    reg_preds = None
    for i in range(int(properties.shape[0] / 100000)):   
        # get current test features
        current_test_feats = X_test.iloc[i*100000:(i+1)*100000]

        # predict on current test obs
        current_preds = Series(reg.predict(current_test_feats), name=model_name,
                              index = np.arange(i*100000,(i+1)*100000))

        if reg_preds is not None:
            reg_preds = pd.concat([reg_preds, current_preds])
        else:
            reg_preds = current_preds

    #  fencepost problem
    current_test_feats = X_test.iloc[2900000:]
    current_preds = Series(reg.predict(current_test_feats), name=model_name,
                          index = np.arange(2900000,2985217))
    reg_preds = pd.concat([reg_preds, current_preds])
    return reg_preds

In [3]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    This function takes a submission entry for public leaderboard, and returns
    the training error for each month.
    """
    # training error
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']], comparison_df[['parcelid','logerror','month']],
                           left_on='ParcelId', right_on='parcelid')
    oct_error = abs(trainresults[trainresults['month'] == 10]['201610'] 
                    - trainresults[trainresults['month'] == 10]['logerror']).mean()
    nov_error = abs(trainresults[trainresults['month'] == 11]['201611'] 
                    - trainresults[trainresults['month'] == 11]['logerror']).mean()
    dec_error = abs(trainresults[trainresults['month'] == 12]['201612'] 
                    - trainresults[trainresults['month'] == 12]['logerror']).mean()
    overall_mae = (oct_error*(trainresults['month'] == 10).sum() + nov_error*(trainresults['month'] == 11).sum() 
                        + dec_error*(trainresults['month'] == 12).sum()) / (trainresults['month'].isin([10,11,12])).sum()
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [4]:
maindir = "/home/anerdi/Desktop/Zillow"

# train_df = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv", parse_dates=['transactiondate'], low_memory=False)
train_df = pd.read_csv("/home/anerdi/Desktop/Zillow/data/traindata20162017.csv.gz", parse_dates=['transactiondate'], low_memory=False)
test_df = pd.read_csv(maindir + "/data/sample_submission.csv", low_memory=False)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", low_memory=False)
# field is named differently in submission
test_df['parcelid'] = test_df['ParcelId']

In [5]:
del train_df['Unnamed: 0']

### Data  Preprocessing Pipeline

In [6]:
# similar to the1owl
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

In [7]:
train_df = add_date_features(train_df)
# train_df = train_df.merge(properties, how='left', on='parcelid')
test_df = test_df.merge(properties, how='left', on='parcelid')
print("Train: ", train_df.shape)
print("Test: ", test_df.shape)

Train:  (167888, 65)
Test:  (2985217, 65)


### 0.a) Remove missing data fields

In [8]:
missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print("We exclude: %s" % exclude_missing)
print(len(exclude_missing))

We exclude: ['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag']
13


### 0.b) Remove data that is always the same

In [9]:
# exclude where we only have one unique value :D
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print("We exclude: %s" % exclude_unique)
print(len(exclude_unique))

We exclude: ['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'taxdelinquencyflag']
9


### 1.a) Define training features

In [10]:
exclude_other = ['parcelid', 'logerror','year','month']  # for indexing/training only
# do not know what this is LARS, 'SHCG' 'COR2YY' 'LNR2RPD-R3' ?!?
exclude_other.append('propertyzoningdesc')
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
print("We use these for training: %s" % train_features)
print(len(train_features))

We use these for training: ['airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyyear', 'censustractandblock', 'transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter']
43


### 1.b) Define which of these training features are categorical

In [11]:
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
       and not 'sqft' in c \
       and not 'cnt' in c \
       and not 'nbr' in c \
       and not 'number' in c:
        cat_feature_inds.append(i)
        
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Cat features are: ['airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'taxdelinquencyyear', 'transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter']


### 1.c) Fill missing values

In [12]:
# some out of range int is a good choice
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

## Splitting the Training Set

In [13]:
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

(167888, 43) (167888,)


In [14]:
ix_overestimated = np.where(y_train >= 0)[0]
ix_underestimated = np.where(y_train < 0)[0]
data_indices = {"over": ix_overestimated, "under": ix_underestimated}
data_indices = {"over": ix_overestimated, "under": ix_underestimated}

In [15]:
assert ix_overestimated.shape[0] + ix_underestimated.shape[0] == X_train.shape[0]

In [16]:
ix_overestimated.shape

(94505,)

In [17]:
ix_underestimated.shape

(73383,)

In [18]:
test_df['transactiondate'] = pd.Timestamp('2016-12-01')  # Dummy
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

(2985217, 43)


## Training CatBoost

In [19]:
from catboost import CatBoostRegressor
from sklearn.base import clone

import warnings
warnings.filterwarnings("ignore")

In [20]:
models = [
    ("catboost1", CatBoostRegressor(
                iterations=200, learning_rate=0.03,
                depth=6, l2_leaf_reg=3,
                loss_function='MAE',
                eval_metric='MAE')
    ),
    ("catboost2", CatBoostRegressor(
                iterations=200, learning_rate=0.05,
                depth=5, l2_leaf_reg=5,
                loss_function='MAE',
                eval_metric='MAE')
    ),
    ("catboost3", CatBoostRegressor(
                iterations=200, learning_rate=0.05,
                depth=5, l2_leaf_reg=7,
                loss_function='MAE',
                eval_metric='MAE')
    ),
]

### One-stage model

In [21]:
test_predictions_one_stage = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

for pair in models:
    current_model_name,current_model = pair
    print("Current model: %s" % current_model_name)
    
    num_ensembles = 5
    y_pred_oct = Series(np.zeros(2985217), name = "{0}_201610".format(current_model_name))
    y_pred_nov = Series(np.zeros(2985217), name = "{0}_201611".format(current_model_name))
    y_pred_dec = Series(np.zeros(2985217), name = "{0}_201612".format(current_model_name))
    for i in range(num_ensembles):            
        # get a clone of the model and fit the current training data
        print(i+1)
        reg = clone(current_model)
        reg.set_params(random_seed=i)

        print("...fitting model")
        reg.fit(X_train, y_train,cat_features=cat_feature_inds)

        print("...obtaining predictions on test set")
        # obtain predictions on test set  
        y_pred_oct = y_pred_oct + generate_regression_preds(reg, X_test, model_name="{0}_201610".format(current_model_name),
                                                            transactiondate='2016-10-01')
        y_pred_nov = y_pred_nov + generate_regression_preds(reg, X_test, model_name="{0}_201611".format(current_model_name), 
                                                            transactiondate='2016-11-01')
        y_pred_dec = y_pred_dec + generate_regression_preds(reg, X_test, model_name="{0}_201612".format(current_model_name),
                                                            transactiondate='2016-12-01')

    # model averaging    
    y_pred_oct = y_pred_oct / num_ensembles
    y_pred_nov = y_pred_nov / num_ensembles
    y_pred_dec = y_pred_dec / num_ensembles
    test_predictions_one_stage = pd.concat([test_predictions_one_stage, y_pred_oct, y_pred_nov, y_pred_dec], axis=1)

Current model: catboost2
1
...fitting model
...obtaining predictions on test set


In [39]:
cat_feats_importance = sorted([(-reg.feature_importances_[ix], train_features[ix]) for ix in cat_feature_inds])

In [40]:
cat_feats_importance

[(-7.6865824404776255, 'transaction_month'),
 (-7.294274950257516, 'propertycountylandusecode'),
 (-5.818644878328299, 'regionidzip'),
 (-3.6599379203024918, 'yearbuilt'),
 (-3.403491862177784, 'propertylandusetypeid'),
 (-3.1541412521002385, 'regionidneighborhood'),
 (-2.897418110866512, 'regionidcity'),
 (-1.9124589847592146, 'taxdelinquencyyear'),
 (-1.8406540308430321, 'buildingqualitytypeid'),
 (-1.6357444772830403, 'transaction_quarter'),
 (-1.5647768242706777, 'heatingorsystemtypeid'),
 (-1.4587876368194137, 'transaction_day'),
 (-1.2330244313233854, 'regionidcounty'),
 (-1.0268083373933814, 'airconditioningtypeid'),
 (-0.9556557314257367, 'transaction_year'),
 (-0.8917645807364831, 'assessmentyear'),
 (-0.5437287387925018, 'fips')]

In [28]:
model_name = 'catboost3'
new_submission_one_stage = DataFrame({'ParcelId': test_predictions_one_stage['parcelid'],
                           '201610':test_predictions_one_stage['%s_201610' % model_name],
                           '201611':test_predictions_one_stage['%s_201611' % model_name],
                           '201612':test_predictions_one_stage['%s_201612' % model_name],
})
new_submission_one_stage['201710'] = 0
new_submission_one_stage['201711'] = 0
new_submission_one_stage['201712'] = 0

In [29]:
mean_absolute_errors(new_submission_one_stage.round(4), train_df)

(0.0615890496282902,
 0.059950000000000024,
 0.07292461184588844,
 0.063546394287052249)

In [31]:
# current best
mean_absolute_errors(new_submission_one_stage.round(4), train_df)

(0.06160429977898345,
 0.06000580503833507,
 0.07284450833812528,
 0.063550901428236994)

In [30]:
new_submission_one_stage.round(4).to_csv("/home/anerdi/Desktop/Zillow/submissions/catboost3.csv.gz",
                     compression='gzip', index=False)

### Two-stage model

In [43]:
test_predictions = pd.read_csv(maindir + "/data/properties_2016_with_2017_tax.csv", usecols=['parcelid'])

for pair in models:
    current_model_name,current_model = pair
    print("Current model: %s" % current_model_name)
    
    for key,val in data_indices.items():
        type_of_zestimate, ix = key, val

        # preprocess current training data
        current_train = X_train.iloc[ix,]
        current_y = y_train.iloc[ix,]

        for month in [10]:            
            print("%s_%d" % (type_of_zestimate, month))
            
            num_ensembles = 5
            y_pred = Series(np.zeros(2985217), name = "%s_%d_%s" % (current_model_name, month,type_of_zestimate))
            for i in range(num_ensembles):            
                # get a clone of the model and fit the current training data
                print(i+1)
                reg = clone(current_model)
                reg.set_params(random_seed=i)
                print("...fitting model")
                reg.fit(current_train, current_y,cat_features=cat_feature_inds)
                print("...obtaining predictions on test set")
                # obtain predictions on test set
                y_pred = y_pred + generate_regression_preds(reg, model_name="%s_%d_%s" % (current_model_name,
                                                             month,type_of_zestimate), month = month)
            y_pred = y_pred / num_ensembles
            test_predictions = pd.concat([test_predictions, y_pred], axis=1)

Current model: catboost
over_10
1
...fitting model
...obtaining predictions on test set
2
...fitting model
...obtaining predictions on test set
3
...fitting model
...obtaining predictions on test set
4
...fitting model
...obtaining predictions on test set
5
...fitting model
...obtaining predictions on test set
under_10
1
...fitting model
...obtaining predictions on test set
2
...fitting model
...obtaining predictions on test set
3
...fitting model
...obtaining predictions on test set
4
...fitting model
...obtaining predictions on test set
5
...fitting model
...obtaining predictions on test set


In [44]:
test_predictions.head()

Unnamed: 0,parcelid,catboost_10_over,catboost_10_under
0,10754147,0.086996,-0.071177
1,10759547,0.085959,-0.07101
2,10843547,0.093681,-0.111239
3,10859147,0.078073,-0.096214
4,10879947,0.062657,-0.090571


In [24]:
test_predictions.to_csv("/home/anerdi/Desktop/Zillow/twostagemodel/catboost-two-stage-preds.csv")

In [46]:
overestimate_probabilities = pd.read_csv("/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_ann_rfs_xgbs_lgbms_20162017.csv.gz")
overestimate_probabilities.rename(columns={'stacked_pred':'overestimate_prob'}, inplace=True)

In [47]:
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,xgb1_overestimate_prob,xgb2_overestimate_prob,lgbm1_overestimate_prob,lgbm2_overestimate_prob,overestimate_prob
0,10754147,0.466125,0.497368,0.560124,0.462231,0.479722,0.511108,0.650032,0.515646
1,10759547,0.412405,0.500739,0.420662,0.525333,0.527149,0.543325,0.646905,0.553857
2,10843547,0.527465,0.636537,0.535855,0.411728,0.52214,0.544722,0.501281,0.46654
3,10859147,0.665014,0.648656,0.456073,0.580416,0.570832,0.590595,0.576605,0.581211
4,10879947,0.477125,0.524182,0.483156,0.514524,0.50993,0.515008,0.404796,0.483875


#### Merging Results

In [48]:
test_predictions = pd.merge(test_predictions, overestimate_probabilities, on='parcelid')

In [49]:
for pair in models:
    current_model_name, current_model = pair
    # combine over and under to get prediction
    for month in [10]:
        test_predictions['{0}_{1}'.format(current_model_name, month)] = (test_predictions['%s_%d_over' % (current_model_name, month)]*test_predictions['overestimate_prob'] 
                + test_predictions['%s_%d_under' % (current_model_name, month)]*(1 - test_predictions['overestimate_prob']))

In [50]:
test_predictions.head()

Unnamed: 0,parcelid,catboost_10_over,catboost_10_under,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,xgb1_overestimate_prob,xgb2_overestimate_prob,lgbm1_overestimate_prob,lgbm2_overestimate_prob,overestimate_prob,catboost_10
0,10754147,0.086996,-0.071177,0.466125,0.497368,0.560124,0.462231,0.479722,0.511108,0.650032,0.515646,0.010384
1,10759547,0.085959,-0.07101,0.412405,0.500739,0.420662,0.525333,0.527149,0.543325,0.646905,0.553857,0.015928
2,10843547,0.093681,-0.111239,0.527465,0.636537,0.535855,0.411728,0.52214,0.544722,0.501281,0.46654,-0.015636
3,10859147,0.078073,-0.096214,0.665014,0.648656,0.456073,0.580416,0.570832,0.590595,0.576605,0.581211,0.005083
4,10879947,0.062657,-0.090571,0.477125,0.524182,0.483156,0.514524,0.50993,0.515008,0.404796,0.483875,-0.016428


In [51]:
model_name = 'catboost'
new_submission = DataFrame({'ParcelId': test_predictions['parcelid'],
                           '201610':test_predictions['%s_10' % model_name],
                           '201611':test_predictions['%s_10' % model_name],
                           '201612':test_predictions['%s_10' % model_name],
})
new_submission['201710'] = 0
new_submission['201711'] = 0
new_submission['201712'] = 0

In [52]:
new_submission.head()

Unnamed: 0,201610,201611,201612,ParcelId,201710,201711,201712
0,0.010384,0.010384,0.010384,10754147,0,0,0
1,0.015928,0.015928,0.015928,10759547,0,0,0
2,-0.015636,-0.015636,-0.015636,10843547,0,0,0
3,0.005083,0.005083,0.005083,10859147,0,0,0
4,-0.016428,-0.016428,-0.016428,10879947,0,0,0


In [53]:
train_df['month'] = train_df['transaction_month']

In [54]:
# two-stage with xgb
mean_absolute_errors(new_submission.round(4), train_df)

(0.06220480208961224,
 0.06115547645125963,
 0.07376003450258775,
 0.064332931397799151)

In [55]:
# current best
mean_absolute_errors(new_submission_one_stage.round(4), train_df)

(0.06168199718706048,
 0.060239156626506224,
 0.07304933870040252,
 0.063687754624209822)

In [41]:
# current best
mean_absolute_errors(new_submission.round(4), train_df)

(0.06174942736588318,
 0.06084288061336259,
 0.07342547441058092,
 0.063932673846874372)

In [57]:
new_submission.round(4).to_csv("/home/anerdi/Desktop/Zillow/submissions/two_stage_stage1_stacked_annrfsxgbs_stage2_catboost_201617.csv.gz", index=False,
                     compression='gzip')