In [2]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import feature_pipelines as pipes

### Submission Functions 

In [24]:
def generate_regression_preds(reg, model_name='pred_logerror', month=-1,
                              test_features=None, chunk_size=100000):
    """
    Predict on the test features in fixed-size chunks and return one Series.

    Parameters
    ----------
    reg : fitted estimator exposing ``predict``.
    model_name : name given to the returned Series.
    month : not used by the prediction; kept so callers that pass it keep working.
    test_features : DataFrame to predict on. Defaults to the notebook-level ``X_test``.
    chunk_size : number of rows handed to ``reg.predict`` per call.

    Returns
    -------
    Series named ``model_name`` holding one prediction per row, indexed
    positionally (0 .. n_rows - 1) in the row order of the test features.

    Raises
    ------
    ValueError if ``chunk_size`` is not positive.
    """
    if test_features is None:
        test_features = X_test
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    n_rows = test_features.shape[0]
    chunk_preds = []
    # Stepping by chunk_size and clamping the end also covers the final partial chunk,
    # so no row count needs to be hard-coded for the tail.
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        chunk_preds.append(np.asarray(reg.predict(test_features.iloc[start:stop])))

    if chunk_preds:
        values = np.concatenate(chunk_preds)
    else:
        values = np.array([], dtype=float)
    return Series(values, name=model_name, index=np.arange(n_rows))

In [4]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Score a leaderboard-style submission against known log errors.

    The submission is joined to ``comparison_df`` on the parcel id; for each
    of October, November and December the matching prediction column
    ('201610', '201611', '201612') is compared with ``logerror`` on the rows
    whose ``month`` equals that month.

    Returns a tuple ``(oct_error, nov_error, dec_error, overall_mae)`` where
    the last entry is the row-count-weighted mean of the three monthly errors.
    """
    merged = pd.merge(
        submission_df[['ParcelId', '201610', '201611', '201612']],
        comparison_df[['parcelid', 'logerror', 'month']],
        left_on='ParcelId', right_on='parcelid')

    def _monthly_mae(month_number, prediction_column):
        # Mean absolute gap between prediction and truth for one month's rows.
        month_rows = merged[merged['month'] == month_number]
        return abs(month_rows[prediction_column] - month_rows['logerror']).mean()

    oct_error = _monthly_mae(10, '201610')
    nov_error = _monthly_mae(11, '201611')
    dec_error = _monthly_mae(12, '201612')

    n_oct = (merged['month'] == 10).sum()
    n_nov = (merged['month'] == 11).sum()
    n_dec = (merged['month'] == 12).sum()
    n_scored = merged['month'].isin([10, 11, 12]).sum()

    overall_mae = (oct_error*n_oct + nov_error*n_nov + dec_error*n_dec) / n_scored
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [5]:
# Root of the local Zillow project; every input file below is read from beneath it.
maindir = "/home/anerdi/Desktop/Zillow"

# Training transactions. transactiondate is parsed as a datetime because
# add_date_features reads it through the .dt accessor.
# NOTE(review): the file path is restored from the previously commented-out call — confirm.
train_df = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv",
                       parse_dates=['transactiondate'], low_memory=False)
test_df = pd.read_csv(maindir + "/data/sample_submission.csv", low_memory=False)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", low_memory=False)
# field is named differently in submission
test_df['parcelid'] = test_df['ParcelId']

### Data Preprocessing Pipeline

In [6]:
# similar to the1owl
def add_date_features(df):
    """
    Expand ``transactiondate`` into year / month / day / quarter columns.

    The frame is modified in place: the four ``transaction_*`` columns are
    appended and ``transactiondate`` itself is dropped. The same frame object
    is returned for convenience.
    """
    date_parts = df["transactiondate"].dt
    derived_columns = (
        ("transaction_year", "year"),
        ("transaction_month", "month"),
        ("transaction_day", "day"),
        ("transaction_quarter", "quarter"),
    )
    for column_name, part in derived_columns:
        df[column_name] = getattr(date_parts, part)
    df.drop(["transactiondate"], axis=1, inplace=True)
    return df

In [7]:
# Derive the date columns on the training rows, then attach the property
# attributes to both frames with a left join on parcelid.
train_df = add_date_features(train_df).merge(properties, how='left', on='parcelid')
test_df = test_df.merge(properties, how='left', on='parcelid')
for label, frame in (("Train: ", train_df), ("Test: ", test_df)):
    print(label, frame.shape)

Train:  (90275, 63)
Test:  (2985217, 65)


### 0.a) Remove missing data fields

In [8]:
missing_perc_thresh = 0.98
num_rows = train_df.shape[0]
# Share of missing entries per column; columns above the threshold are dropped from training.
missing_fractions = train_df.isnull().sum() / float(num_rows)
exclude_missing = missing_fractions[missing_fractions > missing_perc_thresh].index.tolist()
print("We exclude: %s" % exclude_missing)
print(len(exclude_missing))

We exclude: ['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag', 'taxdelinquencyflag', 'taxdelinquencyyear']
15


### 0.b) Remove data that is always the same

In [9]:
# A column is excluded when, ignoring missing entries, it holds exactly one distinct value.
exclude_unique = [
    col for col in train_df.columns
    if len(train_df[col].unique()) - int(train_df[col].isnull().any()) == 1
]
print("We exclude: %s" % exclude_unique)
print(len(exclude_unique))

We exclude: ['transaction_year', 'buildingclasstypeid', 'decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'assessmentyear', 'taxdelinquencyflag']
12


### 1.a) Define training features

In [10]:
# parcelid and logerror are for indexing/training only; propertyzoningdesc is left out
# because its codes (e.g. 'LARS', 'SHCG', 'COR2YY', 'LNR2RPD-R3') are not understood.
exclude_other = ['parcelid', 'logerror', 'propertyzoningdesc']
excluded_columns = set(exclude_missing) | set(exclude_other) | set(exclude_unique)
train_features = [col for col in train_df.columns if col not in excluded_columns]
print("We use these for training: %s" % train_features)
print(len(train_features))

We use these for training: ['transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxamount', 'censustractandblock']
40


### 1.b) Define which of these training features are categorical

In [11]:
cat_unique_thresh = 1000
# Columns whose name contains one of these fragments are never treated as categorical.
numeric_name_markers = ('sqft', 'cnt', 'nbr', 'number')
# Positions (within train_features) of the columns with fewer distinct values than the threshold.
cat_feature_inds = [
    position for position, feature in enumerate(train_features)
    if len(train_df[feature].unique()) < cat_unique_thresh
    and not any(marker in feature for marker in numeric_name_markers)
]

print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Cat features are: ['transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt']


### 1.c) Fill missing values

In [12]:
# Missing values are replaced in place with -999, an integer chosen to be out of range.
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

## Splitting the Training Set

In [13]:
# Target first, then the feature matrix restricted to the selected training columns.
y_train = train_df['logerror']
X_train = train_df.loc[:, train_features]
print(X_train.shape, y_train.shape)

(90275, 40) (90275,)


In [14]:
# Positional row indices of the training set split by the sign of logerror:
# "over" holds rows with logerror >= 0, "under" holds rows with logerror < 0.
ix_overestimated = np.where(y_train >= 0)[0]
ix_underestimated = np.where(y_train < 0)[0]
data_indices = {"over": ix_overestimated, "under": ix_underestimated}

In [16]:
# The two index sets together must account for every training row.
assert len(ix_overestimated) + len(ix_underestimated) == len(X_train)

In [17]:
# Number of training rows with logerror >= 0.
ix_overestimated.shape

(50608,)

In [18]:
# Number of training rows with logerror < 0.
ix_underestimated.shape

(39667,)

In [19]:
# A single dummy transaction date is stamped on every test row so that
# add_date_features can produce the same date columns the training features contain.
placeholder_date = pd.Timestamp('2016-12-01')
test_df['transactiondate'] = placeholder_date
test_df = add_date_features(test_df)
X_test = test_df.loc[:, train_features]
print(X_test.shape)

(2985217, 40)


## Training CatBoost

In [20]:
from catboost import CatBoostRegressor
from sklearn.base import clone

import warnings
# Silences every warning for the rest of the session, including ones raised by later cells.
warnings.filterwarnings("ignore")

In [21]:
# (name, estimator) pairs consumed by the training cells; currently a single
# CatBoost regressor that optimises and reports mean absolute error.
catboost_params = dict(
    iterations=200, learning_rate=0.03,
    depth=6, l2_leaf_reg=3,
    loss_function='MAE',
    eval_metric='MAE')
models = [("catboost", CatBoostRegressor(**catboost_params))]

### One-stage model

In [53]:
# Parcel ids are taken from test_df because X_test (and so every prediction) follows
# its row order; the index is reset so the column-wise concat below lines up by position.
test_predictions_one_stage = test_df[['parcelid']].reset_index(drop=True)

# Take the estimator from `models` so this cell does not rely on a variable
# that is only assigned by the two-stage cell further down.
current_model_name, current_model = models[0]

num_ensembles = 5
# Running sum of the ensemble members' predictions, one entry per test row.
y_pred = Series(np.zeros(X_test.shape[0]), name=current_model_name)
for i in range(num_ensembles):
    # get a clone of the model and fit the full training data with a distinct seed
    print(i+1)
    reg = clone(current_model)
    reg.set_params(random_seed=i)
    print("...fitting model")
    reg.fit(X_train, y_train, cat_features=cat_feature_inds)
    print("...obtaining predictions on test set")
    # obtain predictions on test set
    y_pred = y_pred + generate_regression_preds(reg, model_name=current_model_name)
# Average over the ensemble members.
y_pred = (y_pred / num_ensembles).rename(current_model_name)
test_predictions_one_stage = pd.concat([test_predictions_one_stage, y_pred], axis=1)

over_10
1
...fitting model
...obtaining predictions on test set
2
...fitting model
...obtaining predictions on test set
3
...fitting model
...obtaining predictions on test set
4
...fitting model
...obtaining predictions on test set
5
...fitting model
...obtaining predictions on test set


In [58]:
model_name = 'catboost'
# The same one-stage prediction is submitted for all three 2016 months;
# the 2017 months are filled with 0.
one_stage_prediction = test_predictions_one_stage['%s' % model_name]
one_stage_columns = {'ParcelId': test_predictions_one_stage['parcelid']}
for month_column in ('201610', '201611', '201612'):
    one_stage_columns[month_column] = one_stage_prediction
new_submission_one_stage = DataFrame(one_stage_columns)
for month_column in ('201710', '201711', '201712'):
    new_submission_one_stage[month_column] = 0

In [61]:
# current best
# mean_absolute_errors selects a 'month' column from the comparison frame, while train_df
# only carries 'transaction_month' at this point in the notebook; supply the alias on a
# copy so the cell works when the notebook is run top to bottom.
mean_absolute_errors(new_submission_one_stage.round(4),
                     train_df.assign(month=train_df['transaction_month']))

(0.061506992163954166,
 0.06014846659364756,
 0.07282127659574475,
 0.063519971903535519)

In [None]:
# Write the one-stage submission, rounded to four decimals, as a gzip-compressed CSV.
one_stage_rounded = new_submission_one_stage.round(4)
one_stage_rounded.to_csv(
    "/home/anerdi/Desktop/Zillow/submissions/starter_catboost.csv.gz",
    index=False, compression='gzip')

### Two-stage model

In [43]:
# Parcel ids are taken from test_df because X_test (and so every prediction) follows
# its row order; the index is reset so the column-wise concats below line up by position.
test_predictions = test_df[['parcelid']].reset_index(drop=True)
n_test_rows = X_test.shape[0]

for pair in models:
    current_model_name, current_model = pair
    print("Current model: %s" % current_model_name)

    # One model is trained per subset of the training rows ("over" / "under").
    for key, val in data_indices.items():
        type_of_zestimate, ix = key, val

        # restrict the training data to the rows of the current subset
        current_train = X_train.iloc[ix,]
        current_y = y_train.iloc[ix,]

        for month in [10]:
            print("%s_%d" % (type_of_zestimate, month))
            pred_name = "%s_%d_%s" % (current_model_name, month, type_of_zestimate)

            num_ensembles = 5
            # Running sum of the ensemble members' predictions, one entry per test row.
            y_pred = Series(np.zeros(n_test_rows), name=pred_name)
            for i in range(num_ensembles):
                # get a clone of the model and fit the current training data
                print(i+1)
                reg = clone(current_model)
                reg.set_params(random_seed=i)
                print("...fitting model")
                reg.fit(current_train, current_y, cat_features=cat_feature_inds)
                print("...obtaining predictions on test set")
                # obtain predictions on test set
                y_pred = y_pred + generate_regression_preds(reg, model_name=pred_name, month=month)
            # Average over the ensemble members.
            y_pred = (y_pred / num_ensembles).rename(pred_name)
            test_predictions = pd.concat([test_predictions, y_pred], axis=1)

Current model: catboost
under_10
1
...fitting model
...obtaining predictions on test set
2
...fitting model
...obtaining predictions on test set
3
...fitting model
...obtaining predictions on test set
4
...fitting model
...obtaining predictions on test set
5
...fitting model
...obtaining predictions on test set
over_10
1
...fitting model
...obtaining predictions on test set
2
...fitting model
...obtaining predictions on test set
3
...fitting model
...obtaining predictions on test set
4
...fitting model
...obtaining predictions on test set
5
...fitting model
...obtaining predictions on test set


In [44]:
# Preview the per-parcel predictions of the subset-specific models.
test_predictions.head()

Unnamed: 0,parcelid,catboost_10_under,catboost_10_over
0,10754147,-0.075926,0.079186
1,10759547,-0.070163,0.080409
2,10843547,-0.100376,0.101847
3,10859147,-0.09154,0.089289
4,10879947,-0.066309,0.077066


In [24]:
# Persist the subset-specific predictions (the row index is written as well).
two_stage_preds_path = "/home/anerdi/Desktop/Zillow/twostagemodel/catboost-two-stage-preds.csv"
test_predictions.to_csv(two_stage_preds_path)

In [27]:
# Load the precomputed per-parcel probabilities and expose the stacked
# prediction under the name the merge/combination cells expect.
overestimate_probabilities = (
    pd.read_csv("/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_ann_rfs_xgbs.csv.gz")
    .rename(columns={'stacked_pred': 'overestimate_prob'})
)

In [28]:
# Preview the loaded probabilities, including the renamed overestimate_prob column.
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,xgb1_overestimate_prob,xgb2_overestimate_prob,overestimate_prob
0,10754147,0.469632,0.466012,0.495383,0.658496,0.665265,0.616614
1,10759547,0.406392,0.555562,0.524675,0.536781,0.519523,0.499032
2,10843547,0.880731,0.548264,0.548752,0.430116,0.485831,0.546513
3,10859147,0.569588,0.663067,0.543329,0.67905,0.54776,0.620737
4,10879947,0.540791,0.519636,0.485341,0.50273,0.516258,0.510328


#### Merging Results

In [45]:
# Attach the overestimate probabilities to the predictions by parcel id (inner join).
test_predictions = test_predictions.merge(overestimate_probabilities, on='parcelid')

In [46]:
for current_model_name, current_model in models:
    # Blend the "over" and "under" predictions, weighting the former by
    # overestimate_prob and the latter by its complement.
    for month in [10]:
        prob_over = test_predictions['overestimate_prob']
        over_pred = test_predictions['%s_%d_over' % (current_model_name, month)]
        under_pred = test_predictions['%s_%d_under' % (current_model_name, month)]
        test_predictions['{0}_{1}'.format(current_model_name, month)] = (
            over_pred*prob_over + under_pred*(1 - prob_over))

In [47]:
# Preview the merged frame, now including the combined catboost_10 column.
test_predictions.head()

Unnamed: 0,parcelid,catboost_10_under,catboost_10_over,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,xgb1_overestimate_prob,xgb2_overestimate_prob,overestimate_prob,catboost_10
0,10754147,-0.075926,0.079186,0.469632,0.466012,0.495383,0.658496,0.665265,0.616614,0.019718
1,10759547,-0.070163,0.080409,0.406392,0.555562,0.524675,0.536781,0.519523,0.499032,0.004977
2,10843547,-0.100376,0.101847,0.880731,0.548264,0.548752,0.430116,0.485831,0.546513,0.010141
3,10859147,-0.09154,0.089289,0.569588,0.663067,0.543329,0.67905,0.54776,0.620737,0.020707
4,10879947,-0.066309,0.077066,0.540791,0.519636,0.485341,0.50273,0.516258,0.510328,0.00686


In [48]:
model_name = 'catboost'
# The same combined prediction is submitted for all three 2016 months;
# the 2017 months are filled with 0.
combined_prediction = test_predictions['%s_10' % model_name]
two_stage_columns = {'ParcelId': test_predictions['parcelid']}
for month_column in ('201610', '201611', '201612'):
    two_stage_columns[month_column] = combined_prediction
new_submission = DataFrame(two_stage_columns)
for month_column in ('201710', '201711', '201712'):
    new_submission[month_column] = 0

In [49]:
# Preview the two-stage submission frame.
new_submission.head()

Unnamed: 0,201610,201611,201612,ParcelId,201710,201711,201712
0,0.019718,0.019718,0.019718,10754147,0,0,0
1,0.004977,0.004977,0.004977,10759547,0,0,0
2,0.010141,0.010141,0.010141,10843547,0,0,0
3,0.020707,0.020707,0.020707,10859147,0,0,0
4,0.00686,0.00686,0.00686,10879947,0,0,0


In [40]:
# mean_absolute_errors selects a 'month' column from its comparison frame;
# expose transaction_month under that name.
train_df['month'] = train_df['transaction_month']

In [51]:
# two-stage with xgb
# Returns (October, November, December, overall) mean absolute error of the
# two-stage submission against the training log errors.
mean_absolute_errors(new_submission.round(4), train_df)

(0.061815873015872776,
 0.060846330777656034,
 0.07341719378953428,
 0.063970440177944135)

In [59]:
# current best
# Same (October, November, December, overall) error tuple, here for the one-stage submission.
mean_absolute_errors(new_submission_one_stage.round(4), train_df)

(0.061506992163954166,
 0.06014846659364756,
 0.07282127659574475,
 0.063519971903535519)

In [41]:
# current best
# NOTE(review): this scores the two-stage submission; in the recorded outputs its overall
# error is higher than the one-stage result, so the "current best" label looks stale — confirm.
mean_absolute_errors(new_submission.round(4), train_df)

(0.06174942736588318,
 0.06084288061336259,
 0.07342547441058092,
 0.063932673846874372)

In [52]:
# Write the two-stage submission, rounded to four decimals, as a gzip-compressed CSV.
two_stage_rounded = new_submission.round(4)
two_stage_rounded.to_csv(
    "/home/anerdi/Desktop/Zillow/submissions/two_stage_stage1_stacked_annrfsxgbs_stage2_catboost.csv.gz",
    compression='gzip', index=False)