In [4]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import datetime

#Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

# sklearn stuff
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer, PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from scipy.stats import mode,rankdata

import feature_pipelines as pipes

### Submission Functions 

In [5]:
def generate_regression_preds(reg, model_name='pred_logerror', month = -1,
                              chunk_size=100000, props=None, pipeline=None):
    """
    Predict with a fitted regressor on every row of the properties table, in chunks.

    Parameters
    ----------
    reg : fitted estimator exposing ``predict``.
    model_name : str
        Name given to the returned Series.
    month : int
        Value placed in a ``month`` column on every row before features are built.
    chunk_size : int
        Number of rows transformed and predicted per batch.
    props : DataFrame, optional
        Table to predict on; defaults to the notebook-level ``properties``.
    pipeline : object exposing ``transform``, optional
        Defaults to the notebook-level ``feature_pipeline``.

    Returns
    -------
    Series named ``model_name``, indexed by row position (0 .. n_rows - 1).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if props is None:
        props = properties
    if pipeline is None:
        pipeline = feature_pipeline
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    n_rows = props.shape[0]
    chunk_preds = []
    # range() with a step covers the final partial chunk as well, so no
    # hard-coded row counts are needed for the "fencepost" rows.
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        # assign() sets 'month' on a copy of the chunk, so the source frame is
        # never modified (and never left modified if transform/predict raises).
        chunk = props.iloc[start:stop].assign(month=month)
        chunk_feats = pipeline.transform(chunk)
        chunk_preds.append(Series(reg.predict(chunk_feats), name=model_name,
                                  index=np.arange(start, stop)))

    if not chunk_preds:
        # empty table: nothing to predict
        return Series([], name=model_name, dtype=float)
    return pd.concat(chunk_preds)

In [6]:
def generate_submissions(oct_model,nov_model,dec_model,name='new_submission',logy=True,
                         chunk_size=100000, props=None, pipeline=None):
    """
    Create the submission file for the public leaderboard predictions.

    Parameters
    ----------
    oct_model, nov_model, dec_model : fitted estimators exposing ``predict``
        One model per predicted time point (columns 201610, 201611, 201612).
    name : str
        Output path without extension; the file is written to ``name + ".gz"``.
    logy : bool
        If True the model outputs are written as-is; if False ``np.log`` is
        applied to every prediction before writing.
    chunk_size : int
        Number of rows transformed and predicted per batch.
    props : DataFrame, optional
        Table with a ``parcelid`` column; defaults to the notebook-level
        ``properties``.
    pipeline : object exposing ``transform``, optional
        Defaults to the notebook-level ``full_pipeline``.

    Returns
    -------
    DataFrame with columns ParcelId, 201610, 201611, 201612, 201710, 201711,
    201712 (the 2017 columns are filled with 0).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    AssertionError
        If the de-duplicated submission does not have one row per input row.
    """
    if props is None:
        props = properties
    if pipeline is None:
        # NOTE(review): `full_pipeline` is not defined in the visible notebook
        # cells (only `feature_pipeline` is) -- confirm which one is intended.
        pipeline = full_pipeline
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    month_models = [('201610', oct_model), ('201611', nov_model), ('201612', dec_model)]

    chunk_frames = []
    # Stepping by chunk_size also covers the final partial chunk, so the same
    # logy handling applies to every row (no separate "fencepost" branch).
    for start in range(0, props.shape[0], chunk_size):
        chunk = props.iloc[start:start + chunk_size]
        all_feats = pipeline.transform(chunk)
        preds = {}
        for col, model in month_models:
            raw = model.predict(all_feats)
            preds[col] = raw if logy else np.log(raw)
        chunk_frames.append(pd.concat([chunk[['parcelid']].reset_index(drop=True),
                                       DataFrame(preds)], axis=1))

    if chunk_frames:
        submission_df = pd.concat(chunk_frames, ignore_index=True)
    else:
        submission_df = DataFrame(columns=['parcelid', '201610', '201611', '201612'])

    submission_df['201710'] = 0
    submission_df['201711'] = 0
    submission_df['201712'] = 0

    submission_df.rename(columns={'parcelid':'ParcelId'}, inplace=True)
    # sanity check: exactly one submission row per input row
    submission_df.drop_duplicates(inplace=True)
    assert submission_df.shape[0] == props.shape[0]
    # write to .csv (values are formatted to 4 significant digits on write)
    submission_df[['ParcelId','201610','201611','201612',
                  '201710','201711','201712']].to_csv(name + ".gz", index=False, float_format='%.4g', compression='gzip')
    return submission_df

In [7]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    Compare a public-leaderboard submission against known log errors.

    Parameters
    ----------
    submission_df : DataFrame
        Must contain ``ParcelId``, ``201610``, ``201611`` and ``201612``.
    comparison_df : DataFrame
        Must contain ``parcelid``, ``logerror`` and ``month``.

    Returns
    -------
    tuple ``(oct_error, nov_error, dec_error, overall_mae)``
        Mean absolute error for months 10, 11 and 12, followed by the mean
        absolute error pooled over all rows of those three months. A month
        without matching rows yields NaN for that month only; the pooled value
        is NaN only when none of the three months has any rows.
    """
    month_columns = [(10, '201610'), (11, '201611'), (12, '201612')]

    # inner join: only parcels present in both frames are scored
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']],
                            comparison_df[['parcelid','logerror','month']],
                            left_on='ParcelId', right_on='parcelid')

    monthly_errors = []
    abs_error_total = 0.0
    n_scored = 0
    for month, pred_col in month_columns:
        month_rows = trainresults[trainresults['month'] == month]
        abs_errors = (month_rows[pred_col] - month_rows['logerror']).abs()
        monthly_errors.append(abs_errors.mean())
        # Pool sums and counts instead of mean * count so that an empty month
        # (mean == NaN) does not turn the overall figure into NaN.
        abs_error_total += abs_errors.sum()
        n_scored += int(abs_errors.count())

    overall_mae = abs_error_total / n_scored if n_scored else float('nan')
    oct_error, nov_error, dec_error = monthly_errors
    return (oct_error, nov_error, dec_error, overall_mae)

### Reading in data 

In [61]:
maindir = "/home/anerdi/Desktop/Zillow"

# Transactions: one row per sale, with the target and the sale date
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")

# Parse each date string once, then derive the ISO week number and the calendar month
transaction_dates = logerror['transactiondate'].apply(
    lambda date_str: datetime.datetime.strptime(date_str, '%Y-%m-%d'))
logerror['weeknumber'] = transaction_dates.apply(lambda parsed: parsed.isocalendar()[1])
logerror['month'] = transaction_dates.apply(lambda parsed: parsed.month)

# Property attributes for every parcel
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")

In [62]:
# Ratio features, as (new column, numerator column, denominator column):
#   N-LivingAreaProp    - finished living area relative to lot size
#   N-NonLivingAreaProp - garage area relative to lot size
#   N-ValueProp         - structure tax value relative to land tax value
#   N-ValueRatio        - total tax value relative to tax amount
ratio_features = [
    ('N-LivingAreaProp', 'calculatedfinishedsquarefeet', 'lotsizesquarefeet'),
    ('N-NonLivingAreaProp', 'garagetotalsqft', 'lotsizesquarefeet'),
    ('N-ValueProp', 'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt'),
    ('N-ValueRatio', 'taxvaluedollarcnt', 'taxamount'),
]
for new_col, numerator, denominator in ratio_features:
    properties[new_col] = properties[numerator] / properties[denominator]

# Pool: missing pool size becomes 0; the Pool flag is the sum of the two
# pool-type indicators with missing values counted as 0
properties['poolsizesum'] = properties['poolsizesum'].fillna(0)
pool_type_2 = properties['pooltypeid2'].fillna(0)
pool_type_7 = properties['pooltypeid7'].fillna(0)
properties['Pool'] = (pool_type_2 + pool_type_7).astype(int)

# Placeholder ids for rows with a missing county / city
for region_col, placeholder in [('regionidcounty', 9999), ('regionidcity', -1)]:
    properties[region_col] = properties[region_col].fillna(placeholder)

# Age relative to 2017, and rooms left over after bathrooms and bedrooms (floored at 0)
properties['age'] = 2017 - properties['yearbuilt']
leftover_rooms = (properties['roomcnt'].values
                  - properties['calculatedbathnbr'].values
                  - properties['bedroomcnt'].values)
properties['additional_rooms_count'] = np.maximum(leftover_rooms, 0)

# Higher order terms of the bedroom count
for power in (2, 3):
    properties['bedroomcnt%d' % power] = properties['bedroomcnt'] ** power

In [7]:
# properties without regionidcounty are missing all features
# properties['regionidcounty'].isnull().sum()

### Read in outlier detection results by Gacia and Qishu 

In [8]:
# outliers = np.loadtxt("exclude_index.csv", dtype=int)
# data = data.iloc[np.setdiff1d(np.arange(data.shape[0]), outliers),:].reset_index(drop=True)

### Load in Zestimate Type preds

In [9]:
# zestimate_type = pd.read_csv("/home/anerdi/Desktop/Zillow/twostagemodel/zestimatetype_probs_rf_balanced.csv.gz")
# zestimate_type['zestimate_type'] = np.argmax(zestimate_type[['1','2','3']].values, axis=1) + 1

# assert (properties['parcelid'] == zestimate_type['parcelid']).all()
# properties['zestimate_type'] = zestimate_type['zestimate_type']

### Data Preprocessing Pipeline

In [83]:
# Variables considered in the model

# Numerical variables
num_atts = [
    'garagetotalsqft',
    'calculatedbathnbr',
    'structuretaxvaluedollarcnt',
    'bedroomcnt',
    'age',
]

# Numerical variables that go through the polynomial-interaction branch
num_atts_to_interact = ['calculatedfinishedsquarefeet', 'lotsizesquarefeet']

# Categorical variables
cat_atts = [
    'airconditioningtypeid',
    'heatingorsystemtypeid',
    'Pool',
    'propertylandusetypeid',
    'taxdelinquencyflag',
    'architecturalstyletypeid',
    'regionidcounty',
    'month',
]

# Default levels for every categorical variable we know about ...
default_cat_levels = {
    'airconditioningtypeid': [-1, *range(1, 14)],
    'architecturalstyletypeid': [-1, *range(1, 28)],
    'buildingclasstypeid': [-1, *range(1, 6)],
    'heatingorsystemtypeid': [-1, *range(1, 26)],
    'pooltypeid10': [-1, 0, 1],
    'pooltypeid2': [-1, 0, 1],
    'pooltypeid7': [-1, 0, 1],
    'Pool': [0, 1],
    'propertylandusetypeid': [-1, 31, 46, 47, 246, 247, 248, 260, 261, 262, 263, 264, 265,
                              266, 267, 268, 269, 270, 271, 273, 274, 275, 276, 279, 290, 291],
    'regionidcounty': [-1, 2061, 3101, 1286],
    'regionidcity': properties.regionidcity.unique().tolist(),
    'month': [-1, *range(1, 13)],
    'zestimate_type': [1, 2, 3],
    'storytypeid': [-1, *range(1, 36)],
    'typeconstructiontypeid': [-1, *range(1, 19)],
    'yearbuilt': [-1, *range(1885, 2018)],
    'fireplaceflag': [-1, 'True', 'False'],
    'taxdelinquencyflag': [-1, 'Y', 'N'],
}

# ... restricted to the categorical variables actually used (cat_atts)
cat_dict = {name: levels for name, levels in default_cat_levels.items() if name in cat_atts}

# Pairs to interact, as (x1, x2) where x1 is categorical and x2 is continuous
interact_pairs = [('regionidcounty', continuous_att)
                  for continuous_att in ('calculatedbathnbr',
                                         'bedroomcnt',
                                         'structuretaxvaluedollarcnt',
                                         'age')]

In [84]:
# Categorical branch: a single DF_Selector_GetDummies step configured with cat_dict
cat_pipeline = Pipeline(steps=[
    ('select_and_dummify', pipes.DF_Selector_GetDummies(cat_dict)),
])

# Numerical branch: select num_atts -> impute -> standardise
num_pipeline = Pipeline(steps=[
    ('selector', pipes.DataFrameSelector(num_atts)),
    ('imputer', Imputer()),
    ('scaler', StandardScaler()),
])

# Categorical x continuous interaction branch, driven by interact_pairs
cat_interact_pipeline = Pipeline(steps=[
    ('dummify_and_interact', pipes.Dummify_and_Interact(interact_pairs, cat_dict)),
])

# Numerical interaction branch: select -> impute -> degree-2 polynomial terms (no bias) -> standardise
num_interact_pipeline = Pipeline(steps=[
    ('selector', pipes.DataFrameSelector(num_atts_to_interact)),
    ('imputer', Imputer()),
    ('polynomial_features', PolynomialFeatures(2, include_bias=False)),
    ('scaler', StandardScaler()),
])

# Full pipeline: the outputs of the four branches side by side, in this order
feature_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("num_interact_pipeline", num_interact_pipeline),
    ("cat_pipeline", cat_pipeline),
    ("cat_interact_pipeline", cat_interact_pipeline),
])

In [85]:
# Impute missing num_atts per county: each missing value is replaced by the
# column mean of the rows in the same regionidcounty. Rows carrying the 9999
# placeholder county are filled with the column mean over all rows instead.
for countyid in properties.regionidcounty.unique():
    # Boolean row mask. `.loc` selects by index *label*, so the positional
    # indices returned by np.where are only correct on a default RangeIndex;
    # the mask is correct for any index.
    in_county = properties['regionidcounty'] == countyid

    if countyid != 9999:
        fill_values = properties.loc[in_county, num_atts].mean()
    else:
        # Evaluated when the loop reaches 9999, so it reflects any counties
        # that were already imputed earlier in this loop.
        fill_values = properties[num_atts].mean()

    properties.loc[in_county, num_atts] = properties.loc[in_county, num_atts].fillna(fill_values)

In [86]:
# every numerical attribute must be fully populated after the imputation
assert not properties[num_atts].isnull().any().any()

In [87]:
# Training set: property rows joined on parcelid to their transactions (target + month)
transaction_targets = logerror[['parcelid', 'logerror', 'month']]
data = properties.merge(transaction_targets, on='parcelid')

### Some EDA

In [1]:
# properties.groupby('regionidzip')['regionidcounty'].unique().apply(lambda x: mode(x)[0])[properties.groupby('regionidzip')['regionidcounty'].nunique() == 2]

In [2]:
# properties.groupby('regionidzip')['regionidcounty'].apply(lambda x: mode(list(x)))[properties.groupby('regionidzip')['regionidcounty'].nunique() == 2]

In [62]:
# data['zip_rank'] = rankdata(data['regionidzip'], method='dense')
# data['city_rank'] = rankdata(data['regionidcity'], method='dense')
# data['neighbor_rank'] = rankdata(data['regionidneighborhood'], method='dense')

In [3]:
# plt.figure(figsize=(10,10))
# plt.scatter(data['regionidcounty'],data.logerror)

In [88]:
feature_pipeline.fit(data) # fits the pipeline on `data` (the merged training rows), not on the full `properties` frame

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('selector', DataFrameSelector(desired_cols=['garagetotalsqft', 'calculatedbathnbr', 'structuretaxvaluedollarcnt', 'bedroomcnt', 'age'])), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), (...', 'bedroomcnt'), ('regionidcounty', 'structuretaxvaluedollarcnt'), ('regionidcounty', 'age')]))]))],
       transformer_weights=None)

In [89]:
import gc

In [90]:
gc.collect()

1409

## Splitting the Training Set

In [40]:
# Row positions of the training set, split by the sign of the target
# (a logerror of exactly 0 goes to the "over" group)
target_values = data['logerror'].values
ix_overestimated = np.flatnonzero(target_values >= 0)
ix_underestimated = np.flatnonzero(target_values < 0)
data_indices = dict(over=ix_overestimated, under=ix_underestimated)

In [41]:
# the two groups together must account for every training row
assert len(ix_overestimated) + len(ix_underestimated) == len(data)

## Training Elastic Net

In [42]:
from sklearn.linear_model import ElasticNet, Lars, HuberRegressor
from sklearn.base import clone

import warnings
# Suppresses every warning for the rest of the session, including any raised
# while the models are fitted later in the notebook.
warnings.filterwarnings("ignore")

In [43]:
# (name, estimator) pairs to train. The name becomes the prefix of the
# prediction columns. With l1_ratio = 0 the ElasticNet penalty has no L1 part,
# hence the "ridge" label. The commented-out entries are alternative models
# that are currently disabled.
models = [
    ("ridge",ElasticNet(alpha=1.25, l1_ratio = 0, max_iter=1000)),
#     ("enet", ElasticNet(alpha=0.025, l1_ratio = 0.5, max_iter=1000)),
#     ("lasso", ElasticNet(alpha=0.025, l1_ratio = 1, max_iter=1000)),
#     ("larm", Lars(n_nonzero_coefs = 1)),
#     ("huber", HuberRegressor())
]

In [91]:
# Start from the parcel ids only; prediction columns are appended to the right
test_predictions = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv", usecols=['parcelid'])

for current_model_name, current_model in models:
    print("Current model: %s" % current_model_name)

    for type_of_zestimate, ix in data_indices.items():
        # training rows belonging to the current group ("over" / "under")
        current_traindata = data.iloc[ix,]
        train_feats = feature_pipeline.transform(current_traindata)

        # fit a fresh clone so the template estimator in `models` stays unfitted
        reg = clone(current_model)
        reg.fit(train_feats, current_traindata['logerror'])

        # one column of test-set predictions per target month
        monthly_preds = [
            generate_regression_preds(
                reg,
                model_name="%s_%d_%s" % (current_model_name, target_month, type_of_zestimate),
                month=target_month)
            for target_month in [10, 11, 12]
        ]
        test_predictions = pd.concat([test_predictions] + monthly_preds, axis=1)

Current model: ridge


In [92]:
test_predictions.head()

Unnamed: 0,parcelid,ridge_10_over,ridge_11_over,ridge_12_over,ridge_10_under,ridge_11_under,ridge_12_under
0,10754147,0.081578,0.081653,0.08189,-0.082842,-0.082959,-0.083049
1,10759547,0.080076,0.080151,0.080388,-0.080245,-0.080362,-0.080452
2,10843547,1.631616,1.631691,1.631928,-0.819814,-0.819932,-0.820021
3,10859147,0.109882,0.109957,0.110194,-0.114627,-0.114745,-0.114834
4,10879947,0.09185,0.091926,0.092162,-0.095312,-0.095429,-0.095519


In [83]:
# Persist the per-group, per-month predictions as a gzip-compressed CSV
stage2_preds_path = "/home/anerdi/Desktop/Zillow/twostagemodel/two_stage_preds_linear_models_age.csv.gz"
test_predictions.to_csv(stage2_preds_path, index=False, compression='gzip')

In [74]:
# Stage-1 output: the stacked prediction column is exposed as 'overestimate_prob'
overestimate_probabilities = (
    pd.read_csv("/home/anerdi/Desktop/Zillow/twostagemodel/overestimate_probs_stacked_ann_rfs.csv.gz")
    .rename(columns={'stacked_pred': 'overestimate_prob'})
)

In [75]:
overestimate_probabilities.head()

Unnamed: 0,parcelid,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob
0,10754147,0.469632,0.466012,0.495383,0.479207
1,10759547,0.406392,0.555562,0.524675,0.475759
2,10843547,0.880731,0.548264,0.548752,0.732975
3,10859147,0.569588,0.663067,0.543329,0.6021
4,10879947,0.540791,0.519636,0.485341,0.531297


#### Merging Results

In [93]:
# attach the stage-1 probabilities to the stage-2 predictions, matched on parcelid
test_predictions = test_predictions.merge(overestimate_probabilities, on='parcelid')

In [94]:
# Final prediction per model and month: the "over" and "under" predictions
# weighted by overestimate_prob and (1 - overestimate_prob) respectively
prob_over = test_predictions['overestimate_prob']
for current_model_name, _ in models:
    for month in [10, 11, 12]:
        column_prefix = '%s_%d' % (current_model_name, month)
        over_part = test_predictions[column_prefix + '_over'] * prob_over
        under_part = test_predictions[column_prefix + '_under'] * (1 - prob_over)
        test_predictions[column_prefix] = over_part + under_part

In [95]:
test_predictions.head()

Unnamed: 0,parcelid,ridge_10_over,ridge_11_over,ridge_12_over,ridge_10_under,ridge_11_under,ridge_12_under,ann_overestimate_prob,rf2_overestimate_prob,rf3_overestimate_prob,overestimate_prob,ridge_10,ridge_11,ridge_12
0,10754147,0.081578,0.081653,0.08189,-0.082842,-0.082959,-0.083049,0.469632,0.466012,0.495383,0.479207,-0.004051,-0.004076,-0.004009
1,10759547,0.080076,0.080151,0.080388,-0.080245,-0.080362,-0.080452,0.406392,0.555562,0.524675,0.475759,-0.003971,-0.003996,-0.003931
2,10843547,1.631616,1.631691,1.631928,-0.819814,-0.819932,-0.820021,0.880731,0.548264,0.548752,0.732975,0.977023,0.977047,0.977197
3,10859147,0.109882,0.109957,0.110194,-0.114627,-0.114745,-0.114834,0.569588,0.663067,0.543329,0.6021,0.02055,0.020548,0.020655
4,10879947,0.09185,0.091926,0.092162,-0.095312,-0.095429,-0.095519,0.540791,0.519636,0.485341,0.531297,0.004127,0.004112,0.004196


In [96]:
model_name = 'ridge'

# ParcelId plus the blended 2016 predictions of the chosen model
submission_columns = {'ParcelId': test_predictions['parcelid']}
for month in (10, 11, 12):
    submission_columns['2016%d' % month] = test_predictions['%s_%d' % (model_name, month)]
new_submission = DataFrame(submission_columns)

# the 2017 columns are filled with zeros
for placeholder_col in ('201710', '201711', '201712'):
    new_submission[placeholder_col] = 0

In [97]:
new_submission.head()

Unnamed: 0,201610,201611,201612,ParcelId,201710,201711,201712
0,-0.004051,-0.004076,-0.004009,10754147,0,0,0
1,-0.003971,-0.003996,-0.003931,10759547,0,0,0
2,0.977023,0.977047,0.977197,10843547,0,0,0
3,0.02055,0.020548,0.020655,10859147,0,0,0
4,0.004127,0.004112,0.004196,10879947,0,0,0


In [78]:
# two-stage with lasso
mean_absolute_errors(new_submission.round(4), data)

(0.06203341370303383,
 0.06111768893756846,
 0.07369401955146644,
 0.064211554671037174)

In [32]:
# two-stage with lasso
mean_absolute_errors(new_submission.round(4), data)

(0.062150452079565835,
 0.06117146768893759,
 0.07377130534790113,
 0.0643069772886911)

In [61]:
# two-stage with enet
mean_absolute_errors(new_submission.round(4), data)

(0.06201786216596326,
 0.061116046002190644,
 0.07367130534790112,
 0.064197518145633251)

In [39]:
# two-stage with enet
mean_absolute_errors(new_submission.round(4), data)

(0.06215055254169163,
 0.06117190580503834,
 0.07377107533064983,
 0.06430708265043307)

In [44]:
# two-stage with larm
mean_absolute_errors(new_submission.round(4), data)

(0.0621774763914004,
 0.06121692223439218,
 0.073834847613571,
 0.064345375790213044)

In [46]:
# two-stage with huber
mean_absolute_errors(new_submission.round(4), data)

(0.06323550331525006,
 0.062391949616648505,
 0.0748250718803912,
 0.06541461016155467)

In [98]:
# two-stage with ridge
mean_absolute_errors(new_submission.round(4), data)

(0.0620110709262608,
 0.06117809419496172,
 0.07365485911443352,
 0.064203476937485349)

In [123]:
# two-stage with ridge
mean_absolute_errors(new_submission.round(4), data)

(0.06200319469559975,
 0.06116593647316538,
 0.07366687751581366,
 0.064198735659096215)

In [99]:
# Write the submission, rounded to 4 decimals, as a gzip-compressed CSV
submission_path = "/home/anerdi/Desktop/Zillow/submissions/two_stage_stage1_annrfs_stage2_ridge_city.csv.gz"
new_submission.round(4).to_csv(submission_path, index=False, compression='gzip')