In [163]:
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import os
import gc
import datetime
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, mean_squared_error, precision_score

In [2]:
maindir = "/home/anerdi/Desktop/Zillow"

In [55]:
# linux
logerror = pd.read_csv(maindir + "/data/train_2016_v2.csv/train_2016_v2.csv")
logerror['weeknumber'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').isocalendar()[1])
logerror['month'] = logerror['transactiondate'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').month)
properties = pd.read_csv(maindir + "/data/properties_2016.csv/properties_2016.csv")
test_parcels = pd.read_csv(maindir + "/data/sample_submission.csv", usecols = ['ParcelId'])
test_parcels.rename(columns={'ParcelId':'parcelid'}, inplace=True)

# join on parcel id
data = pd.merge(properties,logerror[['parcelid','logerror','month']], on='parcelid')

  interactivity=interactivity, compiler=compiler, result=result)


## Simple Data Preprocessing

In [4]:
from sklearn.preprocessing import LabelEncoder, Imputer 

In [5]:
data.columns

Index(['parcelid', 'airconditioningtypeid', 'architecturalstyletypeid',
       'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
       'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid',
       'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
       'finishedsquarefeet50', 'finishedsquarefeet6', 'fips', 'fireplacecnt',
       'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
       'unitcnt', 'yardbuildingsqft17', 'yardbuildin

In [26]:
# Setup variables considered in the model

# numerical variables
num_atts = ['bathroomcnt','bedroomcnt','buildingqualitytypeid','calculatedbathnbr','finishedfloor1squarefeet',
           'calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet13',
           'finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fireplacecnt',
           'fullbathcnt','garagecarcnt','garagetotalsqft','latitude','longitude','lotsizesquarefeet',
           'poolcnt','poolsizesum','censustractandblock','roomcnt','threequarterbathnbr','unitcnt',
           'yardbuildingsqft17','yardbuildingsqft26','numberofstories',
            'structuretaxvaluedollarcnt','taxvaluedollarcnt','landtaxvaluedollarcnt','taxamount']

# categorical variables
cat_atts = ['airconditioningtypeid','architecturalstyletypeid',
           'buildingclasstypeid','heatingorsystemtypeid','pooltypeid10','pooltypeid2',
            'pooltypeid7','propertylandusetypeid','rawcensustractandblock','regionidcounty','regionidcity',
            'regionidzip','regionidneighborhood',
           'storytypeid','typeconstructiontypeid','yearbuilt','fireplaceflag',
           'taxdelinquencyflag']

In [222]:
def preprocess(data):
    X = data[num_atts + cat_atts].copy()
    # fill in missing values
    for c in num_atts:
        X[c].fillna(X[c].mean(skipna=True), inplace=True)
        X[c] = X[c].astype('float32')

    for c in cat_atts:
        if X[c].dtype == object:
    #         print(c)
            X[c].fillna("-1", inplace=True)
            X[c] = X[c].astype(str)
        else:
            X[c].fillna(-1, inplace=True)
        lbl = LabelEncoder()
        lbl.fit(list(X[c].values))
        X[c] = lbl.transform(X[c])
        X[c] = X[c].astype('float32')
    return X

In [152]:
X.memory_usage().sum()

15878791

## Submission functions

In [144]:
def generate_submissions(oct_model,nov_model,dec_model,name='new_submission',logy=True):
    """
    This function creates the submission file for the public leaderboard predictions.
    Three already fitted models, one for each of the predicting time points, is required.
    """
    submission_df = DataFrame()
    for i in range(int(properties.shape[0] / 100000)):
        all_feats = preprocess(properties.iloc[i*100000:(i+1)*100000])
        foo = properties.iloc[i*100000:(i+1)*100000][['parcelid']].reset_index(drop=True)
        if logy:
            foo = pd.concat([foo, DataFrame({'201610': oct_model.predict(all_feats),
                                                            '201611': nov_model.predict(all_feats),
                                                            '201612': dec_model.predict(all_feats)})], axis=1)
        else:
            foo = pd.concat([foo, DataFrame({'201610': np.log(oct_model.predict(all_feats)),
                                                            '201611': np.log(nov_model.predict(all_feats)),
                                                            '201612': np.log(dec_model.predict(all_feats))})], axis=1)
        submission_df = pd.concat([submission_df, foo], ignore_index=True)

    #  fencepost problem
    all_feats = preprocess(properties.iloc[2900000:])
    foo = properties.iloc[2900000:][['parcelid']].reset_index(drop=True)
    foo = pd.concat([foo, DataFrame({'201610': oct_model.predict(all_feats),
                                                    '201611': nov_model.predict(all_feats),
                                                    '201612': dec_model.predict(all_feats)})], axis=1)
    submission_df = pd.concat([submission_df, foo], ignore_index=True)
    
    submission_df['201710'] = 0
    submission_df['201711'] = 0
    submission_df['201712'] = 0
    
    submission_df.rename(columns={'parcelid':'ParcelId'}, inplace=True)    
    submission_df[['201610','201611','201612','201710','201711','201712']]= submission_df[['201610','201611','201612',
                                                                                           '201710','201711','201712']].round(4)
    # unit test
    submission_df.drop_duplicates(inplace=True)
    assert submission_df.shape[0] == properties.shape[0]
    # write to .csv
    submission_df[['ParcelId','201610','201611','201612',
                  '201710','201711','201712']].round(4).to_csv(name + ".csv", index=False)
    return submission_df

In [121]:
def mean_absolute_errors(submission_df, comparison_df):
    """
    This function takes a submission entry for public leaderboard, and returns
    the training error for each month.
    """
    # training error
    trainresults = pd.merge(submission_df[['ParcelId','201610','201611','201612']], comparison_df[['parcelid','logerror','month']],
                           left_on='ParcelId', right_on='parcelid')
    oct_error = abs(trainresults[trainresults['month'] == 10]['201610'] 
                    - trainresults[trainresults['month'] == 10]['logerror']).mean()
    nov_error = abs(trainresults[trainresults['month'] == 11]['201611'] 
                    - trainresults[trainresults['month'] == 11]['logerror']).mean()
    dec_error = abs(trainresults[trainresults['month'] == 12]['201612'] 
                    - trainresults[trainresults['month'] == 12]['logerror']).mean()
    overall_mae = (oct_error*(trainresults['month'] == 10).sum() + nov_error*(trainresults['month'] == 11).sum() 
                        + dec_error*(trainresults['month'] == 12).sum()) / (trainresults['month'].isin([10,11,12])).sum()
    return (oct_error, nov_error, dec_error, overall_mae)

## Train/Test Set Split

In [123]:
from sklearn.model_selection import train_test_split

In [129]:
traindata, testdata = train_test_split(data, test_size = 0.2, random_state=9)
traindata = traindata.reset_index(drop=True)
testdata = testdata.reset_index(drop=True)

## Sample weights

In [130]:
p75 = np.percentile(data['logerror'],75)
p25 = np.percentile(data['logerror'],25)

In [131]:
extremelogerrors = ((data['logerror'] >= p75) | (traindata['logerror'] <= p25))*0
traindata['wts_oct'] = extremelogerrors + np.where(traindata['month'] == 10, 2, 1)
traindata['wts_nov'] = extremelogerrors + np.where(traindata['month'] == 11, 2, 1)
traindata['wts_dec'] = extremelogerrors + np.where(traindata['month'] == 12, 2, 1)

# Random Forests

In [153]:
X_train = preprocess(traindata)
Y_train = traindata['logerror']

In [135]:
from sklearn.ensemble import RandomForestRegressor

In [136]:
oct_regr_rf = RandomForestRegressor(n_estimators = 200, max_features = 9, random_state=42, max_depth=25, criterion='mse')
nov_regr_rf = RandomForestRegressor(n_estimators = 200, max_features = 9, random_state=42, max_depth=25, criterion='mse')
dec_regr_rf = RandomForestRegressor(n_estimators = 200, max_features = 9, random_state=42, max_depth=25, criterion='mse')

oct_regr_rf.fit(X_train,Y_train, sample_weight=traindata['wts_oct'])
nov_regr_rf.fit(X_train,Y_train, sample_weight=traindata['wts_nov'])
dec_regr_rf.fit(X_train,Y_train, sample_weight=traindata['wts_dec'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
           max_features=9, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [145]:
submission_df = generate_submissions(oct_regr_rf, nov_regr_rf, dec_regr_rf)

In [146]:
mean_absolute_errors(submission_df, traindata)

(0.05620112640800991,
 0.056164245810055885,
 0.06544034833091437,
 0.0580642909625275)

In [147]:
mean_absolute_errors(submission_df, testdata)

(0.0665411405295316,
 0.06945380710659899,
 0.08799916897506922,
 0.071661427748992534)

In [148]:
mean_absolute_errors(submission_df, data)

(0.058241289933694845,
 0.05903176341730551,
 0.07012334675100626,
 0.060829243736829672)

# XGB

In [142]:
import xgboost as xgb
from xgboost import XGBRegressor

### Validation Set

In [232]:
data['wts_oct'] = np.where(data['month'] == 10, 1.5, 1)
data['wts_nov'] = np.where(data['month'] == 11, 1.5, 1)
data['wts_dec'] = np.where(data['month'] == 12, 1.5, 1)

In [233]:
traindata, valdata = train_test_split(data, test_size = 0.1, random_state=9)

In [234]:
valdata.shape

(9028, 63)

In [236]:
X_train = preprocess(data)
Y_train = data['logerror']

In [237]:
d_train = xgb.DMatrix(X_train,label=Y_train)
# d_val = xgb.DMatrix(X_val,label=Y_val)
# watchlist = [(d_train, 'train'), (d_val, 'val')]

In [238]:
oct_xgb = XGBRegressor(seed=42, n_estimators=1000, max_depth=4, learning_rate=0.02)
nov_xgb = XGBRegressor(seed=42, n_estimators=1000, max_depth=4, learning_rate=0.02)
dec_xgb = XGBRegressor(seed=42, n_estimators=1000, max_depth=4, learning_rate=0.02)

In [239]:
print("Training")
print("training oct model...")
oct_xgb.fit(X_train,Y_train,sample_weight=traindata['wts_oct'], eval_metric='mae')
print("training nov model...")
nov_xgb.fit(X_train,Y_train,sample_weight=traindata['wts_nov'], eval_metric='mae')
print("training dec model...")
dec_xgb.fit(X_train,Y_train,sample_weight=traindata['wts_dec'], eval_metric='mae')

Training
training oct model...
training nov model...
training dec model...


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=1)

In [240]:
submission_df = generate_submissions(oct_xgb, nov_xgb, dec_xgb, name="XGB_1000")

In [242]:
submission_df.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,,,,0,0,0
1,10759547,,,,0,0,0
2,10843547,,,,0,0,0
3,10859147,,,,0,0,0
4,10879947,,,,0,0,0


In [241]:
mean_absolute_errors(submission_df,data)

(nan, nan, nan, nan)