In [1]:
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.feature_selection import VarianceThreshold

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import time
import datetime as dt
import gc

In [2]:
train_datapath_2016 = "./zillow data/merged_2016 v4.csv"
train_datapath_2017 = "./zillow data/merged_2017 v4.csv"

test_datapath_2016 = "./zillow data/properties_2016 v4.csv"
test_datapath_2017 = "./zillow data/properties_2017 v4.csv"

In [3]:
# modify 'transactiondate' of df to keep only the month
def parse_transactiondate(df):
    # modify transactiondate col to keep only the month
    transactiondate_year = []
    transactiondate_month = [] 
    transactiondate_quarter = []
    for data in df['transactiondate']:
        temp = data.split("-")
        year = int(temp[0])
        transactiondate_year.append(year) # keep only month
        month = int(temp[1])
        month = (year - 2016)*12 + month
        transactiondate_month.append(month)
        quarter = (year-2016)*4 + int((month-1)/3) + 1
        transactiondate_quarter.append(quarter)
        
    transactiondate_year = np.array(transactiondate_year)
    transactiondate_year = transactiondate_year.reshape(transactiondate_year.shape[0], 1)
    transactiondate_month = np.array(transactiondate_month)
    transactiondate_month = transactiondate_month.reshape(transactiondate_month.shape[0], 1)
    transactiondate_quarter = np.array(transactiondate_quarter)
    transactiondate_quarter = transactiondate_month.reshape(transactiondate_quarter.shape[0], 1)

    df['transactiondate_year'] = transactiondate_year
    df['transactiondate_month'] = transactiondate_month
    df['transactiondate_quarter'] = transactiondate_quarter
    df = df.drop('transactiondate',axis = 1)
    return df

# return label index, feature index list
# assumes that df label_is_first
def split_on_label(df):
    return df.columns[0], df.columns[1:]

def gen_testdata(df, transactiondate_year,transactiondate_month,transactiondate_quarter):
    df_copy = df.copy()
    # df_copy['transactiondate'] = new_transactiondate
    df_copy.insert(0, 'transactiondate_year', transactiondate_year)
    df_copy.insert(0, 'transactiondate_month', transactiondate_month)
    df_copy.insert(0, 'transactiondate_quarter', transactiondate_quarter)
    return df_copy

def get_low_var_feature(support_list):
    low_var_feature_index = []
    support_feature_index = []
    for i in range(len(support_list)):
        if support_list[i]:
            support_feature_index.append(i)
        else:
            low_var_feature_index.append(i)
    return low_var_feature_index, support_feature_index


def handle_low_var(sel, train_data, feature_list):
    temp_sel = sel.fit(train_data)
    low_var_list, support_list = get_low_var_feature(sel.get_support())

    return low_var_list


# Removing Features with more than 98% missing data
def removing_missing(X_train, missing_threshold = 0.95):
    exclude_missing = []
    num_rows = X_train.shape[0]
    for col in X_train.columns:
        num_missing = X_train[col].isna().sum()
        if num_missing == 0:
            continue
        missing_fraction = num_missing/float(num_rows)
        if missing_fraction > missing_threshold:
            exclude_missing.append(col)
            
    return exclude_missing

# removing features with unique value
def removing_unique(X_train):
    exclude_unique = []
    for col in X_train.columns:
        num_unique = len(X_train[col].unique())
        if X_train[col].isna().sum()!=0 and num_unique == 1:
            exclude_unique.append(col)
            
    return exclude_unique

def print_list(li,list_name = ''):
    print('%s (%d) :' %(list_name,len(li)))
    for i in li:
        print(i)
        
def get_train_features(X_train,el1 = [], el2 = [], el3 = [],el4 = []):
    train_feature = []
    for col in X_train:
        if col not in el1 and col not in el2 and col not in el3 and col not in el4:
            train_feature.append(col)
    return train_feature

# geting categorical features
def get_cat_feature(X_train, cat_threshold):
    cat_feature_inds = []
    cat_feature = []
    for i,col in enumerate(X_train):
        num_uniques = len(X_train[col].unique())
        if num_uniques < cat_threshold and not 'sqft' in col and not 'cnt' in col \
        and not 'nbr' in col and not 'number' in col:
            cat_feature_inds.append(i)
            cat_feature.append(col)
    return cat_feature_inds,cat_feature

In [4]:
# read training data
train_data16 = pd.read_csv(train_datapath_2016, index_col=0)
train_data17 = pd.read_csv(train_datapath_2017, index_col=0)

# drop parcelid (index of properties features)
train_data16 = train_data16.drop('parcelid', axis=1)
train_data17 = train_data17.drop('parcelid', axis=1)

# modify 'transactiondate' to keep only the month
train_data16 = parse_transactiondate(train_data16)
train_data17 = parse_transactiondate(train_data17)

train_data = pd.concat([train_data16,train_data17],axis = 0)

label, feature = split_on_label(train_data)

# training data and label for model 2016
X_train = train_data[feature]

y_train = train_data[label]

del train_data16, train_data17
gc.collect()

print("[2016] num of features:", len(feature))
print("[2016] num of instances:", train_data.shape[0])

[2016] num of features: 60
[2016] num of instances: 167888


In [5]:
# Removing Freatures with too many missing data
exclude_missing = removing_missing(X_train)

# Removing Features with unique values
exclude_unique = removing_unique(X_train)

# Removing Features with low variance
# it should remove features with many missing values that were filled by imputation
#sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

#exclude_low_var16 = handle_low_var(sel, X_train16, feature16)
#exclude_low_var17 = handle_low_var(sel, X_train17, feature17)

In [6]:
X_train.fillna(-999,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [7]:
print_list(exclude_missing,'exclude_missing')
#print_list(exclude_low_var16,'exclude_low_var16')
print_list(exclude_unique,'exclude_unique')

exclude_missing (18) :
architecturalstyletypeid
basementsqft
buildingclasstypeid
decktypeid
finishedsquarefeet13
finishedsquarefeet15
finishedsquarefeet6
hashottuborspa
poolsizesum
pooltypeid10
pooltypeid2
storytypeid
typeconstructiontypeid
yardbuildingsqft17
yardbuildingsqft26
fireplaceflag
taxdelinquencyflag
taxdelinquencyyear
exclude_unique (0) :


In [8]:
train_feature = get_train_features(X_train, el1 = exclude_missing, 
                                                #el2 = exclude_low_var16,
                                                el3 = exclude_unique)

new_X_train = X_train[train_feature]

print("\n[2016] num of features:", new_X_train.shape[1])
print_list(train_feature,"train_feature16")


[2016] num of features: 42
train_feature16 (42) :
airconditioningtypeid
bathroomcnt
bedroomcnt
buildingqualitytypeid
calculatedbathnbr
finishedfloor1squarefeet
calculatedfinishedsquarefeet
finishedsquarefeet12
finishedsquarefeet50
fips
fireplacecnt
fullbathcnt
garagecarcnt
garagetotalsqft
heatingorsystemtypeid
latitude
longitude
lotsizesquarefeet
poolcnt
pooltypeid7
propertycountylandusecode
propertylandusetypeid
propertyzoningdesc
rawcensustractandblock
regionidcity
regionidcounty
regionidneighborhood
regionidzip
roomcnt
threequarterbathnbr
unitcnt
yearbuilt
numberofstories
structuretaxvaluedollarcnt
taxvaluedollarcnt
assessmentyear
landtaxvaluedollarcnt
taxamount
censustractandblock
transactiondate_year
transactiondate_month
transactiondate_quarter


In [19]:
cat_feature_inds, cat_feature = get_cat_feature(new_X_train, cat_threshold = 1000)
print_list(cat_feature,'cat_feature')

cat_feature (16) :
airconditioningtypeid
buildingqualitytypeid
fips
heatingorsystemtypeid
pooltypeid7
propertycountylandusecode
propertylandusetypeid
regionidcity
regionidcounty
regionidneighborhood
regionidzip
yearbuilt
assessmentyear
transactiondate_year
transactiondate_month
transactiondate_quarter


### Feauture Selection

In [12]:
from catboost import CatBoostRegressor
def handle_catboost_RCV(X, y, param_dict,N):
    
    # cross-validate on alpha (regularization strenght) from alphalist
    
    model = CatBoostRegressor(n_jobs = -1, random_state = 42,silent = False)
       
    # cross-validate on random search CV
    random_search = RandomSearchCV(model,param_distributions=param_dict,
                                 n_iter = N,random_state = 21,
                                 cv=5, scoring='neg_mean_absolute_error',verbose = True)
    
    start = time.time()
    grid_search.fit(X, y)
    print("RandomSearchCV took %.2f seconds"
          " parameter settings." % (time.time() - start))

    return random_search

In [21]:
def handle_catboost_GridCV(X, y, param_dict,cat_inds):
    
    # cross-validate on alpha (regularization strenght) from alphalist
    
    model = CatBoostRegressor(random_state = 42,silent = False)
       
    # cross-validate on random search CV
    grid_search = GridSearchCV(model,param_grid=param_dict,
                               cv=5, scoring='neg_mean_absolute_error',verbose = True)
    
    start = time.time()
    grid_search.fit(X, y,cat_features = cat_inds)
    print("GridSearchCV took %.2f seconds"
          " parameter settings." % (time.time() - start))

    return grid_search

In [None]:
params_dict = {
    'learning_rate':[0.03],
    'depth':[6],
    'l2_leaf_reg':[3],
    'iterations':[600]
}
grid_result = handle_catboost_GridCV(new_X_train, y_train, params_dict, cat_feature_inds)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0:	learn: 0.1642779	total: 468ms	remaining: 4m 40s
1:	learn: 0.1642050	total: 823ms	remaining: 4m 6s
2:	learn: 0.1641348	total: 1.16s	remaining: 3m 51s
3:	learn: 0.1640798	total: 1.59s	remaining: 3m 57s
4:	learn: 0.1640169	total: 2s	remaining: 3m 57s
5:	learn: 0.1639631	total: 2.38s	remaining: 3m 55s
6:	learn: 0.1639082	total: 2.76s	remaining: 3m 53s
7:	learn: 0.1638624	total: 3.13s	remaining: 3m 51s
8:	learn: 0.1638119	total: 3.52s	remaining: 3m 51s
9:	learn: 0.1637796	total: 3.93s	remaining: 3m 51s
10:	learn: 0.1637334	total: 4.34s	remaining: 3m 52s
11:	learn: 0.1636880	total: 4.73s	remaining: 3m 51s
12:	learn: 0.1636470	total: 5.12s	remaining: 3m 51s
13:	learn: 0.1636083	total: 5.49s	remaining: 3m 49s
14:	learn: 0.1635673	total: 5.86s	remaining: 3m 48s
15:	learn: 0.1635314	total: 6.23s	remaining: 3m 47s
16:	learn: 0.1634968	total: 6.62s	remaining: 3m 47s
17:	learn: 0.1634763	total: 7.11s	remaining: 3m 49s
18:	learn: 0.163447

156:	learn: 0.1622370	total: 1m 15s	remaining: 3m 33s
157:	learn: 0.1622324	total: 1m 16s	remaining: 3m 32s
158:	learn: 0.1622306	total: 1m 16s	remaining: 3m 32s
159:	learn: 0.1622301	total: 1m 17s	remaining: 3m 31s
160:	learn: 0.1622265	total: 1m 17s	remaining: 3m 31s
161:	learn: 0.1622215	total: 1m 17s	remaining: 3m 30s
162:	learn: 0.1622180	total: 1m 18s	remaining: 3m 29s
163:	learn: 0.1622102	total: 1m 18s	remaining: 3m 29s
164:	learn: 0.1621985	total: 1m 19s	remaining: 3m 28s
165:	learn: 0.1621962	total: 1m 19s	remaining: 3m 28s
166:	learn: 0.1621929	total: 1m 20s	remaining: 3m 27s
167:	learn: 0.1621846	total: 1m 20s	remaining: 3m 27s
168:	learn: 0.1621783	total: 1m 20s	remaining: 3m 26s
169:	learn: 0.1621735	total: 1m 21s	remaining: 3m 25s
170:	learn: 0.1621706	total: 1m 21s	remaining: 3m 25s
171:	learn: 0.1621657	total: 1m 22s	remaining: 3m 25s
172:	learn: 0.1621643	total: 1m 22s	remaining: 3m 24s
173:	learn: 0.1621586	total: 1m 23s	remaining: 3m 24s
174:	learn: 0.1621522	total:

309:	learn: 0.1617049	total: 2m 25s	remaining: 2m 16s
310:	learn: 0.1617049	total: 2m 26s	remaining: 2m 15s
311:	learn: 0.1617038	total: 2m 26s	remaining: 2m 15s
312:	learn: 0.1616979	total: 2m 27s	remaining: 2m 15s
313:	learn: 0.1616969	total: 2m 28s	remaining: 2m 15s
314:	learn: 0.1616945	total: 2m 28s	remaining: 2m 14s
315:	learn: 0.1616924	total: 2m 29s	remaining: 2m 14s
316:	learn: 0.1616918	total: 2m 29s	remaining: 2m 13s
317:	learn: 0.1616912	total: 2m 30s	remaining: 2m 13s
318:	learn: 0.1616887	total: 2m 30s	remaining: 2m 12s
319:	learn: 0.1616867	total: 2m 31s	remaining: 2m 12s
320:	learn: 0.1616826	total: 2m 31s	remaining: 2m 11s
321:	learn: 0.1616769	total: 2m 32s	remaining: 2m 11s
322:	learn: 0.1616749	total: 2m 33s	remaining: 2m 11s
323:	learn: 0.1616688	total: 2m 33s	remaining: 2m 10s
324:	learn: 0.1616675	total: 2m 34s	remaining: 2m 10s
325:	learn: 0.1616657	total: 2m 34s	remaining: 2m 10s
326:	learn: 0.1616656	total: 2m 35s	remaining: 2m 9s
327:	learn: 0.1616602	total: 

462:	learn: 0.1612603	total: 3m 38s	remaining: 1m 4s
463:	learn: 0.1612503	total: 3m 38s	remaining: 1m 4s
464:	learn: 0.1612434	total: 3m 38s	remaining: 1m 3s
465:	learn: 0.1612400	total: 3m 39s	remaining: 1m 3s
466:	learn: 0.1612352	total: 3m 39s	remaining: 1m 2s
467:	learn: 0.1612338	total: 3m 40s	remaining: 1m 2s
468:	learn: 0.1612333	total: 3m 40s	remaining: 1m 1s
469:	learn: 0.1612316	total: 3m 41s	remaining: 1m 1s
470:	learn: 0.1612291	total: 3m 41s	remaining: 1m
471:	learn: 0.1612284	total: 3m 42s	remaining: 1m
472:	learn: 0.1612251	total: 3m 42s	remaining: 59.8s
473:	learn: 0.1612244	total: 3m 43s	remaining: 59.3s
474:	learn: 0.1612243	total: 3m 43s	remaining: 58.8s
475:	learn: 0.1612193	total: 3m 44s	remaining: 58.4s
476:	learn: 0.1612180	total: 3m 44s	remaining: 57.9s
477:	learn: 0.1612149	total: 3m 44s	remaining: 57.4s
478:	learn: 0.1612131	total: 3m 45s	remaining: 57s
479:	learn: 0.1612125	total: 3m 46s	remaining: 56.5s
480:	learn: 0.1612097	total: 3m 46s	remaining: 56s
481

19:	learn: 0.1685259	total: 11.2s	remaining: 5m 23s
20:	learn: 0.1684924	total: 11.6s	remaining: 5m 19s
21:	learn: 0.1684551	total: 12s	remaining: 5m 15s
22:	learn: 0.1684229	total: 12.4s	remaining: 5m 11s
23:	learn: 0.1684076	total: 13s	remaining: 5m 13s
24:	learn: 0.1683712	total: 13.6s	remaining: 5m 11s
25:	learn: 0.1683545	total: 14.1s	remaining: 5m 10s
26:	learn: 0.1683370	total: 14.5s	remaining: 5m 8s
27:	learn: 0.1683110	total: 14.9s	remaining: 5m 4s
28:	learn: 0.1682984	total: 15.4s	remaining: 5m 2s
29:	learn: 0.1682664	total: 15.7s	remaining: 4m 58s
30:	learn: 0.1682500	total: 16.4s	remaining: 5m 1s
31:	learn: 0.1682251	total: 17.1s	remaining: 5m 3s
32:	learn: 0.1682166	total: 17.7s	remaining: 5m 3s
33:	learn: 0.1682025	total: 18.2s	remaining: 5m 2s
34:	learn: 0.1681894	total: 18.6s	remaining: 4m 59s
35:	learn: 0.1681694	total: 19s	remaining: 4m 57s
36:	learn: 0.1681540	total: 19.4s	remaining: 4m 55s
37:	learn: 0.1681434	total: 19.8s	remaining: 4m 52s
38:	learn: 0.1681280	tota

177:	learn: 0.1672224	total: 1m 15s	remaining: 2m 59s
178:	learn: 0.1672196	total: 1m 16s	remaining: 2m 59s
179:	learn: 0.1672133	total: 1m 16s	remaining: 2m 58s
180:	learn: 0.1672100	total: 1m 17s	remaining: 2m 58s
181:	learn: 0.1672016	total: 1m 17s	remaining: 2m 57s
182:	learn: 0.1671981	total: 1m 17s	remaining: 2m 57s
183:	learn: 0.1671954	total: 1m 18s	remaining: 2m 57s
184:	learn: 0.1671944	total: 1m 18s	remaining: 2m 56s
185:	learn: 0.1671845	total: 1m 19s	remaining: 2m 56s
186:	learn: 0.1671811	total: 1m 19s	remaining: 2m 55s
187:	learn: 0.1671708	total: 1m 19s	remaining: 2m 55s
188:	learn: 0.1671686	total: 1m 20s	remaining: 2m 54s
189:	learn: 0.1671669	total: 1m 20s	remaining: 2m 54s
190:	learn: 0.1671650	total: 1m 21s	remaining: 2m 53s
191:	learn: 0.1671623	total: 1m 21s	remaining: 2m 53s
192:	learn: 0.1671577	total: 1m 22s	remaining: 2m 52s
193:	learn: 0.1671563	total: 1m 22s	remaining: 2m 52s
194:	learn: 0.1671547	total: 1m 22s	remaining: 2m 52s
195:	learn: 0.1671400	total:

330:	learn: 0.1666212	total: 2m 17s	remaining: 1m 51s
331:	learn: 0.1666199	total: 2m 18s	remaining: 1m 51s
332:	learn: 0.1666185	total: 2m 18s	remaining: 1m 51s
333:	learn: 0.1666150	total: 2m 18s	remaining: 1m 50s
334:	learn: 0.1666116	total: 2m 19s	remaining: 1m 50s
335:	learn: 0.1666093	total: 2m 19s	remaining: 1m 49s
336:	learn: 0.1666077	total: 2m 20s	remaining: 1m 49s
337:	learn: 0.1666067	total: 2m 20s	remaining: 1m 48s
338:	learn: 0.1666058	total: 2m 21s	remaining: 1m 48s
339:	learn: 0.1666042	total: 2m 21s	remaining: 1m 48s
340:	learn: 0.1666036	total: 2m 21s	remaining: 1m 47s
341:	learn: 0.1665973	total: 2m 22s	remaining: 1m 47s
342:	learn: 0.1665852	total: 2m 22s	remaining: 1m 46s
343:	learn: 0.1665813	total: 2m 22s	remaining: 1m 46s
344:	learn: 0.1665759	total: 2m 23s	remaining: 1m 45s
345:	learn: 0.1665730	total: 2m 23s	remaining: 1m 45s
346:	learn: 0.1665710	total: 2m 24s	remaining: 1m 45s
347:	learn: 0.1665666	total: 2m 24s	remaining: 1m 44s
348:	learn: 0.1665588	total:

484:	learn: 0.1661952	total: 3m 24s	remaining: 48.4s
485:	learn: 0.1661951	total: 3m 24s	remaining: 47.9s
486:	learn: 0.1661940	total: 3m 24s	remaining: 47.5s
487:	learn: 0.1661876	total: 3m 25s	remaining: 47.1s
488:	learn: 0.1661837	total: 3m 25s	remaining: 46.7s
489:	learn: 0.1661827	total: 3m 25s	remaining: 46.2s
490:	learn: 0.1661814	total: 3m 26s	remaining: 45.8s
491:	learn: 0.1661805	total: 3m 26s	remaining: 45.4s
492:	learn: 0.1661799	total: 3m 27s	remaining: 45s
493:	learn: 0.1661782	total: 3m 27s	remaining: 44.5s
494:	learn: 0.1661713	total: 3m 27s	remaining: 44.1s
495:	learn: 0.1661702	total: 3m 28s	remaining: 43.7s
496:	learn: 0.1661663	total: 3m 28s	remaining: 43.3s
497:	learn: 0.1661650	total: 3m 29s	remaining: 42.8s
498:	learn: 0.1661540	total: 3m 29s	remaining: 42.4s
499:	learn: 0.1661496	total: 3m 29s	remaining: 42s
500:	learn: 0.1661478	total: 3m 30s	remaining: 41.6s
501:	learn: 0.1661457	total: 3m 30s	remaining: 41.1s
502:	learn: 0.1661437	total: 3m 31s	remaining: 40.

41:	learn: 0.1658598	total: 16.4s	remaining: 3m 38s
42:	learn: 0.1658443	total: 16.8s	remaining: 3m 37s
43:	learn: 0.1658344	total: 17.2s	remaining: 3m 37s
44:	learn: 0.1658210	total: 17.6s	remaining: 3m 36s
45:	learn: 0.1658139	total: 18s	remaining: 3m 36s
46:	learn: 0.1658088	total: 18.4s	remaining: 3m 36s
47:	learn: 0.1658047	total: 18.8s	remaining: 3m 36s
48:	learn: 0.1657935	total: 19.2s	remaining: 3m 36s
49:	learn: 0.1657764	total: 19.5s	remaining: 3m 35s
50:	learn: 0.1657642	total: 19.9s	remaining: 3m 34s
51:	learn: 0.1657569	total: 20.3s	remaining: 3m 33s
52:	learn: 0.1657427	total: 20.7s	remaining: 3m 33s
53:	learn: 0.1657389	total: 21.1s	remaining: 3m 33s
54:	learn: 0.1657262	total: 21.5s	remaining: 3m 33s
55:	learn: 0.1657177	total: 22s	remaining: 3m 33s
56:	learn: 0.1657125	total: 22.6s	remaining: 3m 35s
57:	learn: 0.1657070	total: 23.1s	remaining: 3m 35s
58:	learn: 0.1657028	total: 23.6s	remaining: 3m 35s
59:	learn: 0.1656964	total: 24s	remaining: 3m 36s
60:	learn: 0.16568

197:	learn: 0.1649721	total: 1m 29s	remaining: 3m 2s
198:	learn: 0.1649692	total: 1m 30s	remaining: 3m 1s
199:	learn: 0.1649676	total: 1m 30s	remaining: 3m 1s
200:	learn: 0.1649648	total: 1m 31s	remaining: 3m
201:	learn: 0.1649620	total: 1m 31s	remaining: 3m
202:	learn: 0.1649562	total: 1m 32s	remaining: 3m
203:	learn: 0.1649502	total: 1m 32s	remaining: 2m 59s
204:	learn: 0.1649436	total: 1m 32s	remaining: 2m 59s
205:	learn: 0.1649326	total: 1m 33s	remaining: 2m 58s
206:	learn: 0.1649253	total: 1m 33s	remaining: 2m 58s
207:	learn: 0.1649231	total: 1m 34s	remaining: 2m 57s
208:	learn: 0.1649201	total: 1m 34s	remaining: 2m 57s
209:	learn: 0.1649179	total: 1m 35s	remaining: 2m 57s
210:	learn: 0.1649169	total: 1m 35s	remaining: 2m 56s
211:	learn: 0.1649149	total: 1m 36s	remaining: 2m 56s
212:	learn: 0.1649141	total: 1m 36s	remaining: 2m 55s
213:	learn: 0.1649116	total: 1m 37s	remaining: 2m 55s
214:	learn: 0.1649078	total: 1m 37s	remaining: 2m 54s
215:	learn: 0.1649026	total: 1m 38s	remaini

351:	learn: 0.1644272	total: 2m 36s	remaining: 1m 50s
352:	learn: 0.1644257	total: 2m 36s	remaining: 1m 49s
353:	learn: 0.1644252	total: 2m 37s	remaining: 1m 49s
354:	learn: 0.1644232	total: 2m 37s	remaining: 1m 48s
355:	learn: 0.1644214	total: 2m 38s	remaining: 1m 48s
356:	learn: 0.1644194	total: 2m 38s	remaining: 1m 47s
357:	learn: 0.1644166	total: 2m 38s	remaining: 1m 47s
358:	learn: 0.1644161	total: 2m 39s	remaining: 1m 46s
359:	learn: 0.1644128	total: 2m 39s	remaining: 1m 46s
360:	learn: 0.1644109	total: 2m 40s	remaining: 1m 45s
361:	learn: 0.1644093	total: 2m 40s	remaining: 1m 45s
362:	learn: 0.1644025	total: 2m 40s	remaining: 1m 45s
363:	learn: 0.1644022	total: 2m 41s	remaining: 1m 44s
364:	learn: 0.1644002	total: 2m 41s	remaining: 1m 44s
365:	learn: 0.1643974	total: 2m 42s	remaining: 1m 43s
366:	learn: 0.1643936	total: 2m 42s	remaining: 1m 43s
367:	learn: 0.1643895	total: 2m 42s	remaining: 1m 42s
368:	learn: 0.1643882	total: 2m 43s	remaining: 1m 42s
369:	learn: 0.1643825	total:

505:	learn: 0.1640021	total: 3m 42s	remaining: 41.3s
506:	learn: 0.1640008	total: 3m 42s	remaining: 40.8s
507:	learn: 0.1639987	total: 3m 43s	remaining: 40.4s
508:	learn: 0.1639956	total: 3m 43s	remaining: 40s
509:	learn: 0.1639950	total: 3m 43s	remaining: 39.5s
510:	learn: 0.1639928	total: 3m 44s	remaining: 39.1s
511:	learn: 0.1639905	total: 3m 44s	remaining: 38.6s
512:	learn: 0.1639903	total: 3m 45s	remaining: 38.2s
513:	learn: 0.1639883	total: 3m 45s	remaining: 37.8s
514:	learn: 0.1639800	total: 3m 46s	remaining: 37.3s
515:	learn: 0.1639785	total: 3m 46s	remaining: 36.9s
516:	learn: 0.1639738	total: 3m 46s	remaining: 36.4s
517:	learn: 0.1639719	total: 3m 47s	remaining: 36s
518:	learn: 0.1639684	total: 3m 47s	remaining: 35.5s
519:	learn: 0.1639674	total: 3m 48s	remaining: 35.1s
520:	learn: 0.1639659	total: 3m 48s	remaining: 34.7s
521:	learn: 0.1639596	total: 3m 49s	remaining: 34.2s
522:	learn: 0.1639580	total: 3m 49s	remaining: 33.8s
523:	learn: 0.1639575	total: 3m 50s	remaining: 33.

63:	learn: 0.1662733	total: 26.3s	remaining: 3m 39s
64:	learn: 0.1662667	total: 26.6s	remaining: 3m 38s
65:	learn: 0.1662561	total: 27s	remaining: 3m 38s
66:	learn: 0.1662455	total: 27.4s	remaining: 3m 37s
67:	learn: 0.1662325	total: 27.8s	remaining: 3m 37s
68:	learn: 0.1662279	total: 28.2s	remaining: 3m 36s
69:	learn: 0.1662186	total: 28.5s	remaining: 3m 35s
70:	learn: 0.1662083	total: 28.9s	remaining: 3m 35s
71:	learn: 0.1662062	total: 29.3s	remaining: 3m 35s
72:	learn: 0.1661955	total: 29.7s	remaining: 3m 34s
73:	learn: 0.1661849	total: 30.1s	remaining: 3m 33s
74:	learn: 0.1661802	total: 30.5s	remaining: 3m 33s
75:	learn: 0.1661712	total: 30.8s	remaining: 3m 32s
76:	learn: 0.1661646	total: 31.2s	remaining: 3m 31s
77:	learn: 0.1661614	total: 31.6s	remaining: 3m 31s
78:	learn: 0.1661556	total: 32s	remaining: 3m 30s
79:	learn: 0.1661501	total: 32.4s	remaining: 3m 30s
80:	learn: 0.1661482	total: 32.8s	remaining: 3m 30s
81:	learn: 0.1661388	total: 33.2s	remaining: 3m 29s
82:	learn: 0.166

219:	learn: 0.1653842	total: 1m 31s	remaining: 2m 38s
220:	learn: 0.1653757	total: 1m 32s	remaining: 2m 38s
221:	learn: 0.1653670	total: 1m 32s	remaining: 2m 37s
222:	learn: 0.1653620	total: 1m 33s	remaining: 2m 37s
223:	learn: 0.1653523	total: 1m 33s	remaining: 2m 36s
224:	learn: 0.1653405	total: 1m 33s	remaining: 2m 36s
225:	learn: 0.1653306	total: 1m 34s	remaining: 2m 36s
226:	learn: 0.1653272	total: 1m 34s	remaining: 2m 35s
227:	learn: 0.1653221	total: 1m 35s	remaining: 2m 35s
228:	learn: 0.1653178	total: 1m 35s	remaining: 2m 34s
229:	learn: 0.1653079	total: 1m 35s	remaining: 2m 34s
230:	learn: 0.1652975	total: 1m 36s	remaining: 2m 33s
231:	learn: 0.1652937	total: 1m 36s	remaining: 2m 33s
232:	learn: 0.1652923	total: 1m 37s	remaining: 2m 32s
233:	learn: 0.1652911	total: 1m 37s	remaining: 2m 32s
234:	learn: 0.1652856	total: 1m 37s	remaining: 2m 31s
235:	learn: 0.1652796	total: 1m 38s	remaining: 2m 31s
236:	learn: 0.1652758	total: 1m 38s	remaining: 2m 31s
237:	learn: 0.1652744	total:

372:	learn: 0.1647066	total: 2m 39s	remaining: 1m 36s
373:	learn: 0.1647020	total: 2m 39s	remaining: 1m 36s
374:	learn: 0.1647008	total: 2m 40s	remaining: 1m 36s
375:	learn: 0.1646982	total: 2m 40s	remaining: 1m 35s
376:	learn: 0.1646975	total: 2m 41s	remaining: 1m 35s
377:	learn: 0.1646900	total: 2m 41s	remaining: 1m 35s
378:	learn: 0.1646887	total: 2m 42s	remaining: 1m 34s
379:	learn: 0.1646876	total: 2m 42s	remaining: 1m 34s
380:	learn: 0.1646849	total: 2m 43s	remaining: 1m 33s
381:	learn: 0.1646807	total: 2m 43s	remaining: 1m 33s
382:	learn: 0.1646801	total: 2m 44s	remaining: 1m 32s
383:	learn: 0.1646787	total: 2m 44s	remaining: 1m 32s
384:	learn: 0.1646745	total: 2m 45s	remaining: 1m 32s
385:	learn: 0.1646720	total: 2m 45s	remaining: 1m 31s
386:	learn: 0.1646662	total: 2m 45s	remaining: 1m 31s
387:	learn: 0.1646658	total: 2m 46s	remaining: 1m 30s
388:	learn: 0.1646647	total: 2m 46s	remaining: 1m 30s
389:	learn: 0.1646637	total: 2m 47s	remaining: 1m 29s
390:	learn: 0.1646619	total:

527:	learn: 0.1641858	total: 3m 52s	remaining: 31.8s
528:	learn: 0.1641842	total: 3m 53s	remaining: 31.3s
529:	learn: 0.1641740	total: 3m 53s	remaining: 30.9s
530:	learn: 0.1641700	total: 3m 54s	remaining: 30.4s
531:	learn: 0.1641691	total: 3m 54s	remaining: 30s
532:	learn: 0.1641681	total: 3m 55s	remaining: 29.6s
533:	learn: 0.1641632	total: 3m 55s	remaining: 29.1s
534:	learn: 0.1641570	total: 3m 56s	remaining: 28.7s
535:	learn: 0.1641475	total: 3m 56s	remaining: 28.3s
536:	learn: 0.1641468	total: 3m 57s	remaining: 27.8s
537:	learn: 0.1641462	total: 3m 57s	remaining: 27.4s
538:	learn: 0.1641460	total: 3m 58s	remaining: 27s
539:	learn: 0.1641438	total: 3m 58s	remaining: 26.5s
540:	learn: 0.1641424	total: 3m 59s	remaining: 26.1s
541:	learn: 0.1641396	total: 3m 59s	remaining: 25.6s
542:	learn: 0.1641388	total: 4m	remaining: 25.2s
543:	learn: 0.1641363	total: 4m	remaining: 24.8s
544:	learn: 0.1641334	total: 4m 1s	remaining: 24.3s
545:	learn: 0.1641308	total: 4m 1s	remaining: 23.9s
546:	le

85:	learn: 0.1609509	total: 38.2s	remaining: 3m 48s
86:	learn: 0.1609347	total: 38.6s	remaining: 3m 47s
87:	learn: 0.1609264	total: 39s	remaining: 3m 46s
88:	learn: 0.1609220	total: 39.4s	remaining: 3m 46s
89:	learn: 0.1609133	total: 39.9s	remaining: 3m 45s
90:	learn: 0.1609014	total: 40.2s	remaining: 3m 44s
91:	learn: 0.1608966	total: 40.6s	remaining: 3m 44s
92:	learn: 0.1608939	total: 41s	remaining: 3m 43s
93:	learn: 0.1608933	total: 41.4s	remaining: 3m 42s
94:	learn: 0.1608853	total: 41.8s	remaining: 3m 42s
95:	learn: 0.1608751	total: 42.2s	remaining: 3m 41s
96:	learn: 0.1608673	total: 42.8s	remaining: 3m 41s
97:	learn: 0.1608657	total: 43.6s	remaining: 3m 43s
98:	learn: 0.1608510	total: 44.2s	remaining: 3m 43s
99:	learn: 0.1608479	total: 45s	remaining: 3m 44s
100:	learn: 0.1608455	total: 45.7s	remaining: 3m 45s
101:	learn: 0.1608372	total: 46.4s	remaining: 3m 46s
102:	learn: 0.1608331	total: 47.2s	remaining: 3m 47s
103:	learn: 0.1608184	total: 47.8s	remaining: 3m 47s
104:	learn: 0.

239:	learn: 0.1601741	total: 2m 2s	remaining: 3m 4s
240:	learn: 0.1601725	total: 2m 3s	remaining: 3m 3s
241:	learn: 0.1601702	total: 2m 3s	remaining: 3m 3s
242:	learn: 0.1601635	total: 2m 4s	remaining: 3m 2s
243:	learn: 0.1601609	total: 2m 4s	remaining: 3m 2s
244:	learn: 0.1601570	total: 2m 5s	remaining: 3m 1s
245:	learn: 0.1601514	total: 2m 5s	remaining: 3m 1s
246:	learn: 0.1601416	total: 2m 6s	remaining: 3m
247:	learn: 0.1601375	total: 2m 6s	remaining: 3m
248:	learn: 0.1601360	total: 2m 7s	remaining: 2m 59s
249:	learn: 0.1601350	total: 2m 7s	remaining: 2m 58s
250:	learn: 0.1601296	total: 2m 8s	remaining: 2m 58s
251:	learn: 0.1601268	total: 2m 8s	remaining: 2m 57s
252:	learn: 0.1601224	total: 2m 9s	remaining: 2m 56s
253:	learn: 0.1601132	total: 2m 9s	remaining: 2m 56s
254:	learn: 0.1601125	total: 2m 9s	remaining: 2m 55s
255:	learn: 0.1601082	total: 2m 10s	remaining: 2m 55s
256:	learn: 0.1601056	total: 2m 10s	remaining: 2m 54s
257:	learn: 0.1601018	total: 2m 11s	remaining: 2m 53s
258:	

393:	learn: 0.1595954	total: 3m 6s	remaining: 1m 37s
394:	learn: 0.1595924	total: 3m 6s	remaining: 1m 37s
395:	learn: 0.1595830	total: 3m 7s	remaining: 1m 36s
396:	learn: 0.1595773	total: 3m 7s	remaining: 1m 36s
397:	learn: 0.1595756	total: 3m 8s	remaining: 1m 35s
398:	learn: 0.1595679	total: 3m 8s	remaining: 1m 34s
399:	learn: 0.1595666	total: 3m 8s	remaining: 1m 34s
400:	learn: 0.1595591	total: 3m 9s	remaining: 1m 33s
401:	learn: 0.1595573	total: 3m 9s	remaining: 1m 33s
402:	learn: 0.1595507	total: 3m 10s	remaining: 1m 32s


In [79]:
grid_result.grid_scores_



[mean: -0.06907, std: 0.00183, params: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': -1, 'min_child_samples': 100, 'min_child_weight': 0.001, 'min_split_gain': 0, 'n_eatimators': 512, 'num_leaves': 140, 'ramdon_state': 21, 'reg_alpha': 0.01, 'reg_lambda': 100, 'subsample': 0.8, 'subsample_for_bin': 50000, 'subsample_freq': 1}]

### Learning Curve

In [51]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve



In [50]:
def plot_learning_curve(estimator, X, y, ylim=None, cv=None, train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    
    train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=cv, train_sizes=train_sizes, scoring = 'neg_mean_absolute_error',verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"number of samples")
        plt.ylabel(u"score")
        plt.gca().invert_yaxis()
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"training set score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"validation set score")

        plt.legend(loc="best")

        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

In [52]:
bst = lgb.LGBMRegressor(n_jobs = -1, random_state = 42,silent = False,
                            learning_rate = 0.014,
                            n_estimators = 512,
                            num_leaves = 140,
                            reg_alpha = 0.01,
                            reg_lambda = 100,
                            min_split_gain = 0,
                            min_child_samples = 100,
                            subsample = 0.8,
                            subsample_for_bin = 50000,
                            colsample_bytree = 0.8,
                            subsample_freq = 1,
                            max_depth = -1,
                            min_child_weight = 0.001
                        
                           )
start = time.time()
_,_ = plot_learning_curve(bst, new_X_train, y_train)
during_time = time.time() - start
print(during_time)

KeyboardInterrupt: 

### Testing

In [56]:
# read in test
test_data2016 = pd.read_csv(test_datapath_2016, index_col=0)
test_data2017 = pd.read_csv(test_datapath_2017, index_col=0)

print("[2016] num of instances: ", test_data2016.shape[0])
print("[2017] num of instances: ", test_data2017.shape[0])

# save parcelid for merge 
test_parcelid16 = test_data2016['parcelid']
test_parcelid17 = test_data2017['parcelid']

# drop parcelid col
test_data2016 = test_data2016.drop('parcelid', axis=1)
test_data2017 = test_data2017.drop('parcelid', axis=1)

# generate transaction date
test_year = np.repeat(2016, test_data2016.shape[0])
test_quarter = np.repeat(4, test_data2016.shape[0])

test_month10_16 = np.repeat(10, test_data2016.shape[0])
test_month11_16 = np.repeat(11, test_data2016.shape[0])
test_month12_16 = np.repeat(12, test_data2016.shape[0])

test_month10_17 = np.repeat(22, test_data2017.shape[0])
test_month11_17 = np.repeat(23, test_data2017.shape[0])
test_month12_17 = np.repeat(24, test_data2017.shape[0])

# get new test data with transaction date
X_test10_16 = gen_testdata(test_data2016, test_year, test_month10_16,test_quarter)
X_test11_16 = gen_testdata(test_data2016, test_year, test_month11_16,test_quarter)
X_test12_16 = gen_testdata(test_data2016, test_year, test_month12_16,test_quarter)

X_test10_17 = gen_testdata(test_data2017, test_year, test_month10_17,test_quarter)
X_test11_17 = gen_testdata(test_data2017, test_year, test_month11_17,test_quarter)
X_test12_17 = gen_testdata(test_data2017, test_year, test_month12_17,test_quarter)

  mask |= (ar1 == a)


[2016] num of instances:  2985217
[2017] num of instances:  2985217


In [57]:
X_test10_16 = X_test10_16[train_feature]
X_test11_16 = X_test11_16[train_feature]
X_test12_16 = X_test12_16[train_feature]
X_test10_17 = X_test10_17[train_feature]
X_test11_17 = X_test11_17[train_feature]
X_test12_17 = X_test12_17[train_feature]

In [63]:
def train_and_test(X_train,y_train,X_test):
    start = time.time()
    bst = lgb.LGBMRegressor(n_jobs = -1, random_state = 42,silent = False,
                            learning_rate = 0.014,
                            n_estimators = 512,
                            num_leaves = 140,
                            reg_alpha = 0.01,
                            reg_lambda = 100,
                            min_split_gain = 0,
                            min_child_samples = 100,
                            subsample = 0.8,
                            subsample_for_bin = 50000,
                            colsample_bytree = 0.8,
                            subsample_freq = 1,
                            max_depth  = -1,
                            min_child_weight = 0.001
                           )
    bst.fit(X_train, y_train)
    y_pred = bst.predict(X_test)
    print('time using:%.2f'%(time.time()-start))
    
    return y_pred

In [64]:
y_pred10_16 = train_and_test(new_X_train, y_train, X_test10_16.values)
y_pred11_16 = train_and_test(new_X_train, y_train, X_test11_16.values)
y_pred12_16 = train_and_test(new_X_train, y_train, X_test12_16.values)
y_pred10_17 = train_and_test(new_X_train, y_train, X_test10_17.values)
y_pred11_17 = train_and_test(new_X_train, y_train, X_test11_17.values)
y_pred12_17 = train_and_test(new_X_train, y_train, X_test12_17.values)

time using:222.55
time using:196.50
time using:217.11
time using:168.77
time using:187.18
time using:185.88


In [65]:
# merged on Parcelid for predicted result on test data

test_dict_16 = {'Parcelid': test_parcelid16, '201610': y_pred10_16, '201611': y_pred11_16, '201612': y_pred12_16}
test_dict_17 = {'Parcelid': test_parcelid17, '201710': y_pred10_17, '201711': y_pred11_17, '201712': y_pred12_17}

df_test_16 = pd.DataFrame(data=test_dict_16)
df_test_17 = pd.DataFrame(data=test_dict_17)

df_merged = df_test_16.merge(df_test_17, left_on='Parcelid', right_on='Parcelid', how='outer')
print(df_merged)

# handle submission file
submitfile = "./zillow data/sample_submission.csv"

submit_df = pd.read_csv(submitfile)
print(submit_df.shape[0], submit_df.shape[1])

assert(submit_df.shape[0] == df_merged.shape[0]), "Error: invalid row size for submit!"
df_merged.to_csv("./zillow data/new_submission1.csv", index=False)

          Parcelid    201610    201611    201612    201710    201711    201712
0         10754147  0.028049  0.028431  0.029355  0.049639  0.049639  0.049639
1         10759547 -0.004185 -0.003239 -0.001811  0.013211  0.013211  0.013211
2         10843547  0.049635  0.049635  0.049635  0.103998  0.103998  0.103998
3         10859147  0.092716  0.091380  0.091508  0.127437  0.127437  0.127437
4         10879947  0.027588  0.027588  0.028027  0.054558  0.054558  0.054558
5         10898347  0.032808  0.032840  0.032611  0.048364  0.048364  0.048364
6         10933547 -0.012797 -0.013247 -0.013965  0.025217  0.025217  0.025217
7         10940747  0.038732  0.038614  0.040091  0.058608  0.058608  0.058608
8         10954547  0.043967  0.044350  0.045274  0.064318  0.064318  0.064318
9         10976347 -0.000247 -0.000357  0.001120  0.029871  0.029871  0.029871
10        11073947  0.003286  0.003830  0.004527  0.025780  0.025780  0.025780
11        11114347  0.018199  0.018010  0.018200  0.

In [131]:
df_merged.to_csv("./zillow data/new_draft_submission1.csv", index=False)