In [2]:
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.feature_selection import VarianceThreshold

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import time
import datetime as dt
import gc

In [3]:
train_datapath_2016 = "./zillow data/merged_2016 v4.csv"
train_datapath_2017 = "./zillow data/merged_2017 v4.csv"

test_datapath_2016 = "./zillow data/properties_2016 v4.csv"
test_datapath_2017 = "./zillow data/properties_2017 v4.csv"

In [55]:
# modify 'transactiondate' of df to keep only the month
def parse_transactiondate(df):
    # modify transactiondate col to keep only the month
    transactiondate_year = []
    transactiondate_month = [] 
    transactiondate_quarter = []
    for data in df['transactiondate']:
        temp = data.split("-")
        year = int(temp[0])
        transactiondate_year.append(year) # keep only month
        month = int(temp[1])
        month = (year - 2016)*12 + month
        transactiondate_month.append(month)
        quarter = (year-2016)*4 + int((month-1)/3) + 1
        transactiondate_quarter.append(quarter)
        
    transactiondate_year = np.array(transactiondate_year)
    transactiondate_year = transactiondate_year.reshape(transactiondate_year.shape[0], 1)
    transactiondate_month = np.array(transactiondate_month)
    transactiondate_month = transactiondate_month.reshape(transactiondate_month.shape[0], 1)
    transactiondate_quarter = np.array(transactiondate_quarter)
    transactiondate_quarter = transactiondate_month.reshape(transactiondate_quarter.shape[0], 1)

    df['transactiondate_year'] = transactiondate_year
    df['transactiondate_month'] = transactiondate_month
    df['transactiondate_quarter'] = transactiondate_quarter
    df = df.drop('transactiondate',axis = 1)
    return df

# return label index, feature index list
# assumes that df label_is_first
def split_on_label(df):
    return df.columns[0], df.columns[1:]

def gen_testdata(df, transactiondate_year,transactiondate_month,transactiondate_quarter):
    df_copy = df.copy()
    # df_copy['transactiondate'] = new_transactiondate
    df_copy.insert(0, 'transactiondate_year', transactiondate_year)
    df_copy.insert(0, 'transactiondate_month', transactiondate_month)
    df_copy.insert(0, 'transactiondate_quarter', transactiondate_quarter)
    return df_copy

def get_low_var_feature(support_list):
    low_var_feature_index = []
    support_feature_index = []
    for i in range(len(support_list)):
        if support_list[i]:
            support_feature_index.append(i)
        else:
            low_var_feature_index.append(i)
    return low_var_feature_index, support_feature_index


def handle_low_var(sel, train_data, feature_list):
    temp_sel = sel.fit(train_data)
    low_var_list, support_list = get_low_var_feature(sel.get_support())

    return low_var_list


# Removing Features with more than 98% missing data
def removing_missing(X_train, missing_threshold = 0.95):
    exclude_missing = []
    num_rows = X_train.shape[0]
    for col in X_train.columns:
        num_missing = X_train[col].isna().sum()
        if num_missing == 0:
            continue
        missing_fraction = num_missing/float(num_rows)
        if missing_fraction > missing_threshold:
            exclude_missing.append(col)
            
    return exclude_missing

# removing features with unique value
def removing_unique(X_train):
    exclude_unique = []
    for col in X_train.columns:
        num_unique = len(X_train[col].unique())
        if X_train[col].isna().sum()!=0 and num_unique == 1:
            exclude_unique.append(col)
            
    return exclude_unique

def print_list(li,list_name = ''):
    print('%s (%d) :' %(list_name,len(li)))
    for i in li:
        print(i)
        
def get_train_features(X_train,el1 = [], el2 = [], el3 = [],el4 = []):
    train_feature = []
    for col in X_train:
        if col not in el1 and col not in el2 and col not in el3 and col not in el4:
            train_feature.append(col)
    return train_feature

# geting categorical features
def get_cat_feature(X_train, cat_threshold):
    cat_feature_inds = []
    cat_feature = []
    for i,col in enumerate(X_train):
        num_uniques = len(X_train[col].unique())
        if num_uniques < cat_threshold and not 'sqft' in col and not 'cnt' in col \
        and not 'nbr' in col and not 'number' in col:
            cat_feature_inds.append(i)
            cat_feature.append(col)
    return cat_feature_inds,cat_feature

In [72]:
# read training data
train_data16 = pd.read_csv(train_datapath_2016, index_col=0)
train_data17 = pd.read_csv(train_datapath_2017, index_col=0)

# drop parcelid (index of properties features)
train_data16 = train_data16.drop('parcelid', axis=1)
train_data17 = train_data17.drop('parcelid', axis=1)

# modify 'transactiondate' to keep only the month
train_data16 = parse_transactiondate(train_data16)
train_data17 = parse_transactiondate(train_data17)

train_data = pd.concat([train_data16,train_data17],axis = 0)

label, feature = split_on_label(train_data)

# training data and label for model 2016
X_train = train_data[feature]

y_train = train_data[label]

del train_data16, train_data17
gc.collect()

print("[2016] num of features:", len(feature))
print("[2016] num of instances:", train_data.shape[0])

[2016] num of features: 60
[2016] num of instances: 167888


In [73]:
# Removing Freatures with too many missing data
exclude_missing = removing_missing(X_train)

# Removing Features with unique values
exclude_unique = removing_unique(X_train)

# Removing Features with low variance
# it should remove features with many missing values that were filled by imputation
#sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

#exclude_low_var16 = handle_low_var(sel, X_train16, feature16)
#exclude_low_var17 = handle_low_var(sel, X_train17, feature17)

In [77]:
X_train.fillna(-999,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [74]:
print_list(exclude_missing,'exclude_missing')
#print_list(exclude_low_var16,'exclude_low_var16')
print_list(exclude_unique,'exclude_unique')

exclude_missing (18) :
architecturalstyletypeid
basementsqft
buildingclasstypeid
decktypeid
finishedsquarefeet13
finishedsquarefeet15
finishedsquarefeet6
hashottuborspa
poolsizesum
pooltypeid10
pooltypeid2
storytypeid
typeconstructiontypeid
yardbuildingsqft17
yardbuildingsqft26
fireplaceflag
taxdelinquencyflag
taxdelinquencyyear
exclude_unique (0) :


In [75]:
train_feature = get_train_features(X_train, el1 = exclude_missing, 
                                                #el2 = exclude_low_var16,
                                                el3 = exclude_unique)

new_X_train = X_train[train_feature]

print("\n[2016] num of features:", new_X_train.shape[1])
print_list(train_feature,"train_feature16")


[2016] num of features: 42
train_feature16 (42) :
airconditioningtypeid
bathroomcnt
bedroomcnt
buildingqualitytypeid
calculatedbathnbr
finishedfloor1squarefeet
calculatedfinishedsquarefeet
finishedsquarefeet12
finishedsquarefeet50
fips
fireplacecnt
fullbathcnt
garagecarcnt
garagetotalsqft
heatingorsystemtypeid
latitude
longitude
lotsizesquarefeet
poolcnt
pooltypeid7
propertycountylandusecode
propertylandusetypeid
propertyzoningdesc
rawcensustractandblock
regionidcity
regionidcounty
regionidneighborhood
regionidzip
roomcnt
threequarterbathnbr
unitcnt
yearbuilt
numberofstories
structuretaxvaluedollarcnt
taxvaluedollarcnt
assessmentyear
landtaxvaluedollarcnt
taxamount
censustractandblock
transactiondate_year
transactiondate_month
transactiondate_quarter


In [76]:
cat_feature_inds, cat_feature = get_cat_feature(new_X_train, cat_threshold = 1000)
print_list(cat_feature,'cat_feature')

cat_feature (16) :
airconditioningtypeid
buildingqualitytypeid
fips
heatingorsystemtypeid
pooltypeid7
propertycountylandusecode
propertylandusetypeid
regionidcity
regionidcounty
regionidneighborhood
regionidzip
yearbuilt
assessmentyear
transactiondate_year
transactiondate_month
transactiondate_quarter


### Feauture Selection

In [None]:
from sklearn.model_selection import RandomSearchCV
def handle_lightgbm_RCV(X, y, param_dict,N):
    
    # cross-validate on alpha (regularization strenght) from alphalist
    
    gbm = lgb.LGBMRegressor(n_jobs = -1, random_state = 42,silent = False)
       
    # cross-validate on random search CV
    random_search = RandomSearchCV(gbm,param_distributions=param_dict,
                                 n_iter = N,random_state = 21,
                                 cv=5, scoring='neg_mean_absolute_error',verbose = True)
    
    start = time.time()
    grid_search.fit(X, y)
    print("RandomSearchCV took %.2f seconds"
          " parameter settings." % (time.time() - start))

    return random_search

In [30]:
def handle_lightgbm_GridCV(X, y, param_dict):
    
    # cross-validate on alpha (regularization strenght) from alphalist
    
    gbm = lgb.LGBMRegressor(n_jobs = -1, random_state = 42,silent = False)
       
    # cross-validate on random search CV
    grid_search = GridSearchCV(gbm,param_grid=param_dict,
                               cv=5, scoring='neg_mean_absolute_error',verbose = True)
    
    start = time.time()
    grid_search.fit(X, y)
    print("GridSearchCV took %.2f seconds"
          " parameter settings." % (time.time() - start))

    return grid_search

In [80]:
params_dict = {
    "learning_rate":[0.014],
    'num_leaves':[140],
    'max_depth':[-1],
    'reg_alpha':[0.01],
    'reg_lambda':[100],
    'min_split_gain':[0],
    'min_child_samples':[100],
    'colsample_bytree':[0.8],
    
    'min_child_weight':[0.001],
    'subsample':[0.8],
    'subsample_for_bin':[50000],
    'subsample_freq':[1], 
    'n_eatimators':[512],
    'ramdon_state':[21]
}
grid_result = handle_lightgbm_GridCV(new_X_train, y_train, params_dict)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 15.9min finished


GridSearchCV took 961.70 seconds parameter settings.


In [81]:
grid_result.grid_scores_



[mean: -0.06937, std: 0.00190, params: {'colsample_bytree': 0.8, 'learning_rate': 0.001, 'max_depth': -1, 'min_child_samples': 100, 'min_child_weight': 0.001, 'min_split_gain': 0, 'n_eatimators': 512, 'num_leaves': 140, 'ramdon_state': 21, 'reg_alpha': 0.01, 'reg_lambda': 100, 'subsample': 0.8, 'subsample_for_bin': 50000, 'subsample_freq': 1},
 mean: -0.06932, std: 0.00189, params: {'colsample_bytree': 0.8, 'learning_rate': 0.0014384498882876629, 'max_depth': -1, 'min_child_samples': 100, 'min_child_weight': 0.001, 'min_split_gain': 0, 'n_eatimators': 512, 'num_leaves': 140, 'ramdon_state': 21, 'reg_alpha': 0.01, 'reg_lambda': 100, 'subsample': 0.8, 'subsample_for_bin': 50000, 'subsample_freq': 1},
 mean: -0.06926, std: 0.00188, params: {'colsample_bytree': 0.8, 'learning_rate': 0.00206913808111479, 'max_depth': -1, 'min_child_samples': 100, 'min_child_weight': 0.001, 'min_split_gain': 0, 'n_eatimators': 512, 'num_leaves': 140, 'ramdon_state': 21, 'reg_alpha': 0.01, 'reg_lambda': 100, 

### Learning Curve

In [51]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve



In [50]:
def plot_learning_curve(estimator, X, y, ylim=None, cv=None, train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    
    train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=cv, train_sizes=train_sizes, scoring = 'neg_mean_absolute_error',verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"number of samples")
        plt.ylabel(u"score")
        plt.gca().invert_yaxis()
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"training set score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"validation set score")

        plt.legend(loc="best")

        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

In [52]:
bst = lgb.LGBMRegressor(n_jobs = -1, random_state = 42,silent = False,
                            learning_rate = 0.014,
                            n_estimators = 512,
                            num_leaves = 140,
                            reg_alpha = 0.01,
                            reg_lambda = 100,
                            min_split_gain = 0,
                            min_child_samples = 100,
                            subsample = 0.8,
                            subsample_for_bin = 50000,
                            colsample_bytree = 0.8,
                            subsample_freq = 1,
                            max_depth = -1,
                            min_child_weight = 0.001
                        
                           )
start = time.time()
_,_ = plot_learning_curve(bst, new_X_train, y_train)
during_time = time.time() - start
print(during_time)

KeyboardInterrupt: 

### Testing

In [56]:
# read in test
test_data2016 = pd.read_csv(test_datapath_2016, index_col=0)
test_data2017 = pd.read_csv(test_datapath_2017, index_col=0)

print("[2016] num of instances: ", test_data2016.shape[0])
print("[2017] num of instances: ", test_data2017.shape[0])

# save parcelid for merge 
test_parcelid16 = test_data2016['parcelid']
test_parcelid17 = test_data2017['parcelid']

# drop parcelid col
test_data2016 = test_data2016.drop('parcelid', axis=1)
test_data2017 = test_data2017.drop('parcelid', axis=1)

# generate transaction date
test_year = np.repeat(2016, test_data2016.shape[0])
test_quarter = np.repeat(4, test_data2016.shape[0])

test_month10_16 = np.repeat(10, test_data2016.shape[0])
test_month11_16 = np.repeat(11, test_data2016.shape[0])
test_month12_16 = np.repeat(12, test_data2016.shape[0])

test_month10_17 = np.repeat(22, test_data2017.shape[0])
test_month11_17 = np.repeat(23, test_data2017.shape[0])
test_month12_17 = np.repeat(24, test_data2017.shape[0])

# get new test data with transaction date
X_test10_16 = gen_testdata(test_data2016, test_year, test_month10_16,test_quarter)
X_test11_16 = gen_testdata(test_data2016, test_year, test_month11_16,test_quarter)
X_test12_16 = gen_testdata(test_data2016, test_year, test_month12_16,test_quarter)

X_test10_17 = gen_testdata(test_data2017, test_year, test_month10_17,test_quarter)
X_test11_17 = gen_testdata(test_data2017, test_year, test_month11_17,test_quarter)
X_test12_17 = gen_testdata(test_data2017, test_year, test_month12_17,test_quarter)

  mask |= (ar1 == a)


[2016] num of instances:  2985217
[2017] num of instances:  2985217


In [57]:
X_test10_16 = X_test10_16[train_feature]
X_test11_16 = X_test11_16[train_feature]
X_test12_16 = X_test12_16[train_feature]
X_test10_17 = X_test10_17[train_feature]
X_test11_17 = X_test11_17[train_feature]
X_test12_17 = X_test12_17[train_feature]

In [63]:
def train_and_test(X_train,y_train,X_test):
    start = time.time()
    bst = lgb.LGBMRegressor(n_jobs = -1, random_state = 42,silent = False,
                            learning_rate = 0.014,
                            n_estimators = 512,
                            num_leaves = 140,
                            reg_alpha = 0.01,
                            reg_lambda = 100,
                            min_split_gain = 0,
                            min_child_samples = 100,
                            subsample = 0.8,
                            subsample_for_bin = 50000,
                            colsample_bytree = 0.8,
                            subsample_freq = 1,
                            max_depth  = -1,
                            min_child_weight = 0.001
                           )
    bst.fit(X_train, y_train)
    y_pred = bst.predict(X_test)
    print('time using:%.2f'%(time.time()-start))
    
    return y_pred

In [64]:
y_pred10_16 = train_and_test(new_X_train, y_train, X_test10_16.values)
y_pred11_16 = train_and_test(new_X_train, y_train, X_test11_16.values)
y_pred12_16 = train_and_test(new_X_train, y_train, X_test12_16.values)
y_pred10_17 = train_and_test(new_X_train, y_train, X_test10_17.values)
y_pred11_17 = train_and_test(new_X_train, y_train, X_test11_17.values)
y_pred12_17 = train_and_test(new_X_train, y_train, X_test12_17.values)

time using:222.55
time using:196.50
time using:217.11
time using:168.77
time using:187.18
time using:185.88


In [65]:
# merged on Parcelid for predicted result on test data

test_dict_16 = {'Parcelid': test_parcelid16, '201610': y_pred10_16, '201611': y_pred11_16, '201612': y_pred12_16}
test_dict_17 = {'Parcelid': test_parcelid17, '201710': y_pred10_17, '201711': y_pred11_17, '201712': y_pred12_17}

df_test_16 = pd.DataFrame(data=test_dict_16)
df_test_17 = pd.DataFrame(data=test_dict_17)

df_merged = df_test_16.merge(df_test_17, left_on='Parcelid', right_on='Parcelid', how='outer')
print(df_merged)

# handle submission file
submitfile = "./zillow data/sample_submission.csv"

submit_df = pd.read_csv(submitfile)
print(submit_df.shape[0], submit_df.shape[1])

assert(submit_df.shape[0] == df_merged.shape[0]), "Error: invalid row size for submit!"
df_merged.to_csv("./zillow data/new_submission1.csv", index=False)

          Parcelid    201610    201611    201612    201710    201711    201712
0         10754147  0.028049  0.028431  0.029355  0.049639  0.049639  0.049639
1         10759547 -0.004185 -0.003239 -0.001811  0.013211  0.013211  0.013211
2         10843547  0.049635  0.049635  0.049635  0.103998  0.103998  0.103998
3         10859147  0.092716  0.091380  0.091508  0.127437  0.127437  0.127437
4         10879947  0.027588  0.027588  0.028027  0.054558  0.054558  0.054558
5         10898347  0.032808  0.032840  0.032611  0.048364  0.048364  0.048364
6         10933547 -0.012797 -0.013247 -0.013965  0.025217  0.025217  0.025217
7         10940747  0.038732  0.038614  0.040091  0.058608  0.058608  0.058608
8         10954547  0.043967  0.044350  0.045274  0.064318  0.064318  0.064318
9         10976347 -0.000247 -0.000357  0.001120  0.029871  0.029871  0.029871
10        11073947  0.003286  0.003830  0.004527  0.025780  0.025780  0.025780
11        11114347  0.018199  0.018010  0.018200  0.

In [131]:
df_merged.to_csv("./zillow data/new_draft_submission1.csv", index=False)