In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

print('Loading data ...')
path = 'C:\\Kaggle\\Zillow\\input\\'

train = pd.read_csv('C:\\Kaggle\\Zillow\\input\\train_2016.csv')
properties = pd.read_csv('C:\\Kaggle\\Zillow\\input\\properties_2016.csv')

for c in properties.columns:
    properties[c]=properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))
df_train = train.merge(properties, how='left', on='parcelid')

In [None]:
#life of property
def prepare_additional_feature(df_train):
    df_train['N-life'] = 2018 - df_train['yearbuilt']

    #error in calculation of the finished living area of home
    df_train['N-LivingAreaError'] = df_train['calculatedfinishedsquarefeet']/df_train['finishedsquarefeet12']

    #proportion of living area
    df_train['N-LivingAreaProp'] = df_train['calculatedfinishedsquarefeet']/df_train['lotsizesquarefeet']
    df_train['N-LivingAreaProp2'] = df_train['finishedsquarefeet12']/df_train['finishedsquarefeet15']

    #Amout of extra space
    df_train['N-ExtraSpace'] = df_train['lotsizesquarefeet'] - df_train['calculatedfinishedsquarefeet'] 
    df_train['N-ExtraSpace-2'] = df_train['finishedsquarefeet15'] - df_train['finishedsquarefeet12'] 

    #Total number of rooms
    df_train['N-TotalRooms'] = df_train['bathroomcnt']*df_train['bedroomcnt']

    #Average room size
    df_train['N-AvRoomSize'] = df_train['calculatedfinishedsquarefeet']/df_train['roomcnt'] 

    # Number of Extra rooms
    df_train['N-ExtraRooms'] = df_train['roomcnt'] - df_train['N-TotalRooms'] 

    #Ratio of the built structure value to land area
    df_train['N-ValueProp'] = df_train['structuretaxvaluedollarcnt']/df_train['landtaxvaluedollarcnt']

    #Does property have a garage, pool or hot tub and AC?
    df_train['N-GarPoolAC'] = ((df_train['garagecarcnt']>0) & (df_train['pooltypeid10']>0) & (df_train['airconditioningtypeid']!=5))*1 

    df_train["N-location"] = df_train["latitude"] + df_train["longitude"]
    df_train["N-location-2"] = df_train["latitude"]*df_train["longitude"]
    df_train["N-location-2round"] = df_train["N-location-2"].round(-4)

    df_train["N-latitude-round"] = df_train["latitude"].round(-4)
    df_train["N-longitude-round"] = df_train["longitude"].round(-4)
    
    #Ratio of tax of property over parcel
    df_train['N-ValueRatio'] = df_train['taxvaluedollarcnt']/df_train['taxamount']

    #TotalTaxScore
    df_train['N-TaxScore'] = df_train['taxvaluedollarcnt']*df_train['taxamount']

    #polnomials of tax delinquency year
    df_train["N-taxdelinquencyyear-2"] = df_train["taxdelinquencyyear"] ** 2
    df_train["N-taxdelinquencyyear-3"] = df_train["taxdelinquencyyear"] ** 3

    #Length of time since unpaid taxes
    df_train['N-life'] = 2018 - df_train['taxdelinquencyyear']
    
    #Number of properties in the zip
    zip_count = df_train['regionidzip'].value_counts().to_dict()
    df_train['N-zip_count'] = df_train['regionidzip'].map(zip_count)

    #Number of properties in the city
    city_count = df_train['regionidcity'].value_counts().to_dict()
    df_train['N-city_count'] = df_train['regionidcity'].map(city_count)

    #Number of properties in the city
    region_count = df_train['regionidcounty'].value_counts().to_dict()
    df_train['N-county_count'] = df_train['regionidcounty'].map(city_count)
    
        #Indicator whether it has AC or not
    df_train['N-ACInd'] = (df_train['airconditioningtypeid']!=5)*1

    #Indicator whether it has Heating or not 
    df_train['N-HeatInd'] = (df_train['heatingorsystemtypeid']!=13)*1

    #There's 25 different property uses - let's compress them down to 4 categories
    df_train['N-PropType'] = df_train.propertylandusetypeid.replace({31 : "Mixed", 
                                                                     46 : "Other", 47 : "Mixed", 246 : "Mixed", 
                                                                     247 : "Mixed", 248 : "Mixed", 260 : "Home", 
                                                                     261 : "Home", 262 : "Home", 263 : "Home", 
                                                                     264 : "Home", 265 : "Home", 266 : "Home", 
                                                                     267 : "Home", 268 : "Home", 269 : "Not Built", 
                                                                     270 : "Home", 271 : "Home", 273 : "Home", 
                                                                     274 : "Other", 275 : "Home", 276 : "Home", 
                                                                     279 : "Home", 290 : "Not Built", 
                                                                     291 : "Not Built" })
    
    #polnomials of the variable
    df_train["N-structuretaxvaluedollarcnt-2"] = df_train["structuretaxvaluedollarcnt"] ** 2
    df_train["N-structuretaxvaluedollarcnt-3"] = df_train["structuretaxvaluedollarcnt"] ** 3

    #Average structuretaxvaluedollarcnt by city
    group = df_train.groupby('regionidcity')['structuretaxvaluedollarcnt'].aggregate('mean').to_dict()
    df_train['N-Avg-structuretaxvaluedollarcnt'] = df_train['regionidcity'].map(group)

    #Deviation away from average
    df_train['N-Dev-structuretaxvaluedollarcnt'] = abs((df_train['structuretaxvaluedollarcnt'] - 
                                                        df_train['N-Avg-structuretaxvaluedollarcnt']))/df_train['N-Avg-structuretaxvaluedollarcnt']
    
    return df_train

In [3]:
df_train = prepare_additional_feature(df_train)

In [4]:
features = ['airconditioningtypeid', 'bathroomcnt', 'bedroomcnt',
           'buildingqualitytypeid', 'calculatedbathnbr',
           'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
           'finishedsquarefeet12', 'finishedsquarefeet15',
           'finishedsquarefeet50', 'fips', 'fullbathcnt', 'garagetotalsqft',
           'hashottuborspa', 'heatingorsystemtypeid', 'latitude',
           'lotsizesquarefeet', 'poolcnt', 'pooltypeid7',
           'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity',
           'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
           'unitcnt', 'yearbuilt', 'numberofstories',
           'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
           'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyyear',
           'censustractandblock', 'N-LivingAreaProp', 'N-LivingAreaProp2',
           'N-ExtraSpace', 'N-ExtraSpace-2', 'N-TotalRooms', 'N-AvRoomSize',
           'N-ExtraRooms', 'N-ValueProp', 'N-latitude-round', 'N-ValueRatio',
           'N-TaxScore', 'N-taxdelinquencyyear-2', 'N-zip_count',
           'N-city_count', 'N-structuretaxvaluedollarcnt-2',
           'N-structuretaxvaluedollarcnt-3',
           'N-Avg-structuretaxvaluedollarcnt',
           'N-Dev-structuretaxvaluedollarcnt']

categorical_f = ['airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid', 'heatingorsystemtypeid',
                 'propertycountylandusecode', 'propertylandusetypeid', 'storytypeid', 'typeconstructiontypeid', 
                 'regionidcounty', 'regionidcity','regionidzip','regionidneighborhood'
                ]

In [5]:

#df_train = df_train[features]
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], 
                        axis=1, errors = 'ignore')
y_train = df_train['logerror'].values
x_train = x_train[features]

print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

#split = 80000
#index = np.arange(y_train.shape[0])
#np.random.shuffle(index)
#x_train = x_train.loc[index]
#y_train = y_train[index]
#y_train[y_train>0]=1
#y_train[y_train<=0]=0
#x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
#x_train = x_train.values.astype(np.float32, copy=False)
#x_valid = x_valid.values.astype(np.float32, copy=False)

(90275, 53) (90275,)


94

In [6]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import KFold

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'},
    'num_leaves': 96,
    'min_sum_hessian_in_leaf':20,
    'min_hessian':10,
    'min_data':500,
    'max_depth': -12,
    'learning_rate': 0.03,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
    'verbose': 0
    }
    
kf = KFold(n_splits=10)
average_loss = 0
for train_index, valid_index in kf.split(x_train):
    train_data, valid_data = x_train.loc[train_index], x_train.loc[valid_index]
    train_label, valid_label = y_train[train_index], y_train[valid_index]
    
    model = LinearSVR()
    model.fit(train_data.fillna(0), train_label)
    y_bar = model.predict(valid_data.fillna(0))
    loss = np.mean(np.abs(y_bar-valid_label)) 
    print(loss)
    average_loss += loss
average_loss/10

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [10]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import KFold

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'},
    'num_leaves': 96,
    'min_sum_hessian_in_leaf':20,
    'min_hessian':10,
    'min_data':500,
    'max_depth': -12,
    'learning_rate': 0.03,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
    'verbose': 0
    }
    
kf = KFold(n_splits=10)
average_loss = 0
for train_index, valid_index in kf.split(x_train):
    train_data, valid_data = x_train.loc[train_index], x_train.loc[valid_index]
    train_label, valid_label = y_train[train_index], y_train[valid_index]
    
    outlier_bracket = (train_label<=0.5)&(train_label>=-0.5)
    train_data = train_data[outlier_bracket]
    train_label = train_label[outlier_bracket]
    
    d_train = lgb.Dataset(train_data, label=train_label)
    d_valid = lgb.Dataset(valid_data, label=valid_label)

    watchlist = [d_train, d_valid]
    gbm = lgb.train(params, d_train, num_boost_round=2000, valid_sets=d_valid, early_stopping_rounds=100)
    average_loss += gbm.eval_valid()[0][2]
average_loss/10

[1]	valid_0's l1: 0.0748197
Train until valid scores didn't improve in 100 rounds.
[2]	valid_0's l1: 0.0747816
[3]	valid_0's l1: 0.0747415
[4]	valid_0's l1: 0.0747045
[5]	valid_0's l1: 0.0746687
[6]	valid_0's l1: 0.0746406
[7]	valid_0's l1: 0.0746228
[8]	valid_0's l1: 0.0745933
[9]	valid_0's l1: 0.0745631
[10]	valid_0's l1: 0.0745412
[11]	valid_0's l1: 0.0745115
[12]	valid_0's l1: 0.0744879
[13]	valid_0's l1: 0.0744648
[14]	valid_0's l1: 0.0744438
[15]	valid_0's l1: 0.0744216
[16]	valid_0's l1: 0.0744037
[17]	valid_0's l1: 0.0743919
[18]	valid_0's l1: 0.0743727
[19]	valid_0's l1: 0.0743599
[20]	valid_0's l1: 0.0743473
[21]	valid_0's l1: 0.0743353
[22]	valid_0's l1: 0.0743219
[23]	valid_0's l1: 0.0743109
[24]	valid_0's l1: 0.074301
[25]	valid_0's l1: 0.0742911
[26]	valid_0's l1: 0.0742835
[27]	valid_0's l1: 0.0742785
[28]	valid_0's l1: 0.0742719
[29]	valid_0's l1: 0.0742653
[30]	valid_0's l1: 0.0742676
[31]	valid_0's l1: 0.0742597
[32]	valid_0's l1: 0.0742547
[33]	valid_0's l1: 0.074244

[122]	valid_0's l1: 0.0721167
[123]	valid_0's l1: 0.0721219
[124]	valid_0's l1: 0.0721184
[125]	valid_0's l1: 0.0721208
[126]	valid_0's l1: 0.0721213
[127]	valid_0's l1: 0.0721237
[128]	valid_0's l1: 0.0721277
[129]	valid_0's l1: 0.0721223
[130]	valid_0's l1: 0.0721218
[131]	valid_0's l1: 0.0721236
[132]	valid_0's l1: 0.0721272
[133]	valid_0's l1: 0.0721283
[134]	valid_0's l1: 0.0721308
[135]	valid_0's l1: 0.0721322
[136]	valid_0's l1: 0.0721325
[137]	valid_0's l1: 0.0721344
[138]	valid_0's l1: 0.0721332
[139]	valid_0's l1: 0.0721339
[140]	valid_0's l1: 0.0721369
[141]	valid_0's l1: 0.0721423
[142]	valid_0's l1: 0.0721445
[143]	valid_0's l1: 0.0721442
[144]	valid_0's l1: 0.0721483
[145]	valid_0's l1: 0.0721514
[146]	valid_0's l1: 0.0721554
[147]	valid_0's l1: 0.0721508
[148]	valid_0's l1: 0.0721514
[149]	valid_0's l1: 0.0721519
[150]	valid_0's l1: 0.0721514
[151]	valid_0's l1: 0.0721498
[152]	valid_0's l1: 0.0721502
[153]	valid_0's l1: 0.072148
[154]	valid_0's l1: 0.072144
[155]	valid_

[15]	valid_0's l1: 0.0665234
[16]	valid_0's l1: 0.0665066
[17]	valid_0's l1: 0.0664909
[18]	valid_0's l1: 0.0664884
[19]	valid_0's l1: 0.0664705
[20]	valid_0's l1: 0.0664565
[21]	valid_0's l1: 0.0664358
[22]	valid_0's l1: 0.066416
[23]	valid_0's l1: 0.0663974
[24]	valid_0's l1: 0.0663819
[25]	valid_0's l1: 0.0663673
[26]	valid_0's l1: 0.0663551
[27]	valid_0's l1: 0.0663436
[28]	valid_0's l1: 0.0663353
[29]	valid_0's l1: 0.066325
[30]	valid_0's l1: 0.0663134
[31]	valid_0's l1: 0.0663029
[32]	valid_0's l1: 0.0662977
[33]	valid_0's l1: 0.0662908
[34]	valid_0's l1: 0.0662864
[35]	valid_0's l1: 0.0662877
[36]	valid_0's l1: 0.0662878
[37]	valid_0's l1: 0.0662837
[38]	valid_0's l1: 0.066276
[39]	valid_0's l1: 0.0662678
[40]	valid_0's l1: 0.0662657
[41]	valid_0's l1: 0.0662608
[42]	valid_0's l1: 0.0662534
[43]	valid_0's l1: 0.0662471
[44]	valid_0's l1: 0.0662462
[45]	valid_0's l1: 0.0662415
[46]	valid_0's l1: 0.0662387
[47]	valid_0's l1: 0.0662306
[48]	valid_0's l1: 0.0662253
[49]	valid_0's l1

[119]	valid_0's l1: 0.0683259
[120]	valid_0's l1: 0.0683241
[121]	valid_0's l1: 0.0683224
[122]	valid_0's l1: 0.0683214
[123]	valid_0's l1: 0.0683225
[124]	valid_0's l1: 0.0683254
[125]	valid_0's l1: 0.0683259
[126]	valid_0's l1: 0.0683264
[127]	valid_0's l1: 0.068322
[128]	valid_0's l1: 0.0683226
[129]	valid_0's l1: 0.0683217
[130]	valid_0's l1: 0.0683233
[131]	valid_0's l1: 0.0683268
[132]	valid_0's l1: 0.0683257
[133]	valid_0's l1: 0.0683281
[134]	valid_0's l1: 0.0683196
[135]	valid_0's l1: 0.0683222
[136]	valid_0's l1: 0.0683269
[137]	valid_0's l1: 0.0683304
[138]	valid_0's l1: 0.0683331
[139]	valid_0's l1: 0.068337
[140]	valid_0's l1: 0.0683394
[141]	valid_0's l1: 0.0683419
[142]	valid_0's l1: 0.0683362
[143]	valid_0's l1: 0.0683402
[144]	valid_0's l1: 0.0683422
[145]	valid_0's l1: 0.0683421
[146]	valid_0's l1: 0.0683393
[147]	valid_0's l1: 0.0683403
[148]	valid_0's l1: 0.0683438
[149]	valid_0's l1: 0.0683459
[150]	valid_0's l1: 0.0683479
[151]	valid_0's l1: 0.0683534
[152]	valid_

[11]	valid_0's l1: 0.0650401
[12]	valid_0's l1: 0.0650197
[13]	valid_0's l1: 0.0650005
[14]	valid_0's l1: 0.0649803
[15]	valid_0's l1: 0.0649655
[16]	valid_0's l1: 0.0649551
[17]	valid_0's l1: 0.0649473
[18]	valid_0's l1: 0.0649401
[19]	valid_0's l1: 0.0649283
[20]	valid_0's l1: 0.0649243
[21]	valid_0's l1: 0.0649099
[22]	valid_0's l1: 0.0649022
[23]	valid_0's l1: 0.0648915
[24]	valid_0's l1: 0.064882
[25]	valid_0's l1: 0.0648713
[26]	valid_0's l1: 0.0648548
[27]	valid_0's l1: 0.0648486
[28]	valid_0's l1: 0.0648442
[29]	valid_0's l1: 0.0648305
[30]	valid_0's l1: 0.0648189
[31]	valid_0's l1: 0.0648139
[32]	valid_0's l1: 0.0648098
[33]	valid_0's l1: 0.0647992
[34]	valid_0's l1: 0.0647915
[35]	valid_0's l1: 0.0647891
[36]	valid_0's l1: 0.0647805
[37]	valid_0's l1: 0.0647729
[38]	valid_0's l1: 0.0647585
[39]	valid_0's l1: 0.0647521
[40]	valid_0's l1: 0.0647498
[41]	valid_0's l1: 0.0647416
[42]	valid_0's l1: 0.06474
[43]	valid_0's l1: 0.0647345
[44]	valid_0's l1: 0.0647303
[45]	valid_0's l1

[98]	valid_0's l1: 0.0651741
[99]	valid_0's l1: 0.0651752
[100]	valid_0's l1: 0.0651769
[101]	valid_0's l1: 0.0651729
[102]	valid_0's l1: 0.065175
[103]	valid_0's l1: 0.0651779
[104]	valid_0's l1: 0.0651845
[105]	valid_0's l1: 0.065185
[106]	valid_0's l1: 0.0651867
[107]	valid_0's l1: 0.0651926
[108]	valid_0's l1: 0.0651912
[109]	valid_0's l1: 0.0651909
[110]	valid_0's l1: 0.065189
[111]	valid_0's l1: 0.0651855
[112]	valid_0's l1: 0.0651866
[113]	valid_0's l1: 0.0651903
[114]	valid_0's l1: 0.0651865
[115]	valid_0's l1: 0.0651901
[116]	valid_0's l1: 0.0651889
[117]	valid_0's l1: 0.0651927
[118]	valid_0's l1: 0.0651933
[119]	valid_0's l1: 0.0651923
[120]	valid_0's l1: 0.0651934
[121]	valid_0's l1: 0.0651881
[122]	valid_0's l1: 0.0651816
[123]	valid_0's l1: 0.0651822
[124]	valid_0's l1: 0.0651841
[125]	valid_0's l1: 0.0651846
[126]	valid_0's l1: 0.0651853
[127]	valid_0's l1: 0.0651853
[128]	valid_0's l1: 0.0651865
[129]	valid_0's l1: 0.0651863
[130]	valid_0's l1: 0.0651922
[131]	valid_0's

[172]	valid_0's l1: 0.0653299
[173]	valid_0's l1: 0.065332
[174]	valid_0's l1: 0.0653363
[175]	valid_0's l1: 0.0653418
[176]	valid_0's l1: 0.0653416
[177]	valid_0's l1: 0.0653394
[178]	valid_0's l1: 0.0653371
[179]	valid_0's l1: 0.0653335
[180]	valid_0's l1: 0.0653342
[181]	valid_0's l1: 0.0653364
[182]	valid_0's l1: 0.0653418
[183]	valid_0's l1: 0.0653424
[184]	valid_0's l1: 0.0653439
[185]	valid_0's l1: 0.0653495
[186]	valid_0's l1: 0.0653519
[187]	valid_0's l1: 0.0653561
[188]	valid_0's l1: 0.0653565
[189]	valid_0's l1: 0.0653578
[190]	valid_0's l1: 0.0653588
[191]	valid_0's l1: 0.0653579
[192]	valid_0's l1: 0.0653576
[193]	valid_0's l1: 0.0653595
[194]	valid_0's l1: 0.0653561
[195]	valid_0's l1: 0.0653528
[196]	valid_0's l1: 0.0653548
[197]	valid_0's l1: 0.0653624
[198]	valid_0's l1: 0.0653637
[199]	valid_0's l1: 0.0653636
[200]	valid_0's l1: 0.0653664
[201]	valid_0's l1: 0.0653711
[202]	valid_0's l1: 0.0653707
[203]	valid_0's l1: 0.06537
[204]	valid_0's l1: 0.0653715
[205]	valid_0

0.067578197327611431

In [None]:
df[df['importances']>0]['feature'].values

In [14]:
split = 90000
train_label = y_train[:split]
train_data = x_train[:split]
outlier_bracket = (train_label<=0.4)&(train_label>=-0.4)
train_data = train_data[outlier_bracket]
train_label = train_label[outlier_bracket]

d_train = lgb.Dataset(train_data, label=train_label)
d_valid = lgb.Dataset(x_train[split:], label=y_train[split:])

#params = {}
#params['learning_rate'] = 0.02
#params['boosting_type'] = 'gbdt'
#params['objective'] = 'regression'
#params['metric'] = 'mae'
#params['sub_feature'] = 0.5
#params['num_leaves'] = 96
#params['min_data'] = 500
#params['min_hessian'] = 10

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'},
    'num_leaves': 128,
    'min_sum_hessian_in_leaf':20,
    'min_hessian':10,
    'min_data':500,
    'max_depth': 12,
    'learning_rate': 0.03,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 5,
    'verbose': 1
}

watchlist = [d_train, d_valid]
gbm = lgb.train(params, d_train, num_boost_round=2000, valid_sets=d_valid, early_stopping_rounds=30)

print('Feature names:', gbm.feature_name())

print('Calculate feature importances...')
# feature importances
print('Feature importances:', list(gbm.feature_importance()))

df = pd.DataFrame({'feature':gbm.feature_name(), 'importances': gbm.feature_importance()})
print(df.sort_values('importances'))

[1]	valid_0's l1: 0.0723964
Train until valid scores didn't improve in 30 rounds.
[2]	valid_0's l1: 0.0723481
[3]	valid_0's l1: 0.0723307
[4]	valid_0's l1: 0.0722714
[5]	valid_0's l1: 0.0722408
[6]	valid_0's l1: 0.0721983
[7]	valid_0's l1: 0.0721704
[8]	valid_0's l1: 0.0721378
[9]	valid_0's l1: 0.0721316
[10]	valid_0's l1: 0.0720899
[11]	valid_0's l1: 0.072078
[12]	valid_0's l1: 0.0720767
[13]	valid_0's l1: 0.0720876
[14]	valid_0's l1: 0.0720609
[15]	valid_0's l1: 0.0720396
[16]	valid_0's l1: 0.0720316
[17]	valid_0's l1: 0.07203
[18]	valid_0's l1: 0.0720127
[19]	valid_0's l1: 0.0719829
[20]	valid_0's l1: 0.0719495
[21]	valid_0's l1: 0.0719511
[22]	valid_0's l1: 0.0719194
[23]	valid_0's l1: 0.0719138
[24]	valid_0's l1: 0.071925
[25]	valid_0's l1: 0.0719307
[26]	valid_0's l1: 0.0719316
[27]	valid_0's l1: 0.0719207
[28]	valid_0's l1: 0.071923
[29]	valid_0's l1: 0.0718999
[30]	valid_0's l1: 0.0719205
[31]	valid_0's l1: 0.0719045
[32]	valid_0's l1: 0.0718837
[33]	valid_0's l1: 0.0718932
[34

In [None]:
#del d_train, d_valid; gc.collect()
#del x_train, x_valid; gc.collect()

print("Prepare for the prediction ...")
sample = pd.read_csv('C:\\Kaggle\\Zillow\\input\\sample_submission.csv')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(properties, on='parcelid', how='left')
df_test = prepare_additional_feature(df_test)
#del sample, prop; gc.collect()
x_test = df_test[train_columns]
del df_test; gc.collect()
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
x_test = x_test.values.astype(np.float32, copy=False)

print("Start prediction ...")
# num_threads > 1 will predict very slow in kernal
gbm.reset_parameter({"num_threads":1})
p_test = gbm.predict(x_test)

del x_test; gc.collect()

print("Start write result ...")
sub = pd.read_csv('C:\\Kaggle\\Zillow\\input\\sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

sub.to_csv('lgb_starter.csv.gz', index=False, float_format='%.4f', compression = 'gzip')
print('done.')

Prepare for the prediction ...
Start prediction ...
Start write result ...


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

print('Loading data ...')

train = pd.read_csv('C:\\Kaggle\\Zillow\\input\\train_2016.csv')
prop = pd.read_csv('C:\\Kaggle\\Zillow\\input\\properties_2016.csv')

for c, dtype in zip(prop.columns, prop.dtypes):	
    if dtype == np.float64:		
        prop[c] = prop[c].astype(np.float32)

df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

split = 90000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
x_train = x_train.values.astype(np.float32, copy=False)
x_valid = x_valid.values.astype(np.float32, copy=False)

d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)

params = {}
params['learning_rate'] = 0.002
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'mae'
params['sub_feature'] = 0.5
params['num_leaves'] = 60
params['min_data'] = 500
params['min_hessian'] = 1

watchlist = [d_valid]
clf = lgb.train(params, d_train, 500, watchlist)

del d_train, d_valid; gc.collect()
del x_train, x_valid; gc.collect()

print("Prepare for the prediction ...")
sample = pd.read_csv('C:\\Kaggle\\Zillow\\input\\sample_submission.csv')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')
del sample, prop; gc.collect()
x_test = df_test[train_columns]
del df_test; gc.collect()
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
x_test = x_test.values.astype(np.float32, copy=False)

print("Start prediction ...")
# num_threads > 1 will predict very slow in kernal
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)

del x_test; gc.collect()

print("Start write result ...")
sub = pd.read_csv('C:\\Kaggle\\Zillow\\input\\sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

sub.to_csv('lgb_starter.csv.gz', index=False, float_format='%.4f', compression = 'gzip')

In [None]:
split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
y_train[y_train>0]=1
y_train[y_train<=0]=0
y_valid[y_valid>0]=1
y_valid[y_valid<=0]=0

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid; gc.collect()

print('Training ...')

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = ['error','logloss']
params['max_depth'] = 8
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid


In [None]:

print('Building test set ...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop; gc.collect()

x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

del df_test, sample; gc.collect()

d_test = xgb.DMatrix(x_test)

del x_test; gc.collect()

print('Predicting on test ...')

p_test = clf.predict(d_test)

del d_test; gc.collect()

sub = pd.read_csv('../input/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

print('Writing csv ...')
sub.to_csv('xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion