In [1]:
import feather
import sys
sys.path.append("..")
import helpers

In [2]:
# Load the train / test / out-of-time frames from their Feather files.
# Paths are relative to the notebook's working directory.
df_train, df_test, df_oot = map(
    feather.read_dataframe,
    (
        "data/df_train.feather",
        "data/df_test.feather",
        "data/df_oot.feather",
    ),
)

In [3]:
import numpy as np
from category_encoders import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:
# Per-column plausibility bounds. Each entry names a column ("key") and the
# "min"/"max" values that the filtering cell below plugs into a
# `key > min and key < max` query, i.e. both bounds are exclusive.
outliers = [
    dict(key="price", min=0, max=1500000),
    dict(key="flat_size", min=6, max=216),
    dict(key="year_of_building", min=1850, max=2050),
    dict(key="GC_longitude", min=20.5, max=21.5),
    dict(key="GC_latitude", min=51, max=52.5),
]

In [5]:
# Drop implausible rows from the training frame only (df_test / df_oot are
# not touched here).
# `col != col` is true only for NaN, so rows where the column is missing are
# kept; all other rows must lie strictly between the configured min and max.
query_template = "{key} != {key} or ({key}>{min} and {key}<{max})"
for val in outliers:
    condition = query_template.format(**val)
    print(condition)  # echo the applied filter for the record
    df_train = df_train.query(condition)

price != price or (price>0 and price<1500000)
flat_size != flat_size or (flat_size>6 and flat_size<216)
year_of_building != year_of_building or (year_of_building>1850 and year_of_building<2050)
GC_longitude != GC_longitude or (GC_longitude>20.5 and GC_longitude<21.5)
GC_latitude != GC_latitude or (GC_latitude>51 and GC_latitude<52.5)


In [6]:
"""
np.random.seed(666)


pipe = make_pipeline(
    ColumnTransformer([
        ('cat', helpers.transformColList(),["producer_name","GC_addr_suburb"]),
        ('num', helpers.PassThroughOrReplace(),["flat_size","rooms","floor","number_of_floors","year_of_building"]),
        ('num_G', helpers.PassThroughOrReplace(),["GC_latitude","GC_longitude"]),
        #('te_G', CatBoostEncoder(), 'GC_addr_postcode'),
        ('txt_dscr', TfidfVectorizer(lowercase=True, 
                               ngram_range=(1, 3), 
                               max_features=3000, 
                               dtype=np.float32,
                               use_idf=True),'description'),
        ('txt_i', TfidfVectorizer(lowercase=True, 
                               ngram_range=(1, 3), 
                               max_features=3000, 
                               dtype=np.float32,
                               binary = True,
                               use_idf=False),'info'),
    ]),
    
    )
"""

'\nnp.random.seed(666)\n\n\npipe = make_pipeline(\n    ColumnTransformer([\n        (\'cat\', helpers.transformColList(),["producer_name","GC_addr_suburb"]),\n        (\'num\', helpers.PassThroughOrReplace(),["flat_size","rooms","floor","number_of_floors","year_of_building"]),\n        (\'num_G\', helpers.PassThroughOrReplace(),["GC_latitude","GC_longitude"]),\n        #(\'te_G\', CatBoostEncoder(), \'GC_addr_postcode\'),\n        (\'txt_dscr\', TfidfVectorizer(lowercase=True, \n                               ngram_range=(1, 3), \n                               max_features=3000, \n                               dtype=np.float32,\n                               use_idf=True),\'description\'),\n        (\'txt_i\', TfidfVectorizer(lowercase=True, \n                               ngram_range=(1, 3), \n                               max_features=3000, \n                               dtype=np.float32,\n                               binary = True,\n                               use_idf=Fa

In [14]:
# Column groups used to wire up the feature pipeline below.
flds_id = ['_id']
flds_target = ['price']
flds_num = [
    'flat_size',
    'rooms',
    'floor',
    'number_of_floors',
    'year_of_building',
]
flds_num_geo = ['GC_latitude', 'GC_longitude']
flds_cat = ['producer_name']
flds_cat_geo = [
    'GC_addr_road',
    'GC_addr_neighbourhood',
    'GC_addr_suburb',
    'GC_addr_city',
    'GC_addr_state',
    'GC_addr_postcode',
    'GC_addr_country',
]
flds_text = ['description']
flds_drop = ['location']
# NOTE(review): despite its name, this list also carries the 'name' column.
download_date = ['download_date', 'name']
all_fields = (
    flds_id + flds_target
    + flds_num + flds_num_geo
    + flds_cat + flds_cat_geo
    + flds_text + download_date + flds_drop
)

np.random.seed(666)

# Unigram vectorizer for 'name': binary term occurrence, no IDF weighting,
# capped at 500 features.
name_vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 1),
    max_features=500,
    dtype=np.float32,
    binary=True,
    use_idf=False,
)

# 1- to 3-gram TF-IDF vectorizer for 'description', capped at 3000 features.
description_vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 3),
    max_features=3000,
    dtype=np.float32,
    use_idf=True,
)

# Text columns (and the target-encoded postcode) are selected with a plain
# string rather than a list, exactly as before. Note that 'GC_addr_postcode'
# is fed both to the 'cat' transformer (via flds_cat_geo) and to 'te_G'.
feature_columns = ColumnTransformer(
    transformers=[
        ('cat', helpers.transformColList(), flds_cat + flds_cat_geo),
        ('num', helpers.PassThroughOrReplace(), flds_num + flds_num_geo),
        ('te_G', TargetEncoder(), 'GC_addr_postcode'),
        ('txt_name', name_vectorizer, 'name'),
        ('txt_dscr', description_vectorizer, 'description'),
    ]
)

# Single-step pipeline: the column transformer is the whole feature stage.
pipe = make_pipeline(feature_columns)

In [15]:
%%time
train_transform=pipe.fit_transform(df_train, df_train.price)

CPU times: user 2min 2s, sys: 13 s, total: 2min 15s
Wall time: 2min 41s


In [16]:
%%time
test_transform=pipe.transform(df_test)
oot_transform=pipe.transform(df_oot)

CPU times: user 26.7 s, sys: 1.85 s, total: 28.6 s
Wall time: 34.6 s


In [17]:
# LightGBM hyperparameters, keyed by their scikit-learn API names; the
# trailing comment on each line gives the native LightGBM parameter name.
# NOTE(review): per the LightGBM parameter docs, bagging_freq=0 disables
# bagging, so `subsample` would have no effect with subsample_freq=0 — confirm.
params = dict(
    colsample_bytree=0.6624318354159208,  # feature_fraction
    learning_rate=0.15990411055449805,
    max_bin=38712,
    max_depth=7,
    min_child_samples=84,  # min_data_in_leaf
    min_child_weight=6,  # min_sum_hessian_in_leaf
    n_estimators=206,  # num_iterations
    num_leaves=159,
    reg_alpha=3.960667919705787e-06,  # lambda_l1
    reg_lambda=499.85995495490215,  # lambda_l2
    subsample=0.9022680042341511,  # bagging_fraction
    subsample_for_bin=144116,  # bin_construct_sample_cnt
    subsample_freq=0,  # bagging_freq
)

# L2 regression with a fixed seed; the model is trained in a later cell.
model = lgb.LGBMRegressor(
    objective='regression_l2',
    random_state=666,
    **params
)

In [None]:
%%time
y_log = np.log1p( df_train.price.tolist())
model.fit(train_transform, y_log)

In [None]:
%%time

y_pred = model.predict(train_transform)
y_pred = np.expm1(y_pred)

r2 = r2_score(df_train.price, y_pred)
med_abs_err = median_absolute_error(df_train.price, y_pred)
mean_abs_err = mean_absolute_error(df_train.price, y_pred)
print("Train set r2 score {}, median absolute error {}, "
      "mean absolute error {}".format(round(r2, 4), int(med_abs_err),
                                      int(mean_abs_err)))

y_pred = model.predict(test_transform)
y_pred = np.expm1(y_pred)

r2 = r2_score(df_test.price, y_pred)
med_abs_err = median_absolute_error(df_test.price, y_pred)
mean_abs_err = mean_absolute_error(df_test.price, y_pred)

print("Test set r2 score {}, median absolute error {}, mean absolute error {}".format(
    round(r2, 4), int(med_abs_err), int(mean_abs_err)))

y_pred = model.predict(oot_transform)
y_pred = np.expm1(y_pred)

r2 = r2_score(df_oot.price, y_pred)
med_abs_err = median_absolute_error(df_oot.price, y_pred)
mean_abs_err = mean_absolute_error(df_oot.price, y_pred)

print("Out of time set  r2 score {}, median absolute error {}, mean absolute error {}".format(
    round(r2, 4), int(med_abs_err), int(mean_abs_err)))


Train set r2 score 0.8974, median absolute error 34937, mean absolute error 56433
Test set r2 score 0.4974, median absolute error 42528, mean absolute error 142758
Out of time set  r2 score 0.379, median absolute error 43568, mean absolute error 165643
CPU times: user 2.75 s, sys: 507 ms, total: 3.26 s
Wall time: 1.1 s