In [1]:
%load_ext autoreload
%autoreload 2

import helpers


In [2]:
import os

# MongoDB connection settings. Each value can be overridden through an
# environment variable of the same name; the fallbacks are the values that
# used to be hard-coded here, so the notebook behaves exactly as before when
# nothing is set in the environment.
# NOTE(review): credentials should not live in a notebook. Export
# MONGO_USERNAME / MONGO_PASSWORD in the environment for anything shared and
# consider removing the literal fallbacks once that is in place.
MONGO_ADDRESS = os.environ.get('MONGO_ADDRESS', 'localhost')
# Environment values are always strings, so coerce to keep the port an int.
MONGO_PORT = int(os.environ.get('MONGO_PORT', 27017))
MONGO_DBNAME = os.environ.get('MONGO_DBNAME', 'OFFERS')
MONGO_COLL_OTODOM = os.environ.get('MONGO_COLL_OTODOM', 'Otodom')
MONGO_USERNAME = os.environ.get('MONGO_USERNAME', 'xflats')
MONGO_PASSWORD = os.environ.get('MONGO_PASSWORD', 'xflats')

In [3]:
# Open the connection described by the MONGO_* settings. The returned handle
# is what the later data-loading cell calls `.find(...)` on.
mongo_settings = (
    MONGO_ADDRESS,
    MONGO_PORT,
    MONGO_DBNAME,
    MONGO_COLL_OTODOM,
    MONGO_USERNAME,
    MONGO_PASSWORD,
)
db = helpers.FilesMongo.set_connection(*mongo_settings)

# Listing produced by the project helper for this connection; its exact
# contents are defined by helpers.FilesMongo.list_files.
list_mongo = helpers.FilesMongo.list_files(db)


In [4]:
# Column used as the regression target (the training cell fits on its log1p).
target = ['price']

# Columns routed to the 'num' branch of the ColumnTransformer.
col_numeric = [
    'flat_size',
    'floor_attic',
    'floor_basement',
    'rooms',
    'floor',
    'rent_price',
    'number_of_floors',
    'year_of_building',
    'GC_latitude',
    'GC_longitude',
]

# Columns routed to the 'cat' branch of the ColumnTransformer.
# NOTE(review): 'widows_type' looks like a misspelling of "windows_type";
# presumably it mirrors the field name stored in MongoDB — confirm before
# renaming, since this string is used as a query/column key.
col_cat = [
    'additional_info',
    'market',
    'building_type',
    'building_material',
    'widows_type',
    'heating_type',
    'finishing_stage',
    'property_form',
    'GC_addr_postcode',
]

# Free-text columns; 'description', 'location' and 'name' each get their own
# TfidfVectorizer in the pipeline cell.
col_text = [
    'name',
    'location',
    'description',
]

# Not referenced by any other cell visible in this notebook.
# NOTE(review): 'GC_addr_postcode' is listed here and in col_cat — confirm
# which of the two is intended.
OUT = [
    'GC_boundingbox',
    'GC_addr_house_number',
    'GC_addr_road',
    'GC_addr_neighbourhood',
    'GC_addr_suburb',
    'GC_addr_city',
    'GC_addr_county',
    'GC_addr_state',
    'GC_addr_postcode',
    'GC_addr_country',
    'GC_addr_country_code',
    'url',
    'main_url',
]

In [5]:
# Map every modelling column (target, numeric, categorical, text) to 1.
# The dict is passed as the second argument of db.find() in the next cell.
select = dict.fromkeys(target + col_numeric + col_cat + col_text, 1)


In [6]:
# Materialise the whole query result (empty filter, `select` as second
# argument) into a list so it can be handed to pandas.
files = list(db.find({}, select))

In [7]:
import numpy as np
import pandas as pd

# Fix NumPy's global seed so the random fold assignment below is repeatable.
np.random.seed(666)

df = pd.DataFrame(files)

# Draw one label per row: 'train' with probability 0.9, 'test' with 0.1.
fold_labels = np.random.choice(['train', 'test'], df.shape[0], p=[0.9, 0.1])
df['fold'] = fold_labels.tolist()

# Row filter shared by both folds: price within [10000, 1000000], flat_size
# at most 130, and property_form different from 'udział' (Polish: "share").
filter_query = "price <= 1000000 and price >= 10000 and flat_size <= 130 and property_form != 'udział'"


def _fold_subset(frame, fold_name):
    """Return the rows of `frame` labelled `fold_name` that have a price and pass `filter_query`.

    The result is re-indexed from 0.
    """
    in_fold = frame[frame['fold'] == fold_name]
    priced = in_fold.dropna(subset=['price'])
    return priced.query(filter_query).reset_index(drop=True)


df_train = _fold_subset(df, 'train')
df_test = _fold_subset(df, 'test')


In [8]:
# Keyword arguments forwarded to lgb.LGBMRegressor in the pipeline cell.
# The trailing comment on each line gives the native LightGBM parameter name
# for the scikit-learn style keyword.
# NOTE(review): LightGBM documents bagging_freq = 0 as "bagging disabled", so
# `subsample` presumably has no effect with subsample_freq = 0 — confirm this
# is intended.
params = dict(
    colsample_bytree=0.6624318354159208,  # feature_fraction
    learning_rate=0.15990411055449805,
    max_bin=38712,
    max_depth=7,
    min_child_samples=84,  # min_data_in_leaf
    min_child_weight=6,  # min_sum_hessian_in_leaf
    n_estimators=206,  # num_iterations
    num_leaves=159,
    reg_alpha=3.960667919705787e-06,  # lambda_l1
    reg_lambda=499.85995495490215,  # lambda_l2
    subsample=0.9022680042341511,  # bagging_fraction
    subsample_for_bin=144116,  # bin_construct_sample_cnt
    subsample_freq=0,  # bagging_freq
)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

import lightgbm as lgb

In [11]:
# Re-seed NumPy's global RNG before the model is assembled.
np.random.seed(666)


def _binary_unigram_vectorizer():
    """Build the vectorizer shared by the short text columns.

    Unigrams only, at most 500 features, binary term counts, no IDF weighting.
    """
    return TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 1),
        max_features=500,
        dtype=np.float32,
        binary=True,
        use_idf=False,
    )


# Per-column transformers, created in the same order as they appear in the
# ColumnTransformer below.
cat_transformer = helpers.transformColList()
num_transformer = helpers.PassThroughOrReplace()
# Description text: IDF-weighted 1- to 3-grams, at most 3000 features.
description_vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 3),
    max_features=3000,
    dtype=np.float32,
    use_idf=True,
)
location_vectorizer = _binary_unigram_vectorizer()
name_vectorizer = _binary_unigram_vectorizer()

feature_columns = ColumnTransformer([
    ('cat', cat_transformer, col_cat),
    ('num', num_transformer, col_numeric),
    ('txt_dscr', description_vectorizer, 'description'),
    ('txt_loc', location_vectorizer, 'location'),
    ('txt_name', name_vectorizer, 'name'),
])

# L2-objective LightGBM regressor configured from `params`, with a fixed seed.
regressor = lgb.LGBMRegressor(**params, objective='regression_l2', random_state=666)

pipe_lgb = make_pipeline(feature_columns, regressor)

In [12]:
%%time
# The model is trained on log1p(price); the evaluation cell maps predictions
# back to prices with expm1.
train_prices = df_train['price'].tolist()
y_log = np.log1p(train_prices)

# The whole training frame is passed in; the ColumnTransformer inside the
# pipeline picks the columns it needs. The value returned by fit() is kept
# in `save`.
save = pipe_lgb.fit(df_train, y_log)

CPU times: user 2min 13s, sys: 8.95 s, total: 2min 22s
Wall time: 2min 11s


In [13]:
#first pipe
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error, mean_squared_log_error

# Score the fitted pipeline on the training fold, then on the test fold.
# Predictions are in log1p space, so expm1 maps them back to prices before
# the metrics are computed. After the loop y_pred, r2, med_abs_err and
# mean_abs_err hold the test-fold values.
for report_template, fold_frame in (
    ("Train r2 score {}, median absolute error {}, mean absolute error {}", df_train),
    ("Test r2 score {}, median absolute error {}, mean absolute error {}", df_test),
):
    y_pred = np.expm1(pipe_lgb.predict(fold_frame))
    actual_price = fold_frame.price

    r2 = r2_score(actual_price, y_pred)
    med_abs_err = median_absolute_error(actual_price, y_pred)
    mean_abs_err = mean_absolute_error(actual_price, y_pred)

    # r2 is rounded to 4 decimals; the absolute errors are truncated to ints.
    print(report_template.format(round(r2, 4), int(med_abs_err), int(mean_abs_err)))


Train r2 score 0.9222, median absolute error 23580, mean absolute error 35463
Test r2 score 0.8987, median absolute error 28108, mean absolute error 40922


In [14]:
import pickle

# Persist the fitted pipeline to model.pkl in the current working directory.
# The context manager closes the file handle deterministically, even if
# pickling raises part-way; the previous bare open() left that to the
# garbage collector.
# NOTE(review): unpickling executes arbitrary code — only load model.pkl
# from a trusted source.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe_lgb, model_file)