In [None]:
import numpy as np
import pandas as pd


In [None]:
# Reload the Kedro project session for this kernel.
# NOTE(review): presumably this is what injects the `catalog` object used by the next cell — confirm.
%reload_kedro

In [None]:
# Load the `model_input` dataset through `catalog` (not defined in this notebook;
# presumably provided by the Kedro session — confirm).
df = catalog.load('model_input')

In [None]:
# Slice the modelling frame into its train / test / valid partitions using the
# `split` column; every partition gets a fresh 0..n-1 index.
_partitions = {
    part: df.loc[df.split == part].reset_index(drop=True)
    for part in ('train', 'test', 'valid')
}

# Feature matrices: every column except the split marker and the target.
X_train = _partitions['train'].drop(columns=['split', 'price'])
X_test = _partitions['test'].drop(columns=['split', 'price'])
X_valid = _partitions['valid'].drop(columns=['split', 'price'])

# Targets: the `price` column of the same rows, in the same order.
y_train = _partitions['train'].price
y_test = _partitions['test'].price
y_valid = _partitions['valid'].price

In [None]:
# Column groups of the model input table.

# Categorical attributes.
categorical = [
    'producer_name',
    'market',
    'building_type',
    'building_material',
    'property_form',
    'offeror',
    'GC_addr_suburb',
    'GC_addr_postcode',
]

# Numeric attributes, plus the 8-week median price.
numerical = [
    'flat_size',
    'rooms',
    'floor',
    'number_of_floors',
    'year_of_building',
    'GC_latitude',
    'GC_longitude',
    'price_median_08w',
]

# Remaining price aggregates: medians first, then means, ordered by window.
numerical_add = (
    ['price_median_{}w'.format(window) for window in ('01', '02', '03', '04', '12')]
    + ['price_mean_{}w'.format(window) for window in ('01', '02', '03', '04', '08', '12')]
)

# Free-text columns.
text = [
    'location',
    'description',
    'name',
    'additional_info',
]

In [None]:
# List the columns pandas already treats as numeric.
X_train.select_dtypes(include='number').columns

In [None]:
# Show the first training row vertically (one line per column) so wide text fields stay readable.
X_train.head(1).transpose()

## Text features

### description

In [None]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import lightgbm as lgb
import re

import category_encoders as ce


In [None]:
from sklearn.compose import TransformedTargetRegressor
import numpy as np

In [None]:
# Try the token pattern on one sample listing description.
# The sample lives in its own variable so it does not overwrite `text`,
# the list of free-text column names defined earlier in the notebook.
sample_description = 'wyjatkowa oferta gotowych i wykonczonych apartamentow zapraszam serdecznie do ogladania ! nowoczesnie urzadzone apartamenty o powierzchni od 21m2 do 55 m2 poozone w zachodniej czesci warszawy, przy al. jerozolimskich. mniejsze apartamenty skadaja sie z salonu z aneksem kuchennym i azienki, wieksze posiadaja dodatkowa sypialnie i druga azienke. niezwykle atrakcyjnie wykonczone, blisko miedzynarodowego lotniska im. fryderyka chopina i dworca kolejowego, doskonale skomunikowane z centrum warszawy i drogami wyjazdowymi. posiadam rowniez inne ukady mieszkan w tej oraz sasiednich inwestycjach oferta 1-2-3-4-5-6 pokoi o metrazach 15 - 190 m2. w cenach juz od 200 000 z - idealne rozwiazania dla inwestorow zapraszam po szczegoowe informacje - tel 501-920-939'

# A token is an ASCII letter followed by at least two more word characters,
# so one- and two-character words and tokens starting with a digit are dropped.
reg = r'[A-Za-z]\w{2,}'
r1 = re.findall(reg, sample_description)
r1[:10]

In [None]:
from unidecode import unidecode


def preProcess(s):
    """Return `s` transliterated by `unidecode` and converted to lower case.

    Passed as the `preprocessor` of the TF-IDF vectorizer in this cell.
    """
    transliterated = unidecode(s)
    return transliterated.lower()

# Baseline using the `description` column only: TF-IDF over 1- to 3-grams,
# capped at 1000 features, feeding a LightGBM regressor with an L2 objective.
description_vectorizer = TfidfVectorizer(
    preprocessor=preProcess,
    token_pattern=r'[A-Za-z]\w{2,}',
    stop_words=['ale','oraz','lub','sie','and','the','jest','do','od'],
    ngram_range=(1, 3),
    max_features=1000,
    lowercase=True,
    use_idf=True,
    dtype=np.float32,
)

# Apply the vectorizer to the single `description` column.
description_features = ColumnTransformer([
    ('txt_description', description_vectorizer, 'description'),
])

pipe = make_pipeline(
    description_features,
    lgb.LGBMRegressor(objective='regression_l2', random_state=666),
)

In [None]:
%%time
# Fit the current `pipe` on the training split; the %%time cell magic reports how long the cell took.
pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.metrics import r2_score, median_absolute_error
# In-sample fit quality of the current `pipe`: predictions on the training rows.
y_pred = pipe.predict(X_train)

r2 = r2_score(y_true=y_train, y_pred=y_pred)
med_abs_err = median_absolute_error(y_true=y_train, y_pred=y_pred)
mean_abs_err = mean_absolute_error(y_true=y_train, y_pred=y_pred)

# r2 is rounded to 4 decimals; the two error figures are truncated to integers.
report_template = ("Train set r2 score {}, median absolute error {}, "
                   "mean absolute error {}")
print(report_template.format(round(r2, 4), int(med_abs_err), int(mean_abs_err)))

In [None]:
from eli5 import show_weights, explain_weights_lightgbm
# Show the 50 most important features of the fitted LightGBM step,
# labelled with the column transformer's output feature names.
column_transformer = pipe.named_steps['columntransformer']

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep calling it where it exists (unchanged behaviour on older installs) and
# fall back to its replacement where it does not.
if hasattr(column_transformer, 'get_feature_names'):
    feature_names = column_transformer.get_feature_names()
else:
    feature_names = list(column_transformer.get_feature_names_out())

explain_weights_lightgbm(pipe.named_steps['lgbmregressor'],
                         feature_names=feature_names,
                         top=50)

In [None]:
import eli5

# Descriptions of training rows predicted to within 1000 of the true price.
close_to_truth = np.abs(y_train - y_pred) < 1000
best_pred = X_train.loc[close_to_truth, 'description']

# Explain the second such row (position 1), showing the top 10 contributions,
# using the fitted description vectorizer to map text to features.
fitted_vectorizer = pipe.named_steps['columntransformer'].named_transformers_['txt_description']
eli5.explain_prediction(
    pipe.named_steps['lgbmregressor'],
    best_pred.iloc[1],
    vec=fitted_vectorizer,
    top=10,
)

In [None]:
import eli5

# Descriptions of training rows predicted to within 1000 of the true price.
close_to_truth = np.abs(y_train - y_pred) < 1000
best_pred = X_train.loc[close_to_truth, 'description']

# Explain the row at position 7 of that selection, showing the top 20 contributions.
fitted_vectorizer = pipe.named_steps['columntransformer'].named_transformers_['txt_description']
eli5.explain_prediction(
    pipe.named_steps['lgbmregressor'],
    best_pred.iloc[7],
    vec=fitted_vectorizer,
    top=20,
)

### name

In [None]:
from unidecode import unidecode


def preProcess(s):
    """Transliterate `s` with `unidecode`, then lower-case the result.

    Used as the `preprocessor` of the `name` vectorizer built in this cell.
    """
    ascii_text = unidecode(s)
    return ascii_text.lower()

# Baseline using the `name` column only: binary unigram indicators without
# IDF weighting, capped at 500 features, feeding a LightGBM regressor.
name_vectorizer = TfidfVectorizer(
    preprocessor=preProcess,
    token_pattern=r'[A-Za-z]\w{2,}',
    stop_words=['ale','oraz','lub','sie','and','the','jest','do','od'],
    ngram_range=(1, 1),
    max_features=500,
    lowercase=True,
    binary=True,
    use_idf=False,
    dtype=np.float32,
)

# Apply the vectorizer to the single `name` column.
name_features = ColumnTransformer([
    ('txt_name', name_vectorizer, 'name'),
])

pipe = make_pipeline(
    name_features,
    lgb.LGBMRegressor(objective='regression_l2', random_state=666),
)

In [None]:
%%time
# Fit the current `pipe` on the training split; the %%time cell magic reports how long the cell took.
pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.metrics import r2_score, median_absolute_error
# In-sample fit quality of the current `pipe`: predictions on the training rows.
y_pred = pipe.predict(X_train)

r2 = r2_score(y_true=y_train, y_pred=y_pred)
med_abs_err = median_absolute_error(y_true=y_train, y_pred=y_pred)
mean_abs_err = mean_absolute_error(y_true=y_train, y_pred=y_pred)

# r2 is rounded to 4 decimals; the two error figures are truncated to integers.
report_template = ("Train set r2 score {}, median absolute error {}, "
                   "mean absolute error {}")
print(report_template.format(round(r2, 4), int(med_abs_err), int(mean_abs_err)))

In [None]:
from eli5 import show_weights, explain_weights_lightgbm
# Show the 50 most important features of the fitted LightGBM step,
# labelled with the column transformer's output feature names.
column_transformer = pipe.named_steps['columntransformer']

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep calling it where it exists (unchanged behaviour on older installs) and
# fall back to its replacement where it does not.
if hasattr(column_transformer, 'get_feature_names'):
    feature_names = column_transformer.get_feature_names()
else:
    feature_names = list(column_transformer.get_feature_names_out())

explain_weights_lightgbm(pipe.named_steps['lgbmregressor'],
                         feature_names=feature_names,
                         top=50)

## PIPELINE

In [None]:
from unidecode import unidecode

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import category_encoders as ce

import lightgbm as lgb

# Columns that get one-hot encoded.
cols_ce_oh = [
    'producer_name',
    'market',
    'building_type',
    'building_material',
    'property_form',
    'offeror',
]

# Columns that get target encoded.
cols_ce_te = [
    'GC_addr_suburb',
    'GC_addr_postcode',
]

# Numeric columns passed through unchanged.
cols_numeric = [
    'flat_size',
    'rooms',
    'floor',
    'number_of_floors',
    'year_of_building',
    'GC_latitude',
    'GC_longitude',
]

# Price aggregates (median, then mean) over 3-, 8- and 12-week windows.
cols_prices_in_neighbourhood = [
    'price_{}_{}w'.format(stat, window)
    for stat in ('median', 'mean')
    for window in ('03', '08', '12')
]

# Words removed by both text vectorizers.
stop_words = [
    'ale', 'oraz', 'lub', 'sie', 'and', 'the',
    'jest', 'do', 'od', 'with', 'mozna',
]

# A token is an ASCII letter followed by at least two more word characters.
token_pattern = r'[A-Za-z]\w{2,}'

def preProcess(s):
    """Transliterate `s` with `unidecode` and return it in lower case.

    Used as the `preprocessor` of both text vectorizers in this cell.
    """
    plain = unidecode(s)
    return plain.lower()

# TF-IDF over 1- to 3-grams of `description`, capped at 1000 features.
description_tfidf = TfidfVectorizer(
    preprocessor=preProcess,
    token_pattern=token_pattern,
    stop_words=stop_words,
    ngram_range=(1, 3),
    max_features=1000,
    lowercase=True,
    use_idf=True,
    dtype=np.float32,
)

# Binary unigram indicators of `name` without IDF weighting, capped at 500 features.
name_tfidf = TfidfVectorizer(
    preprocessor=preProcess,
    token_pattern=token_pattern,
    stop_words=stop_words,
    ngram_range=(1, 1),
    max_features=500,
    lowercase=True,
    binary=True,
    use_idf=False,
    dtype=np.float32,
)

# One transformer per column group; the numeric and price columns pass through untouched.
feature_transformer = ColumnTransformer([
    ('ce_oh', ce.OneHotEncoder(return_df=True, use_cat_names=True), cols_ce_oh),
    ('ce_GC', ce.TargetEncoder(return_df=True), cols_ce_te),
    ('numeric', 'passthrough', cols_numeric + cols_prices_in_neighbourhood),
    ('txt_description', description_tfidf, 'description'),
    ('txt_name', name_tfidf, 'name'),
])

# This pipeline contains only the preprocessing step; no regressor is added here.
pipe = make_pipeline(feature_transformer)


In [None]:
%%time
# Fit the preprocessing pipeline on the training split and transform it in the same pass.
# `y_train` is handed to the steps during fitting (presumably required by the target encoder — confirm).
X_train_transformed = pipe.fit_transform(X_train, y_train)

In [None]:
# Inspect which container type the pipeline returned before saving it in the next cell.
type(X_train_transformed)

In [None]:
# Persist the transformed training matrix next to the notebook.
# allow_pickle=True lets NumPy fall back to pickling if the object is not a plain array.
np.save('X_train_transformed.npy', X_train_transformed, allow_pickle=True)


In [None]:
# Read the saved matrix back to check the round trip.
# allow_pickle=True will unpickle arbitrary objects, so only load files produced by this notebook.
x = np.load('X_train_transformed.npy', allow_pickle=True)


In [None]:
# Combine the preprocessing step with the LightGBM regressor and fit the result.
# Previously the regressor was constructed and immediately discarded, so `pipe`
# had no estimator: the `pipe.predict(...)` calls and the
# `pipe.named_steps['lgbmregressor']` lookups in the following cells could not work.
# make_pipeline names the steps 'columntransformer' and 'lgbmregressor', which are
# the keys those cells use. Looking the transformer up by name keeps this cell
# re-runnable after `pipe` has been rebound.
pipe = make_pipeline(
    pipe.named_steps['columntransformer'],
    lgb.LGBMRegressor(objective='regression_l2', random_state=666),
)
pipe.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.metrics import r2_score, median_absolute_error
# In-sample fit quality of the current `pipe`: predictions on the training rows.
y_pred = pipe.predict(X_train)

r2 = r2_score(y_true=y_train, y_pred=y_pred)
med_abs_err = median_absolute_error(y_true=y_train, y_pred=y_pred)
mean_abs_err = mean_absolute_error(y_true=y_train, y_pred=y_pred)

# r2 is rounded to 4 decimals; the two error figures are truncated to integers.
report_template = ("Train set r2 score {}, median absolute error {}, "
                   "mean absolute error {}")
print(report_template.format(round(r2, 4), int(med_abs_err), int(mean_abs_err)))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.metrics import r2_score, median_absolute_error
# Fit quality of the current `pipe` on the test split.
y_pred_test = pipe.predict(X_test)

r2 = r2_score(y_test, y_pred_test)
med_abs_err = median_absolute_error(y_test, y_pred_test)
mean_abs_err = mean_absolute_error(y_test, y_pred_test)

# The label now says "Test set"; it used to read "Train set" although the
# numbers are computed on the test split.
# r2 is rounded to 4 decimals; the two error figures are truncated to integers.
print("Test set r2 score {}, median absolute error {}, "
      "mean absolute error {}".format(round(r2, 4), int(med_abs_err),
                                      int(mean_abs_err)))

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
%matplotlib inline

print('Plotting feature importances...')
def names(): return pipe.named_steps['columntransformer'].get_feature_names()
pipe.named_steps['lgbmregressor'].booster_.feature_name = names
fig, ax = plt.subplots(figsize=(10,8))
fig.subplots_adjust(left=0.4)
lgb.plot_importance(pipe.named_steps['lgbmregressor'], max_num_features=35, ax=ax, importance_type = 'split')
plt.yticks(fontsize=10)
plt.show()

## LightGBM (native API)

In [None]:
import lightgbm as lgb


In [None]:
from sklearn import preprocessing
# Stand-alone label encoder instance.
# NOTE(review): `le` is not referenced anywhere else in the visible notebook; the loop below
# creates its own encoder per column in `dict_cat` — confirm whether this is still needed.
le = preprocessing.LabelEncoder()

In [None]:
# Label-encode every categorical column in place on the three feature frames.
# One encoder per column is kept in `dict_cat`. Categories that do not occur
# in the training split are mapped to the placeholder class 'n/a'.
dict_cat = dict()
for cat in categorical:
    encoder = preprocessing.LabelEncoder()

    # Register 'n/a' as a class by appending it to the values used for fitting.
    # The previous approach overwrote row 0 of X_train with 'n/a', which
    # destroyed a real training value in every categorical column.
    encoder.fit(pd.concat([X_train[cat], pd.Series(['n/a'], dtype=object)],
                          ignore_index=True))
    dict_cat[cat] = encoder

    # A set gives constant-time membership tests; the class list is no longer
    # rebuilt for every element.
    known_classes = set(encoder.classes_)

    for frame in (X_train, X_test, X_valid):
        cleaned = pd.Series(
            [value if value in known_classes else 'n/a' for value in frame[cat]],
            index=frame.index,
            dtype=object,
        )
        frame[cat] = encoder.transform(cleaned)


In [None]:
# Build LightGBM datasets over the label-encoded categorical and numeric columns.
feature_columns = categorical + numerical

# Options shared by all three datasets: declare the categorical columns and
# keep the raw data attached to the Dataset object.
shared_options = dict(categorical_feature=categorical, free_raw_data=False)

train_data = lgb.Dataset(X_train[feature_columns], label=y_train, **shared_options)

# Only the test set is built with a reference to the training set.
test_data = lgb.Dataset(
    X_test[feature_columns],
    label=y_test,
    reference=train_data,
    **shared_options
)

validation_data = lgb.Dataset(X_valid[feature_columns], label=y_valid, **shared_options)

In [None]:
# Parameters that stay fixed across experiments.
# The literal used to contain 'metric' twice; Python keeps only the last value
# of a duplicated key, so the first entry ('regression_l1') never took effect
# and has been removed. The resulting dict is unchanged.
FIXED_PARAMS = {'objective': 'regression',
                'metric': ['mape', 'l1'],
                'boosting': 'gbdt',
                'num_boost_round': 100,
                'early_stopping_rounds': 20}

# Parameters intended to be tuned.
SEARCH_PARAMS = {'learning_rate': 0.05,
                 'max_depth': 15,
                 'num_leaves': 100,
                 'feature_fraction': 0.8,
                 'subsample': 0.9}

In [None]:
# Merge the tunable parameters into FIXED_PARAMS in place; from here on FIXED_PARAMS
# is the full parameter set passed to lgb.train.
FIXED_PARAMS.update(SEARCH_PARAMS)

In [None]:
# Train the booster with the merged parameters, evaluating on the test dataset.
# NOTE(review): because 'early_stopping_rounds' is in the params, early stopping presumably
# runs against this test set, while `validation_data` is never passed to training — confirm this is intended.
bst = lgb.train(
    params=FIXED_PARAMS,
    train_set=train_data,
    valid_sets=[test_data],
)

In [None]:
# Show both recorded metrics for the evaluation set. A notebook cell displays
# only its last expression, so the 'mape' value used to be computed and dropped.
valid_scores = bst.best_score['valid_0']
{'mape': valid_scores['mape'], 'l1': valid_scores['l1']}

In [None]:
# Predict on the training rows, using the same column selection the training Dataset was built from.
train_pred = bst.predict(X_train[categorical+numerical])

In [None]:
import matplotlib.pyplot as plt

# Plot the booster's feature importances on an 8x6 inch figure.
fig = plt.figure(figsize=(8, 6))
ax = fig.subplots()

lgb.plot_importance(bst, ax=ax)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the training split (in-sample).
mean_absolute_error(y_true=y_train, y_pred=train_pred)

In [None]:
# Mean absolute error on the test split.
test_pred = bst.predict(X_test[categorical + numerical])
mean_absolute_error(y_true=y_test, y_pred=test_pred)

In [None]:
# Mean absolute error on the validation split, which is not passed to lgb.train.
valid_pred = bst.predict(X_valid[categorical + numerical])
mean_absolute_error(y_true=y_valid, y_pred=valid_pred)