In [0]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost
!pip install hyperopt

In [0]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

In [0]:
import pandas as pd
import numpy as np

#from sklearn.dummy import DummyRegressor
#from sklearn.tree import DecisionTreeRegressor
#from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL

import eli5
from eli5.sklearn import PermutationImportance

In [5]:
# Load the car dataset from the HDF5 file; the relative path resolves against
# the working directory chosen by the earlier `cd` cell.
df = pd.read_hdf(path_or_buf="data/car.h5")
# Cell output: (rows, columns).
df.shape

(106494, 155)

Feature engineering

In [0]:
# Integer-encode (factorize) every column of `df`.
# A column whose name already contains the suffix is overwritten with its
# codes; any other column keeps its raw values and gets a sibling
# "<name>__cat" column holding the codes.
SUFFIX_CAT = "__cat"

# Iterate over a snapshot of the column names: the loop adds columns to `df`.
for feat in list(df.columns):
  # The first row is used as a type probe; list-valued cells are unhashable
  # and cannot be factorized, so such columns are skipped. `.iloc[0]` is
  # positional, so the probe works whatever labels the index carries
  # (`df[feat][0]` is a label lookup and needs the label 0 to exist).
  if isinstance(df[feat].iloc[0], list):
    continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

In [7]:
# Factorized columns that are candidates for model input: names containing the
# suffix, excluding anything price-related (price is what gets predicted).
cat_feats = [
    col
    for col in df.columns
    if SUFFIX_CAT in col and "price" not in col
]
# Cell output: number of candidate features.
len(cat_feats)

151

In [0]:
# The 20 most important features, identified in an earlier analysis.
# All entries start out as factorized ("__cat") columns; the cells that follow
# replace positions 1, 5 and 9 with numerically parsed columns.
feats = [
    "param_napęd__cat",
    "param_rok-produkcji__cat",
    "param_stan__cat",
    "param_skrzynia-biegów__cat",
    "param_faktura-vat__cat",
    "param_moc__cat",
    "param_marka-pojazdu__cat",
    "feature_kamera-cofania__cat",
    "param_typ__cat",
    "param_pojemność-skokowa__cat",
    "seller_name__cat",
    "feature_wspomaganie-kierownicy__cat",
    "param_model-pojazdu__cat",
    "param_wersja__cat",
    "param_kod-silnika__cat",
    "feature_system-start-stop__cat",
    "feature_asystent-pasa-ruchu__cat",
    "feature_czujniki-parkowania-przednie__cat",
    "feature_łopatki-zmiany-biegów__cat",
    "feature_regulowane-zawieszenie__cat",
]

In [0]:
# param_rok-produkcji (production year): cast to int, with -1 standing in for
# a missing value (one whose text form is "None").
def _year_to_int(raw):
  if str(raw) == "None":
    return -1
  return int(raw)

df['param_rok-produkcji'] = df['param_rok-produkcji'].map(_year_to_int)
# Position 1 of `feats` held the factorized year; use the numeric column instead.
feats[1] = 'param_rok-produkcji'

In [0]:
# param_moc (engine power): text such as '1 116 KM' -> integer 1116,
# with -1 standing in for a missing value (one whose text form is "None").
def _power_to_int(raw):
  """Parse a power value like '1 116 KM' into an int; return -1 for None.

  The value is passed through str() first so that the cell can be re-run:
  on a second execution the column already holds ints, and calling
  `.replace` on an int would raise AttributeError.
  """
  if str(raw) == "None":
    return -1
  # Drop the thousands-separator spaces, then the trailing "KM" unit.
  return int(str(raw).replace(" ", "").strip("KM"))

df['param_moc'] = df['param_moc'].map(_power_to_int)
# Swap the factorized column for the numeric one by name rather than by
# position, so the result does not depend on the order of `feats` and a
# second run is a no-op. The slice assignment keeps the same list object.
feats[:] = ['param_moc' if name == 'param_moc__cat' else name for name in feats]

In [0]:
# param_pojemność-skokowa (engine displacement): text such as '1 560 cm3' ->
# integer 1560, with -1 standing in for a missing value (text form "None").
def _displacement_to_int(raw):
  """Parse a displacement like '1 560 cm3' into an int; return -1 for None.

  The value is passed through str() first so that the cell can be re-run:
  on a second execution the column already holds ints, and calling
  `.split` on an int would raise AttributeError.
  """
  if str(raw) == "None":
    return -1
  # Keep the part before the "cm3" unit, then drop the separator spaces.
  return int(str(raw).split("cm3")[0].replace(" ", ""))

df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(_displacement_to_int)
# Swap the factorized column for the numeric one by name rather than by
# position, so the result does not depend on the order of `feats` and a
# second run is a no-op. The slice assignment keeps the same list object.
feats[:] = [
    'param_pojemność-skokowa' if name == 'param_pojemność-skokowa__cat' else name
    for name in feats
]

Definition of the model-evaluation function

In [0]:
def run_model(model, feats):
  """Cross-validate `model` on selected columns of the notebook-level `df`.

  Args:
    model: estimator passed to sklearn's `cross_val_score`.
    feats: column names of `df` used as the input matrix.

  Returns:
    Tuple (mean, std) of the 3-fold "neg_mean_absolute_error" scores.
    The target is the `price_value` column.
  """
  inputs = df[feats].values
  target = df['price_value'].values

  fold_scores = cross_val_score(
      model,
      inputs,
      target,
      cv=3,
      scoring="neg_mean_absolute_error",
  )

  return fold_scores.mean(), fold_scores.std()

Let's run the model

In [15]:
# Baseline XGBoost settings used before any tuning.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.1,
    seed=0,
)

model = xgb.XGBRegressor(**xgb_params)

# Cell output: (mean, std) of the cross-validated negative MAE.
run_model(model, feats)



(-9621.119663721702, 100.59307679134791)

Parameter optimization


In [0]:
# One approach is grid search, which is resource-consuming.
# The other is hyperopt, which uses Bayesian optimization.
def obj_func(params):
  """Hyperopt objective: cross-validated MAE of an XGBRegressor.

  Args:
    params: keyword arguments for `xgb.XGBRegressor`, as sampled by hyperopt.

  Returns:
    {'loss': <positive MAE>, 'status': STATUS_OK} when training succeeds, or
    {'status': STATUS_FAIL} when it raises. A failed trial reports no loss,
    because none was computed.
  """
  print("Training with params:")
  print(params)
  try:
    mean_mae, _ = run_model(xgb.XGBRegressor(**params), feats)
  except Exception as exc:
    # `mean_mae` is unbound here, so it must not be referenced. Catching
    # Exception (not a bare except) lets KeyboardInterrupt stop the search.
    print("Training failed:", repr(exc))
    return {'status': STATUS_FAIL}

  # run_model returns the negated MAE; hyperopt minimizes, so pass the magnitude.
  return {'loss': np.abs(mean_mae), 'status': STATUS_OK}

In [0]:
# Search space for hyperopt: four sampled dimensions plus three fixed settings.
learning_rate_options = np.arange(0.05, 0.31, 0.05)  # roughly 0.05, 0.10, ..., 0.30
max_depth_options = np.arange(5, 16, 1, dtype=int)   # 5, 6, ..., 15

xgb_reg_params = dict(
    learning_rate=hp.choice("learning_rate", learning_rate_options),
    max_depth=hp.choice("max_depth", max_depth_options),
    subsample=hp.quniform("subsample", 0.5, 1, 0.05),
    colsample_bytree=hp.quniform("colsample_bytree", 0.5, 1, 0.05),
    objective='reg:squarederror',
    n_estimators=100,
    seed=0,
)

In [30]:
# Run the TPE search. max_evals is kept at 3 only so that the cell finishes
# quickly; raise it for a real search.
# NOTE(review): for hp.choice dimensions hyperopt is documented to report the
# index of the chosen option rather than its value — confirm, and decode with
# hyperopt.space_eval before reusing `best`.
best = fmin(
    fn=obj_func,
    space=xgb_reg_params,
    algo=tpe.suggest,
    max_evals=3,
)

Training with params:
{'colsample_bytree': 0.8, 'learning_rate': 0.25, 'max_depth': 11, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.55}
Training with params:
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.7000000000000001}
Training with params:
{'colsample_bytree': 0.65, 'learning_rate': 0.15000000000000002, 'max_depth': 13, 'n_estimators': 100, 'objective': 'reg:squarederror', 'seed': 0, 'subsample': 0.9}
100%|██████████| 3/3 [02:40<00:00, 59.07s/it, best loss: 7811.662262978122]
