In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.externals import joblib
from sklearn import metrics
import lightgbm as lgb

sys.path.append('../')
from pipelines import main
import pipeline_config as cfg

EXPERIMENT_DIR = 'YOUR/PATH/TO/EXPERIMETS'

# Solution 2

Local CV: **0.2197**
Public LB: **0.2248**

In [None]:
# Call the project's `main` with the solution config and training disabled.
# Whatever `main` returns is rendered as this cell's output.
solution_config = cfg.SOLUTION_CONFIG
main(solution_config, train_mode=False)

## Feature Extraction

1. Dataframe features (train.csv)

    * numerical features - for now just the log(price) and item_seq

    * target encoded features - likelihood encoding for categorical features

    * groupby aggregations - features calculated based on aggregated categories

2. Text Features

    * Hand-crafted features such as text length vs. word count

    * Word overlap between columns such as `category_name` and `description`

    * TFIDF on description and title

3. Image Features

    * Image statistics such as histograms, per-channel mean and variance, blur, etc.


## Models
* lightgbm with the following parameters

```yaml
  lgbm_random_search_runs: 0
  lgbm__boosting_type: 'gbdt'
  lgbm__objective: regression
  lgbm__metric: RMSE
  lgbm__number_boosting_rounds: 10000
  lgbm__early_stopping_rounds: 50
  lgbm__learning_rate: 0.2
  lgbm__num_leaves: 90
  lgbm__max_depth: 27
  lgbm__min_child_samples: 20
  lgbm__max_bin: 1300
  lgbm__subsample: 0.8
  lgbm__subsample_freq: 1
  lgbm__colsample_bytree: 0.7
  lgbm__min_child_weight: 12
  lgbm__reg_lambda: 6.0
  lgbm__reg_alpha: 3.4e-05
  lgbm__scale_pos_weight: 1

```

## Result Exploration

In [None]:
# Read the evaluation predictions written for solution 2 into a DataFrame.
solution_dir = os.path.join(EXPERIMENT_DIR, 'solution_2')
prediction_filepath = os.path.join(solution_dir, 'evaluation_predictions.csv')
evaluation_predictions = pd.read_csv(prediction_filepath)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim, so the same CSV is read twice.
# `worst_predictions`, displayed in the next cell, is not assigned anywhere in the visible
# notebook; presumably this cell was meant to derive it from `evaluation_predictions`
# (e.g. by ranking rows by prediction error) - TODO confirm and restore the missing code.
prediction_filepath = os.path.join(EXPERIMENT_DIR, 'solution_2', 'evaluation_predictions.csv')
evaluation_predictions =  pd.read_csv(prediction_filepath)

In [None]:
# Display the first three rows of `worst_predictions`.
# NOTE(review): `worst_predictions` is not assigned in any visible cell, so on a fresh
# kernel (Restart & Run All) this line raises NameError - TODO define it before this cell.
worst_predictions.head(3)

## Model exploration

In [None]:
# Load the persisted `light_gbm` artefact of solution 2 with joblib.
# joblib files are pickle-based: only load artefacts produced by a trusted run.
transformers_dir = os.path.join(EXPERIMENT_DIR, 'solution_2', 'transformers')
model_filepath = os.path.join(transformers_dir, 'light_gbm')
light_gbm_model = joblib.load(model_filepath)

### Learning Curve

In [None]:
# Plot the recorded RMSE from the evaluation history stored with the loaded model.
evals_result = light_gbm_model['evals_result']
ax = lgb.plot_metric(evals_result, metric='rmse')
plt.show()

### Feature Importance

In [None]:
# Draw the 20 most important features of the stored estimator on a wide figure.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 10))
estimator = light_gbm_model['estimator']
lgb.plot_importance(estimator, ax=ax, max_num_features=20)
plt.show()