**Table of contents**<a id='toc0_'></a>    
- [Process inputs](#toc1_)    
- [Read and Prep data](#toc2_)    
- [Train model limiting no interaction](#toc3_)    
- [Predict and evaluate](#toc4_)    
- [Train model 2nd layer to just find interactions](#toc5_)    
- [Evaluate](#toc6_)    
- [Train 3 model and evaluate](#toc7_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [None]:
import pandas as pd
import lightgbm as lgb
from func import calculate_prediction_quality, calculate_ranking_metrics, get_feature_importance

# <a id='toc1_'></a>[Process inputs](#toc0_)

In [None]:
target = 'qualidade_global_media_dada_pela_equipe_'
categorical_features = ['laboratorio', 'fora_de_tipo_x_tipo_unico']
data_path = "../data/processed/df_model.parquet"

# <a id='toc2_'></a>[Read and Prep data](#toc0_)

In [None]:
data = pd.read_parquet(data_path)

# Turn categorical variables to categorical type for lightgbm split variables as categorical
for cat_var in categorical_features:
    data[cat_var] = pd.Categorical(data[cat_var])

# Isolate and remove response column from data
response = data[target].values
trn_data = data.drop(columns=[target])

# Create lgbm dataset
trn_dset = lgb.Dataset(trn_data, label=response, params={'max_bin': 5000})

# <a id='toc3_'></a>[Train model limiting no interaction](#toc0_)

In [None]:
mdl = lgb.train(params={"objective":"regression", 
                        "num_boost_round":10000, 
                        "max_depth":1, 
                        "learning_rate":1,
                        'max_bin': 5000,
                        }, train_set=trn_dset)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 876
[LightGBM] [Info] Number of data points in the train set: 626, number of used features: 28
[LightGBM] [Info] Start training from score 5.640788


# <a id='toc4_'></a>[Predict and evaluate](#toc0_)

In [None]:
prd = mdl.predict(trn_data)
calculate_prediction_quality(list(response),list(prd))

{'rmse': 0.07349840531714019,
 'mae': 0.05453529413410752,
 'r2': 0.9965613359501102,
 'gini': 0.12188122618513741}

# <a id='toc5_'></a>[Train model 2nd layer to just find interactions](#toc0_)

In [None]:
int_dset = lgb.Dataset(trn_data, label=response, init_score=prd)

int_mdl = lgb.train(params={"objective":"regression", 
                            "num_boost_round":10000, 
                            "max_depth":2, 
                            "learning_rate":1, 
                            "extra_trees" :True}, 
                            train_set=int_dset)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 876
[LightGBM] [Info] Number of data points in the train set: 626, number of used features: 28


# <a id='toc6_'></a>[Evaluate](#toc0_)

In [None]:
int_prd = int_mdl.predict(trn_data) + prd
calculate_prediction_quality(list(response),list(int_prd))

{'rmse': 0.0008243944769605448,
 'mae': 0.0004722379430525522,
 'r2': 0.9999995673825206,
 'gini': 0.12220892887011103}

# <a id='toc7_'></a>[Train 3 model and evaluate](#toc0_)

In [8]:
int3_dset = lgb.Dataset(trn_data, label=response, init_score=int_prd)

int3_mdl = lgb.train(params={"objective":"regression", 
                            "num_boost_round":10000, 
                            "max_depth":3, 
                            "learning_rate":1, 
                            "extra_trees" :True}, 
                            train_set=int3_dset)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 876
[LightGBM] [Info] Number of data points in the train set: 626, number of used features: 28


In [9]:
int3_prd = int3_mdl.predict(trn_data) + int_prd
calculate_prediction_quality(list(response),list(int3_prd))

{'rmse': 2.2246182165348786e-07,
 'mae': 1.554587074519446e-07,
 'r2': 0.9999999999999685,
 'gini': 0.12220892887011103}

In [20]:
calculate_ranking_metrics(list(response),list(int3_prd))

{'spearman_correlation': 0.9971798503354874,
 'kendall_tau': 0.9733766149714134,
 'ranking_error': 0.026523696271269482,
 'ordering_accuracy': 0.9491476038338658}

# Feature importance
## Feature importance main effects

In [24]:
main_imp = get_feature_importance(mdl,trn_data.columns)
main_imp

Unnamed: 0_level_0,gain
Feature,Unnamed: 1_level_1
_acidez_qualidade,1536
_docura,1322
_acidez_intensidade,1306
_amargor,1288
_intensidade,1191
_adstringencia,968
odor_do_po,783
_corpo,765
_frutado_,76
_queimado_defumado_,72


## Feature importance 2 way interactions

In [21]:
get_feature_importance(int_mdl,trn_data.columns)

Unnamed: 0_level_0,gain
Feature,Unnamed: 1_level_1
_intensidade,2172
odor_do_po,1942
_chocolate_cacau_,1718
_amargor,1681
_velho_oxidado_,1631
_corpo,1555
_queimado_defumado_,1547
_adstringencia,1492
_acidez_intensidade,1481
_madeira_papelao_,1469


## Feature importance 3 way interactions

In [23]:
get_feature_importance(int3_mdl,trn_data.columns)

Unnamed: 0_level_0,gain
Feature,Unnamed: 1_level_1
_madeira_papelao_,4285
_chocolate_cacau_,4081
_velho_oxidado_,3875
_queimado_defumado_,3779
odor_do_po,3545
_intensidade,3394
_adstringencia,2568
_tostado_,2466
_acidez_intensidade,2439
_amargor,2399


# Model Iterrogation

In [None]:
for r in main_imp.itertuples():
    plo

1536
1322
1306
1288
1191
968
783
765
76
72
69
63
63
62
58
56
47
42
41
40
33
32
24
21
18
13
6
5
0
0
0
0
0
0


# Interaction analysis

In [None]:
from shap import TreeExplainer

exp = TreeExplainer(int_mdl)
shap = exp.shap_values(trn_data)

from shap.utils import approximate_interactions

int_order = {}
for col in trn_data.columns:
   int_order[col] = approximate_interactions(index=col, shap_values=shap, X=trn_data, feature_names=trn_data.columns)

"\nfeature_name = 'example_feature'\nplot = create_shap_density_plot(\n    feature_values=df[feature_name],\n    shap_values=shap_values[feature_name],\n    feature_name=feature_name\n)\nplt.show()\n"

In [56]:
shap_df = pd.DataFrame(shap, columns=trn_data.columns)
shap_df

Unnamed: 0,laboratorio,fora_de_tipo_x_tipo_unico,odor_do_po,_docura,_corpo,_acidez_intensidade,_acidez_qualidade,_amargor,_adstringencia,_intensidade,...,_frutado_,_iodoformio_quimico_,_madeira_papelao_,_mel_,_queimado_defumado_,_terroso_mofo_,_tostado_,_vegetal_,_velho_oxidado_,_verde_herbaceo_
0,-0.026203,-0.038577,0.020818,0.031714,-0.056248,-0.001831,0.027028,0.037388,0.014677,-0.001826,...,0.003332,-0.000224,-0.013658,0.000129,-0.049725,-0.038532,-0.013097,0.0,0.000262,0.000612
1,-0.014350,-0.050825,-0.018492,0.019273,-0.094294,-0.002275,0.001779,0.032550,0.030636,0.029724,...,0.002137,0.000055,0.001583,-0.000255,-0.013049,-0.042372,0.032703,0.0,0.001496,0.006166
2,-0.034267,0.003230,-0.000839,0.066089,0.021843,0.015375,0.030223,0.004819,0.000437,0.007478,...,0.002154,-0.000942,0.026984,0.000381,-0.040708,-0.004960,-0.013453,0.0,0.010749,0.001360
3,-0.056802,0.005564,-0.014601,0.011103,0.003411,0.000294,0.047118,-0.018598,-0.007844,0.019562,...,0.002429,-0.000929,0.027602,-0.000060,-0.020164,0.009143,-0.011946,0.0,-0.000646,0.001134
4,-0.039077,0.003832,0.023650,-0.014237,-0.017920,-0.004416,-0.032110,-0.038409,-0.000145,0.000301,...,-0.000114,0.000897,0.032817,-0.001283,0.009328,0.002297,0.019823,0.0,0.010344,0.002704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,0.015409,-0.000705,-0.014828,-0.041837,-0.003677,0.003788,0.002851,0.032415,-0.004547,0.008256,...,0.013788,0.000060,0.018092,0.001138,-0.002441,-0.002786,-0.002442,0.0,0.018632,0.003728
622,0.011498,-0.002533,0.008377,-0.004954,-0.002355,-0.005196,0.028071,0.011704,-0.017997,-0.000107,...,0.012067,-0.000693,-0.039929,0.001272,-0.009526,-0.007194,-0.003283,0.0,0.016615,0.002037
623,0.010077,-0.002512,-0.002751,-0.032359,0.008948,0.004276,0.015482,-0.000765,-0.013368,0.013697,...,0.002768,0.001949,-0.043326,-0.000999,-0.003316,-0.004286,-0.010499,0.0,-0.000536,0.002210
624,0.016493,-0.003142,0.015232,-0.012476,0.022292,0.010936,0.022109,0.038317,-0.012880,0.018073,...,0.007683,0.000581,-0.017522,0.001207,-0.009032,-0.005805,0.014442,0.0,0.000553,0.000745


In [72]:
from func import create_shap_density_plot
plots = create_shap_density_plot(trn_data[name], shap_df[name], feature_name=name)



ImportError: cannot import name 'create_shap_density_plot' from 'func' (/home/01_Modelling/func.py)