# env config

In [None]:
# LightGBM surrogate-model training pipeline for variable feature importance

In [None]:
import os
import sys
import shutil

# user-specific locations: the Utils repo checkout and the secret folder
username = 'guxia'
repo_dir = f'/projects/gds-focus/data/{username}/Utils/'
secret_path = f'/projects/{username}/secret/'

# all relative paths used later are resolved from the variable_check folder
os.chdir(f'{repo_dir}/variable_check/')

# make the secret folder importable; the membership test keeps re-runs of
# this cell from appending the same entry again
for p in (secret_path,):
    if p not in sys.path:
        sys.path.append(p)

import pandas as pd
import numpy as np

import aml.cloud_v1 as cloud

# (re)load the IPython extension module cloudmagics.bigquery
%reload_ext cloudmagics.bigquery
# set the `domain` and `autolimit` options of the PPMagics configurable
# NOTE(review): autolimit=0 presumably removes the row limit on query results - confirm
%config PPMagics.domain="ccg24-hrzana-gds-focus"
%config PPMagics.autolimit=0
# NOTE(review): %url and %ppauth are not IPython built-ins; presumably they come from
# the extension loaded above and select the 'horton' connection / authenticate - confirm
%url -c horton
%ppauth

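Calculate the feature importance of every input variable of a given model. Only the model's input variables and its output score are required: a LightGBM regressor is trained to reproduce the model score from the inputs, and the feature importance (gain and split count) of that regressor is reported.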

# data & hyper param

In [None]:
# --- inputs ---
# directory that is searched for the model file
guanjia_model_dir = 'guanjia_model_dir_latam23'
# parquet location of the driver data that gets scored
driver_data_dir = 'data/cbp_madmen_parquet_20230601_20230630'
# text file listing the categorical variable names, read as a single column
categorical_var_path = 'categorical.txt'

# --- output ---
# csv file that receives the feature importance table
fi_save_path = 'latam23_cbp_202306_model_var_fi.csv'


In [None]:
from pyScoring import UMEModel


# load model
# The model file is discovered by name inside guanjia_model_dir. The match is
# loose (any file name containing '.m'), so the candidates are sorted to make
# the choice independent of the arbitrary order returned by os.listdir.
model_candidates = sorted(f for f in os.listdir(guanjia_model_dir) if '.m' in f)
if not model_candidates:
    # fail with an explicit message instead of a bare IndexError
    raise FileNotFoundError(
        f"no model file (name containing '.m') found in {guanjia_model_dir}")
if len(model_candidates) > 1:
    # make an ambiguous directory visible instead of silently picking one file
    print(f'multiple model candidates {model_candidates}, using {model_candidates[0]}')
ume_model_path = os.path.join(guanjia_model_dir, model_candidates[0])

print(f'loading model from {ume_model_path}')
ume_model = UMEModel(ume_model_path)

# categorical variable names: the file is read without a header row into a
# single column named 'var'
categorical = pd.read_csv(categorical_var_path, names=['var'])['var'].to_list()
print(f'loaded categorical num: {len(categorical)}')

In [None]:
%%time
# load driver
# read the parquet data, turn the placeholder values below into NaN and
# renumber the rows from 0
missing_markers = ['', 'None', 'NONE', 'NA', '-999.0', -999.0]
driver_df = (
    pd.read_parquet(driver_data_dir)
    .replace(missing_markers, np.nan)
    .reset_index(drop=True)
)

print('checking cbp variables, driver size: ', driver_df.shape)

# model variable fi

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split


# the inputs of the scored model are the features of the regression
feature_list = ume_model.inputs
print('total feature num: ', len(feature_list))

# model inputs that are listed as categorical variables
cate_columns = [name for name in feature_list if name in categorical]
print('total categorical num: ', len(cate_columns))

# the first model output is the regression target
target_col = ume_model.outputs[0]
print(f'regression for target: {target_col}')

# LightGBM training parameters; the number of boosting rounds and the
# early-stopping patience are passed through this dict as well
param = dict(
    boosting_type='gbdt',
    num_leaves=2**5 - 1,
    objective='regression_l2',
    max_depth=5,
    min_data_in_leaf=5,
    learning_rate=0.005,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    metric=['l1', 'l2'],
    num_threads=4,
    verbose=-1,
    num_boost_round=100,
    early_stopping_rounds=50,
)

In [None]:
%%time
# convert data type
# every model input must exist in the driver; categorical variables become
# pandas 'category', all other inputs are cast to float
for col in feature_list:
    assert col in driver_df.columns, f'{col} not found in driver'

    wanted_dtype = 'category' if col in categorical else float
    driver_df[col] = driver_df[col].astype(wanted_dtype)

# predict model score
score = ume_model.predict_pandas(driver_df[feature_list])

# remove an existing target column first, so the join below cannot hit a
# column-name clash (this also keeps the cell re-runnable)
if target_col in driver_df.columns:
    driver_df = driver_df.drop(columns=target_col)

# attach the predicted score (joined on the index) and store it as float
driver_df = driver_df.join(score[target_col])
driver_df[target_col] = driver_df[target_col].astype(float)

In [None]:
%%time
# train a regression model and get fi
# A LightGBM regressor is fit to reproduce the model score (target_col) from
# the model inputs (feature_list); its feature importance is read afterwards.

assert target_col in driver_df.columns

x = driver_df[feature_list]
y = driver_df[target_col]
# hold out 10% of the rows for validation; fixed seed for a repeatable split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=0)
print("train data shape", x_train.shape)

dtrain = lgb.Dataset(x_train, y_train, categorical_feature=cate_columns)
dval = lgb.Dataset(x_val, y_val, reference=dtrain, categorical_feature=cate_columns)

try:
    # NOTE(review): `verbose_eval` was removed from lgb.train in LightGBM 4.0;
    # on newer versions use callbacks=[lgb.log_evaluation(50)] instead - confirm
    # the installed version before upgrading.
    bst = lgb.train(param,
                    dtrain,
                    valid_sets=[dtrain, dval],
                    verbose_eval=50)
except Exception as e:
    print(f"error with model", e)
    # Re-raise instead of sys.exit(): exiting discarded the real traceback and
    # reported a successful exit status although training had failed.
    raise



In [None]:
# get feature importance
# one (variable, gain, split count) tuple per feature of the trained booster
names = bst.feature_name()
gains = bst.feature_importance(importance_type='gain')
split_counts = bst.feature_importance()
fi = list(zip(names, gains, split_counts))

# table ordered from the largest gain to the smallest
fi_df = pd.DataFrame(data=fi, columns=['variable', 'gain', 'split_cnt']).sort_values(
    by='gain', ascending=False)

fi_df.to_csv(fi_save_path, index=False)