## Feature Importance

- Check the importance of the features in a dataset using 4 different techniques: LOFO, XGB, LGBM and RandomForest
- Uses Cross-Validation for Gridsearch of varying parameter values

### Load Packages

In [None]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
from sqlalchemy import create_engine as ce
from datetime import *
from tqdm import tqdm
import scipy.stats as ss
import seaborn as sns
from sklearn import *
from sklearn.inspection import permutation_importance
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.feature_selection import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import *
from sklearn.impute import SimpleImputer
from itertools import *
import re
import shap
import random
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
from lofo import LOFOImportance, Dataset, plot_importance
import time

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings
warnings.filterwarnings("ignore")

### Set Parameters

In [27]:
# Features whose importance is to be checked
features = [
    "feature_1",
    "feature_2",
    "feature_3",
    "feature_4",
    "feature_5",
    "feature_6",
    "feature_7",
]

# All target metrics that might be used for further analysis (lower-case names)
targets = [
    'main_target_metric',
    'target_metric_1',
    'target_metric_2',
]

# Target metric the importance models are fitted against (lower-case name)
target = 'main_target_metric'

# Clip metric values: 'yes' or 'no'
clip_metric = 'yes'
# [min, max] values used to limit the metric when clipping is enabled
clip_metric_limit = [0, 3000]
                  

In [30]:
# Date window for the fetch. The values carry their own single quotes because they are
# concatenated verbatim into the SQL text of the data-fetch query.
start_date = "'2023-05-01'"                         # Date is inclusive (query uses >=)
end_date = "'2023-08-15'"                           # Date is inclusive (query uses <=)
# NOTE(review): if the time column stores timestamps rather than dates, `<= end_date` may
# exclude rows later than midnight on the end date - TODO confirm against the table schema.

# Database connection settings (placeholders - replace before running).
# NOTE(review): avoid committing real credentials with the notebook.
ip = 'xxx.xx.xxx.xxx'                               # IP address
# Placeholder: replace `xxxx` with the numeric port. No name `xxxx` is defined in the
# visible code, so running this line unchanged raises NameError.
port = xxxx
user = 'username'                                   # username
pass_ai = 'password'                                # password of the user
db = 'dbname'                                       # name of schema

main_table = 'table_name'                           # name of table
# NOTE: this name shadows the Python builtin `filter`; it is kept because the
# data-fetch query reads it under this name.
filter = "isrelevant=1"                             # extra SQL condition appended to the WHERE clause
time_column = 'calltime'                            # column name in table for calltime
prefix = 'week'                                     # For week-wise ('week') or daily ('date') stability charts for created bins

# SQL tuple literal of dates to exclude; used as `<time_column> not in <dates_to_filter>`
dates_to_filter = "('2022-05-01')"                  # If some dates need to be removed from data

# NAs can be spelled in many different ways in the dataset. All of the following are converted to numpy.nan
nas_to_replace = ['NA', 'NULL', 'NUL', 'NaN', '[NA]', 'nan', 'NAN', ' ']

# unexpected_numeric_values = {'original' : 999999999.0, 'replace' : 999999.0}
# unexpected_string_values = {'original' : '999999999.0', 'replace' : '999999.0'}
# special_char = ['$', '&', '%']

In [32]:
# Number of top features for selection
top_features = 10
# Technique for calculating feature importance; one of ['LOFO', 'RandomForest', 'XGB', 'LGBM']
feature_importance_technique = 'LOFO'

### Data Fetch

In [35]:
# Data is fetched through the following SQL query.
# The statement is assembled from the parameter cells: the time column, every feature and
# every target are selected from `main_table`, restricted to the [start_date, end_date]
# window, the extra `filter` condition, and excluding the dates in `dates_to_filter`.
# NOTE: identifiers and literals are concatenated straight into the SQL text, so the
# parameter cells must only ever contain trusted values.

# Joining one combined list avoids stray commas when `features` or `targets` is empty.
selected_columns = ','.join([time_column] + list(features) + list(targets))

query = (
    "Select " + selected_columns
    + " from " + main_table
    + " where " + time_column + " >= " + start_date
    + " and " + time_column + " <= " + end_date
    + " and " + filter
    # A single "and" here: the previous text " and and " produced invalid SQL.
    + " and " + time_column + " not in " + dates_to_filter
    + " ;"
)
print(query)

# `ce` is sqlalchemy.create_engine; the connection URL is built from the connection parameters
ai_conn = ce('mysql://'+user+':'+pass_ai+'@'+ip+':'+str(port)+'/'+db)
data = pd.read_sql(query, ai_conn)

print("data fetched successfully : ", data.shape)

#### Data processing

In [38]:
# Lower-case every column name so the columns match the lower-case names in the parameter cells
data = data.rename(columns=lambda name: name.lower())

# Turn every NA spelling listed in nas_to_replace into numpy.nan, one column at a time
for column_name in data.columns:
    data[column_name] = data[column_name].replace(nas_to_replace, np.nan)

# When clipping is switched on, cast the target to float and limit it to [min, max]
if clip_metric == 'yes':
    data[target] = data[target].astype(float).clip(
        lower=clip_metric_limit[0],
        upper=clip_metric_limit[1],
    )

### Feature Importance

In [48]:
# This cell defines one function per importance technique: LOFO, XGB, LGBM and RandomForest.
# The parameter grids below are arbitrary; tune them to the size of the data and the
# granularity desired.
# For XGB and LGBM the importance is derived from SHAP values of the best grid-search model.

# Cross-validation splitter shared by every grid search (and by LOFO):
# KFold with 4 splits and no shuffling. Change according to need.
cv = KFold(shuffle=False, n_splits=4)

# SHAP is a widely used tool for explaining machine learning models. It is used here for XGB and LGBM.
def shap_importance(model, X, y):
    """Return the SHAP values of ``model`` computed on ``X``.

    A ``shap.TreeExplainer`` is built for ``model`` and its ``shap_values``
    are evaluated on ``X``. ``y`` is accepted for signature symmetry with the
    callers but is not used.
    """
    return shap.TreeExplainer(model).shap_values(X)

# LOFO refers to the Leave-One-Feature-Out technique of importance.
def lofo_importance(data, X, target):
    """Compute LOFO importance with a grid-searched LightGBM regressor.

    Parameters:
        data: DataFrame holding both the feature columns and the ``target`` column.
        X: DataFrame of feature columns; its column names define the LOFO features.
        target: name of the target column in ``data``.

    Returns:
        The importance table produced by ``LOFOImportance.get_importance()``.

    The grid search and LOFO both use the module-level ``cv`` splitter, and
    LOFO is scored with ``'r2'``.
    """
    # Only genuine hyper-parameters belong in the grid. `n_jobs` merely sets the
    # number of threads, so searching over it tripled the number of fits without
    # changing the fitted model; it is left at the estimator default instead.
    params = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.2, 0.1, 0.01],
        'num_leaves': [50, 100, 150],
    }
    estimator = lgb.LGBMRegressor()
    grid_search = GridSearchCV(estimator, params, cv=cv)
    grid_search.fit(X, data[target])

    # Pass a plain list of names rather than a pandas Index
    dataset = Dataset(df=data, target=target, features=list(X.columns))
    lofo_imp = LOFOImportance(dataset, cv=cv, scoring='r2', model=grid_search.best_estimator_)
    importance = lofo_imp.get_importance()
#     plot_importance(importance, figsize=(12,30))

    return importance

# XGB is a widely used gradient boosting technique. It requires all its feature values to be float.
def xgb_importance(X, y):
    """Grid-search an ``XGBRegressor`` on ``(X, y)`` and return SHAP values of the best model.

    The search runs over ``n_estimators``, ``learning_rate`` and ``max_depth``
    using the module-level ``cv`` splitter; the SHAP values are computed on ``X``
    through ``shap_importance``.
    """
    search_space = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.2, 0.1, 0.01],
        'max_depth': [2, 5, 7],
    }
    tuner = GridSearchCV(XGBRegressor(), search_space, cv=cv)
    tuner.fit(X, y)
    return shap_importance(tuner.best_estimator_, X, y)

# LGBM is the Light Gradient Boosting Method.
def lgbm_importance(X, y):
    """Grid-search an ``LGBMRegressor`` on ``(X, y)`` and return SHAP values of the best model.

    The search runs over ``n_estimators``, ``learning_rate``, ``max_depth`` and
    ``num_leaves`` using the module-level ``cv`` splitter; the SHAP values are
    computed on ``X`` through ``shap_importance``.
    """
    search_space = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.2, 0.1, 0.01],
        'max_depth': [2, 5, 7],
        'num_leaves': [50, 100, 150],
    }
    tuner = GridSearchCV(LGBMRegressor(importance_type='gain'), search_space, cv=cv)
    tuner.fit(X, y)
    return shap_importance(tuner.best_estimator_, X, y)

# Random Forest is one of the widely used Machine Learning techniques.
def randomforest_importance(X, y):
    """Grid-search a ``RandomForestRegressor`` and rank features by its importances.

    The search runs over ``n_estimators``, ``max_depth`` and ``min_samples_split``
    using the module-level ``cv`` splitter.

    Returns:
        DataFrame with columns ``'Feature'`` and ``'Importance'`` (the best
        estimator's ``feature_importances_``), sorted by importance in
        descending order.
    """
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [2, 5, 7],
        'min_samples_split': range(500, 5000, 500),
    }
    tuner = GridSearchCV(RandomForestRegressor(), search_space, cv=cv)
    tuner.fit(X, y)

    ranking = pd.DataFrame({
        'Feature': X.columns,
        'Importance': tuner.best_estimator_.feature_importances_,
    })
    return ranking.sort_values('Importance', ascending=False)

In [114]:
# Create a copy of the dataset so the fetched data stays untouched
final_data = data.copy()

# XGB requires all its feature values to be of float datatype, so categorical features are
# label encoded. Provide the categorical features in the cate_cols list below to have them
# label encoded.
cate_cols = [] # to be provided by user
# Be cautious: encoding can hinder performance on a large dataset or when a feature has
# many distinct values.

le = LabelEncoder()

for x in cate_cols:
    if x != target:
        # Values are stringified first, so missing values become their own category
        final_data[x] = [str(v) for v in final_data[x]]
        final_data[x] = le.fit(final_data[x]).transform(final_data[x])

    if feature_importance_technique == 'XGB' :
        final_data[x] = final_data[x].astype('float')
    else :
        final_data[x] = final_data[x].astype('category')
# Remaining missing values (in every column, including the target) are replaced by 0
final_data = final_data.fillna(0)

y = final_data[target]
# Remove every target metric - not only the active one - together with the time column.
# Previously the other entries of `targets` stayed in X and were ranked as if they were
# features, leaking target information into the importance results.
drop = list(dict.fromkeys([target, time_column] + list(targets)))
X = final_data.drop(drop, axis=1)

In [None]:
start_time = time.time()            # Time check to check performance

# Whichever branch runs, it leaves `feature_importance` (the full ranking) and
# `important_features` (names of the `top_features` best features) behind.
# If the data restricted to the selected features is required, use:
#     important_features_data = final_data[important_features]

if feature_importance_technique == 'LOFO':
    # LOFO returns a table whose 'feature' column holds the feature names
    feature_importance = lofo_importance(final_data, X, target)
    important_features = feature_importance['feature'].head(top_features).to_list()

elif feature_importance_technique in ('XGB', 'LGBM'):
    # Both boosting techniques return SHAP values; rank features by mean absolute SHAP value
    values = (xgb_importance if feature_importance_technique == 'XGB' else lgbm_importance)(X, y)
    importance = abs(values).mean(axis=0)
    feature_importance = sorted(
        zip(X.columns, importance),
        key=lambda pair: pair[1],
        reverse=True,
    )
    important_features = [feat for feat, _ in feature_importance[:top_features]]

else:
    # Any other value of the technique falls back to RandomForest
    feature_importance = randomforest_importance(X, y)
    important_features = feature_importance['Feature'].head(top_features).to_list()

end_time = time.time()
print("Elapsed time is ", round(end_time - start_time, 4), 'seconds.')

In [None]:
# Plots the feature importances computed by the selected technique

if feature_importance_technique == 'LOFO':
    plot_importance(feature_importance, figsize=(10, 10))

elif feature_importance_technique in ('XGB', 'LGBM'):
    # Both boosting techniques are visualised as a SHAP bar summary
    sns.set_style('whitegrid')
    shap.summary_plot(values, features=X, feature_names=X.columns, plot_type='bar', plot_size=0.5)

else:
    # sns.set() re-applies the whole theme, so calling it after set_style('whitegrid')
    # reverted the style to seaborn's default. Passing the style in the same call keeps
    # whitegrid together with the figure size.
    sns.set(style='whitegrid', rc={'figure.figsize': (11, 10)})
    sns.barplot(x='Importance', y='Feature', data=feature_importance, color='b')
    

In [None]:
# List of important features

# A single formatted string avoids the doubled spaces that print() produced when it
# joined the space-padded pieces with its own separator.
print(f"List of top {top_features} features from {feature_importance_technique} technique")
# Bare expression: shown as the cell output in a notebook
important_features