## Feature Binning

##### 1. Classify features into Numeric, Categorical and Datetime features
##### 2. Create Bins based on Decision Trees
##### 3. Generate stability plots for created Bins
##### 4. Generate statements to be plugged-in the table view

- Uses Decision Trees to create bins for Numerical and Categorical Features
- Recommended to use OFF data since it doesn't contain the affected metric
- Numbers of Bins created can be controlled/limited by data volume and hyperparameters
- Metric to create Bins and Stability Plots of created Bins can be based on same or different metrics
- If Binomial Metric is used for binning; change 'DecisionTreeRegressor' to 'DecisionTreeClassifier' and scoring from 'r2' to 'accuracy'
- Naming convention in case statements are generic; change to reflect actual naming convention for your need

### Load Packages

In [1]:
# Import Packages needed

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly as pp
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import sqlalchemy as sa
from datetime import *
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sqlalchemy import create_engine as ce
import json

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings
warnings.filterwarnings("ignore")

### Set Parameters

In [2]:
# list of unbinned features that are needed to be binned 
features = ["unbinned_feature_1",
               "unbinned_feature_2", 
               "unbinned_feature_3", 
               "unbinned_feature_4", 
               "unbinned_feature_5", 
               "unbinned_feature_6", 
               "unbinned_feature_7"]

# list of all target metrics that might be used for further analysis; lowercase
targets = ['main_target_metric', 
           'target_metric_1', 
           'target_metric_2']            

target = 'main_target_metric'                   # target metric that will be used by decision tree;  small letters

clip_metric = 'yes'                             # Clip metric values : 'yes' or 'no'
clip_metric_limit = [0, 3000]                   # Min, Max values to limit metric


# Identify datetime col, if prefix/postfix already known (and does not appear in any other type of feature name)
# Any datetime feature name prefix/postfix except time_column
datetime_cols_id = ['dt_']                    

In [3]:
# Recommended not to use large datasets/large time periods. Often 30-35 days of data is enough to create reasonable bins
# Very large datasets would end up creating granular bins; If using large datasets, then pruning should be done accordingly

start_date = "'2023-05-01'"                         # Date is inclusive
end_date = "'2023-08-15'"                           # Date is inclusive

ip = 'xxx.xx.xxx.xxx'                               # IP address
port = xxxx
user = 'username'                                   # username
pass_ai = 'password'                                # password user 
db = 'dbname'                                       # name of schema

main_table = 'table_name'                           # name of table
filter = "isrelevant=1 and on_off = 0"              # filter for table
time_column = 'calltime'                            # column name in table for calltime
prefix = 'week'                                     # For week-wise ('week') or daily ('date') stability charts for created bins      

dates_to_filter = "('2022-05-01')"                  # If some dates needs to be removed from data

# There could be many different ways NAs could be present in dataset. All the following will be converted to numpy.nan
nas_to_replace = ['NA', 'NULL', 'NUL', 'NaN', '[NA]', 'nan', 'NAN', ' ']

In [4]:
# new case statements are created for binned features that are provided in a text file of the following name and identity

dataset_name = 'my_data'                      # Name of table/dataset, to be used as postfix for binned features for identification
statement_filename = 'my_data'                # Name to be added as postfix in case statement file name

#### Data Fetch

In [5]:
# Data is fetched through the SQL query built below.
# The select list is joined in one go so that an empty `features` or `targets`
# list cannot leave a dangling comma in the statement.
selected_columns = ','.join([time_column] + list(features) + list(targets))

# NOTE: `filter` is the table-filter string defined in the parameters cell
# (it shadows the Python builtin of the same name).
# Each condition is joined with a single " and " (the original emitted
# "and and", which is a SQL syntax error).
query = (
    "Select " + selected_columns
    + " from " + main_table
    + " where " + time_column + " >= " + start_date
    + " and " + time_column + " <= " + end_date
    + " and " + filter
    + " and " + time_column + " not in " + dates_to_filter
    + " ;"
)
print(query)

ai_conn = ce('mysql://'+user+':'+pass_ai+'@'+ip+':'+str(port)+'/'+db)
data = pd.read_sql(query,ai_conn)

print("data fetched successfully : ", data.shape)

In [6]:
# Lower-case every column name so it matches the lower-case names in `targets`
data = data.rename(columns=lambda name: name.lower())

# Turn every NA spelling listed in `nas_to_replace` into a real numpy NaN, column by column
for name in list(data.columns):
    cleaned = data[name].replace(nas_to_replace, np.nan)
    data[name] = cleaned

# Optionally clip the main target metric to the configured [min, max] range
if clip_metric == 'yes':
    lower_limit = clip_metric_limit[0]
    upper_limit = clip_metric_limit[1]
    metric_values = data[target].astype(float)
    data[target] = metric_values.clip(lower_limit, upper_limit)

# Feature-only view: remove the target metrics first, then the time column,
# so the next cell classifies features only
features_only = data.drop(targets, axis=1)
data1 = features_only.drop(time_column, axis=1)

In [37]:
# Classify every feature column of `data1` as numeric, categorical, datetime or mixed
# by attempting conversions and catching ValueError/TypeError.
# NOTE: a column can be appended to more than one list here (for example a column of
# numeric-looking strings is added to numeric_cols by the try-block and to
# categorical_cols by the type check). The next cell removes these overlaps.

numeric_cols = []                       # Columns whose non-null values all convert to numbers
categorical_cols = []                   # Columns treated as categorical
datetime_cols = []                      # Columns holding datetime values
mixed_cols = []                         # Columns that fail numeric conversion or hold multiple Python types. Binned as categorical columns later in the script

for col in data1.columns:
    column_values = data1[col].dropna()
    
    # Step 1: if converting the non-null values to numeric raises, the column is not
    # numeric and is queued in mixed_cols for the datetime check further below
    try:
        values_numeric = pd.to_numeric(column_values, errors='raise')
        numeric_cols.append(col)
    except (ValueError, TypeError):
        mixed_cols.append(col)
        
    # Step 2: collect the Python type name of every non-null value in the column
    unique_types = set(type(value).__name__ for value in column_values)

    # Exactly one value type: classify by the column dtype (number, datetime, otherwise categorical).
    # Any other count (several types, or no non-null values at all) is classified as mixed
    if len(unique_types) ==1:
        if np.issubdtype(column_values.dtype, np.number):
            numeric_cols.append(col)
        elif np.issubdtype(column_values.dtype, np.datetime64):
            datetime_cols.append(col)
        else:
            categorical_cols.append(col)
    else:
        mixed_cols.append(col)
        
# Step 3: every column queued in mixed_cols is tried as datetime; if parsing raises
# it is recorded as categorical. The column stays in mixed_cols either way.
for col in mixed_cols:
    column_values = data1[col].dropna()
    try:
        values_datetime = pd.to_datetime(column_values, errors='raise')
        datetime_cols.append(col)
    except (ValueError, TypeError):
        categorical_cols.append(col)

In [None]:
# Check if there is any feature repeated in multiple feature types
repeated_features = []

for list1 in [numeric_cols, categorical_cols, mixed_cols, datetime_cols]:
    repeated_features += [x for x in set(list1) if list1.count(x) > 1]

if len(datetime_cols_id) >=1:
    for col in data1.columns:
        for i in datetime_cols_id:
            if i in col and col not in datetime_cols:
                datetime_cols.append(col)

numeric_cols = [x for x in numeric_cols if x not in datetime_cols]
categorical_cols = [x for x in categorical_cols if x not in datetime_cols]
mixed_cols = [x for x in mixed_cols if x not in datetime_cols]

for col in mixed_cols:
    if col in numeric_cols:
        numeric_cols.remove(col)
    if col in categorical_cols:
        categorical_cols.remove(col)

for col in numeric_cols:
    if col in categorical_cols:
        categorical_cols.remove(col)

repeated_features = list(set(repeated_features))
numeric_cols = list(set(numeric_cols))
categorical_cols = list(set(categorical_cols))
mixed_cols = list(set(mixed_cols))
datetime_cols = list(set(datetime_cols))

print('Repeated_features b/w categorical and mixed features : ',repeated_features , 
      '\nNumeric Features count : ',len(numeric_cols), 
      '\nCategorical Features count : ',len(categorical_cols),
      '\nMixed/Categorical Features count : ',len(mixed_cols), 
      '\nDatetime Features count : ', len(datetime_cols))

### Functions

In [9]:
# Flatten one level of nesting: an iterable of iterables becomes a single flat list
def flatten(l):
    flat_items = []
    for sublist in l:
        flat_items.extend(sublist)
    return flat_items

In [10]:
# To create bins for both Numerical and Categorical features

def feature_bins(data_type, data, params, cols):
    """Bin every feature in ``cols`` with a single-feature decision tree.

    For each feature a grid search (5-fold, scored on r2) picks the tree
    hyperparameters, a ``DecisionTreeRegressor`` is fitted on that feature
    alone against the global ``target`` metric, and each row is assigned
    to a bin.

    Parameters
    ----------
    data_type : str
        ``'categorical'`` to label-encode the feature before fitting; any
        other value (e.g. ``'numeric'``) fits the raw values.
    data : pandas.DataFrame
        Data table. Must contain a unique row id column ``'Num'``, the
        global ``time_column``, the global ``target`` and every column
        listed in ``cols``.
    params : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    cols : list of str
        Names of the features to bin.

    Returns
    -------
    pandas.DataFrame
        ``'Num'``, ``time_column``, and per feature the original values
        plus the bin column, followed by ``target``. Numeric features get
        ``<feature>_Bins`` (tree leaf id). Categorical features get
        ``<feature>_coded`` and ``<feature>_Tree_Leaf`` (``'bin0'``,
        ``'bin1'``, ...). Rows where the feature or the target is missing
        have NaN in that feature's columns.
    """
    # Only these tuned values are applied to the final tree (as in the original
    # implementation); keys that are absent from the grid are simply skipped
    # instead of raising a KeyError.
    tuned_keys = ('max_depth', 'min_samples_leaf', 'min_impurity_decrease')
    is_categorical = data_type == 'categorical'

    # describe Tree Model
    cv = GridSearchCV(DecisionTreeRegressor(), param_grid=params, scoring='r2', cv=5)

    binning_data = data[['Num', time_column]]

    # go through each feature iteratively
    for i in tqdm(cols):
        # dropna() returns a new frame, so the columns added below never
        # write into a slice of ``data``
        train = data[['Num', i, target]].dropna()
        y = train[target].values.reshape(-1, 1)

        # convert Categorical values into coded values
        if is_categorical:
            x_col = i + '_coded'
            train[x_col] = train[i].astype('category').cat.codes
        else:
            x_col = i
        X = train[x_col].values.reshape(-1, 1)

        # check for best hyperparametric values for Tree
        cv.fit(X, y)
        best = {key: cv.best_params_[key] for key in tuned_keys if key in cv.best_params_}

        # fit Tree to create bins
        tree_model = DecisionTreeRegressor(**best)
        tree_model.fit(X, y)

        if is_categorical:
            fitted = tree_model.tree_
            # Leaf nodes carry feature == -2 and a placeholder threshold of -2;
            # keep only the thresholds of real split nodes.
            split_points = sorted(set(fitted.threshold[fitted.feature >= 0]))
            # Open-ended outer edges: codes above the last split get their own
            # bin (they were previously left as NaN and dropped), and a tree
            # without any split yields one single bin.
            edges = [-np.inf] + split_points + [np.inf]
            labels = ['bin' + str(j) for j in range(len(edges) - 1)]
            train[i + '_Tree_Leaf'] = pd.cut(train[x_col], bins=edges, labels=labels)
            new_cols = ['Num', i, x_col, i + '_Tree_Leaf']
        else:
            # the id of the leaf each row falls into is used as the bin id
            train[i + '_Bins'] = tree_model.apply(X)
            new_cols = ['Num', i, i + '_Bins']

        binning_data = pd.merge(binning_data, train[new_cols], on='Num', how='left')

    # combine feature bin values
    binning_data = pd.merge(binning_data, data[['Num', target]], on='Num', how='inner')

    return binning_data

In [None]:
# to plot stability plots of the binned features

def plotting(final_data, feature_list, prefix, figs):
    """Build one stability figure per binned feature and append it to ``figs``.

    Every figure has three panels side by side, all grouped by ``prefix``:
    1. share (in %) of each bin as stacked bars,
    2. average of the global ``target`` metric per bin,
    3. rank of each bin by that average (1 = lowest average, ties share
       the average rank).

    Parameters
    ----------
    final_data : pandas.DataFrame
        Table containing the ``prefix`` column, the global ``target``
        column and every column named in ``feature_list``.
    feature_list : list of str
        Names of the binned feature columns to plot.
    prefix : str
        Name of the time-granularity column to group by (``'week'`` or
        ``'date'``).
    figs : list
        List the new figures are appended to; it is modified in place.

    Returns
    -------
    list
        The same ``figs`` list, with one figure added per feature.
    """
    # going through list of features one by one
    for feature in tqdm(feature_list):

        # create a subplot with 1 row and 3 columns
        fig = make_subplots(rows=1, cols=3, subplot_titles = [prefix + "proportion", prefix + "average target", prefix + "rank"])

        # Panel 1: proportion of each distinct bin within every period.
        # Vectorised row-wise division; missing period/bin combinations become 0.
        counts = final_data.groupby([prefix, feature]).size().unstack()
        shares = counts.div(counts.sum(axis=1), axis=0) * 100
        shares = shares.fillna(0)
        for col in shares.columns:
            fig.add_trace(go.Bar(name = col, x=shares.index, y=shares[col]), row=1, col=1)

        # Panel 2: average target metric for each distinct bin
        averages = final_data.groupby([prefix, feature])[target].mean().unstack()
        averages = averages.fillna(0)
        for col in averages.columns:
            fig.add_trace(go.Scatter(x = averages.index, y=averages[col], mode='lines+markers', name=col), row=1, col=2)

        # Panel 3: rank of each bin within a period, based on the average target.
        # DataFrame.rank defaults to method='average', the same tie handling as
        # the default of scipy.stats.rankdata (which was never imported here).
        ranks = averages.rank(axis=1)
        for col in ranks.columns:
            fig.add_trace(go.Scatter(x = ranks.index, y=ranks[col], mode='lines+markers', name=col), row=1, col=3)

        # The grid is 1 row x 3 columns, so each panel is addressed by its column
        fig.update_xaxes(title_text=prefix)
        fig.update_yaxes(title_text="Proportion", row=1, col=1)
        fig.update_yaxes(title_text="Average "+target, row=1, col=2)
        fig.update_yaxes(title_text="Rank", row=1, col=3)
        fig.update_layout(title=feature)
        fig.update_layout(barmode='stack')

        figs.append(fig)

    # return only after every feature has been plotted
    return figs


# to convert figs plots into html file

def figures_to_html(figs, filename = 'dashboard.html'):
    """Write all figures into one HTML page, one after another.

    Parameters
    ----------
    figs : iterable
        Figure objects exposing ``to_html()`` that returns a full HTML
        document; only the content between ``<body>`` and ``</body>`` of
        each document is copied into the page.
    filename : str
        Path of the HTML file to create (an existing file is overwritten).
    """
    # The context manager guarantees the file is flushed and closed even if a
    # figure fails to render. UTF-8 is set explicitly (and declared in the
    # page head) so the result does not depend on the platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as dashboard:
        dashboard.write('<html><head><meta charset="utf-8" /></head><body>' + "\n")
        for fig in figs:
            inner_html = fig.to_html().split('<body>')[1].split('</body>')[0]
            dashboard.write(inner_html)
        # close both tags opened above
        dashboard.write("</body></html>")

#### Numeric Features

In [13]:
# remove time_column from datetime_cols if exists
if time_column in datetime_cols:
    datetime_cols.remove(time_column)

# make new data table with just numerical cols, datetime cols, calltime and target metrics
numeric_data = data[numeric_cols+targets+datetime_cols+[time_column]]
numeric_data[target] = numeric_data[target].astype(float)

In [14]:
# convert datetime cols into numerical cols with "days", 'weeks' and 'months' duration difference
if len(datetime_cols) >=1:
    current_date = pd.to_datetime(pd.Timestamp.today().date())
    processed_datetime_cols = []

    for col in datetime_cols:
        numeric_data[col] = pd.to_datetime(numeric_data[col], errors = 'coerce')
        numeric_data['days_since_'+str(col)] = (current_date - numeric_data[col]).dt.days
        numeric_data['weeks_since_'+str(col)] = (current_date - numeric_data[col]).dt.days //7
        numeric_data['months_since_'+str(col)] = (current_date.year - numeric_data[col].dt.year)*12 + (current_date.month - numeric_data[col].dt.month)
        processed_datetime_cols.append(['days_since_'+str(col), 'weeks_since_'+str(col), 'months_since_'+str(col)])

    numeric_data = numeric_data.drop(datetime_cols, axis=1)

    processed_datetime_cols = flatten(processed_datetime_cols)

In [None]:
# remove target metrics and time_column from numerical cols
numeric_cols = numeric_data.columns.to_list()
for i in targets:
    if i in numeric_cols:
        numeric_cols.remove(i)
        
numeric_cols = [x for x in numeric_cols if x not in targets]
numeric_cols.remove(time_column)

# processing of numerical data before binning
for f in numeric_cols:
    numeric_data[f] = pd.to_numeric(numeric_data[f], errors='coerce')
    numeric_data[f] = numeric_data[f].astype(float).apply(lambda x: round(x, 1))

numeric_data['Num'] = numeric_data.index

print('After processing')
print(numeric_data.shape)

In [None]:
# describe hyperparametric values for gridsearch. Change/update these values according to the volume/kind of numeric features
params ={ 
   'min_samples_leaf':range(1000,10000,1000),
   'max_depth':[2,3],
   'min_impurity_decrease':[0,0.001, 0.005, 0.01, 0.05],
   'min_samples_split' : range(500, 10000, 500),
   'max_features' : [1, 2, 3],
   'splitter' : ['best', 'random']
}

# feature binning

numeric_data = feature_bins('numeric', numeric_data, params, numeric_cols)

In [27]:
# creating a list of tables containing binned features and their min, max values
Tree_Bins = []
for i in numeric_cols:
    Tree_Bins.append(numeric_data[[i, i+'_Bins']].dropna().groupby(i+'_Bins').agg([np.min, np.max]).round())

In [None]:
# creating a table containing features and their respective number of bins created

Bins=[]
for i in range(len(Tree_Bins)):
    Bins.append(tuple((Tree_Bins[i].columns[1][0],len(Tree_Bins[i].index.values))))
    
Bins_Table = pd.DataFrame(Bins, columns= ['Feature','Bins'])
Bins_Table

In [20]:
# create "date" and 'week' columns based on time_column for stability plots

# numeric_data[target] = numeric_data[target].astype(float)
# Parse the time column once and derive the "date" and "week" columns from it.
# Series.dt.week was removed in pandas 2.0; isocalendar().week is the replacement.
# It returns a nullable UInt32 column, cast back to plain int here (the time
# column is filtered on in the fetch query, so it is not expected to be null).
call_times = pd.to_datetime(numeric_data[time_column])
numeric_data['date'] = call_times.dt.date
numeric_data['week'] = call_times.dt.isocalendar().week.astype(int)

# feature names to be used in creating stability plots:
# each bin table is indexed by its "<feature>_Bins" column
names = [bin_table.index.name for bin_table in Tree_Bins]

# Plotting stability plots and saving in variable "figs"
fig_list = []
figs = plotting(numeric_data, names, prefix, fig_list)


In [None]:
# create tables describing min, max values of each bin for each feature

for i, j in enumerate(Tree_Bins):
    j=j.reset_index()
    j = j.rename(columns = {j.index.name:'Bins'})
    fig, ax =plt.subplots(1,1)
    ax.axis('off')
    table = ax.table(cellText=j.values,colLabels=j.columns,loc="center", colWidths = [0.5, 0.5, 0.5], cellLoc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    table.scale(2, 2)

In [24]:
# create case statements to be plugged-in to create binned features in SQL
# case statements for original numerical cols and numerical datetime cols are created separately

# remove numerical datetime cols
if len(datetime_cols) >= 1:
    for col in processed_datetime_cols:
        if col in numeric_cols:
            numeric_cols.remove(col)

# update the numeric_cols names with "_Bins"
leaf_num_cols = []
for col in numeric_cols:
    leaf_num_cols.append(col+str('_Bins'))
        
# case statements for each feature will be created according to the following logic
# first line is for NULL values, which are labelled 'NA'
# first bin will have "<=" for max bin value. E.g. if bin_1 has min = -1.0 and max = 5.0, then it will give statement "... When feature_1 <= 5.0 then '<=5.0' ..."
# bins from the second onward use '=' if the bin is created with just 1 value (might happen when a single value has a lot of occurrences), giving statement "... When feature_1 = 5.0 then '5.0' ..."
# or, if its min and max values are different, the statement "... When feature_1 between 5.0 and 10.0 then '5.0 - 10.0' ..."
# last bin statement will have '>=' for min bin value. E.g. if bin_5 has min = 100.0 and max = 500.0 then it will give statement "... When feature_1 >=100.0 then '>=100.0' ..."
# statement ends with dataset_name_bin_feature_name. Change if another naming convention is used. Also remember to update this change in create_cerise_json function as well

with open(statement_filename+".txt","w") as f:
    for i in range(len(Tree_Bins)):
        if Tree_Bins[i].index.name in leaf_num_cols:
            bin_values = Tree_Bins[i]
            j = len(bin_values)
            print( "(CASE WHEN (IFNULL(",numeric_cols[i],", 'NA') = 'NA') THEN 'NA'",file=f)
            if bin_values.columns[1][1] == 'amax':
                print("WHEN ",numeric_cols[i]," <= ",bin_values.iloc[0][1], " THEN "'" <= ',bin_values.iloc[0][1],'"',file=f)
                if j >= 2:
                    for m in range(j-2):
                        if bin_values.iloc[m+1][0] == bin_values.iloc[m+1][1]:
                            print("WHEN ",numeric_cols[i]," = " ,bin_values.iloc[m+1][1] ," THEN "'"',bin_values.iloc[m+1][1],'"',file=f)
                        else:
                            print("WHEN ",numeric_cols[i]," BETWEEN " ,bin_values.iloc[m+1][0] ," AND ", bin_values.iloc[m+1][1]," THEN "'"',bin_values.iloc[m+1][0],"-",bin_values.iloc[m+1][1],'"',file=f)
                    print("WHEN ",numeric_cols[i]," >=" ,bin_values.iloc[j-1][0]," THEN "'" >= ',bin_values.iloc[j-1][0],'"',file=f)
                else:
                    print("WHEN ",numeric_cols[i]," >=" ,bin_values.iloc[1][0]," THEN "'" >= ',bin_values.iloc[1][0],'"',file=f)
            else:
                print("WHEN ",numeric_cols[i]," <= ",bin_values.iloc[0][1], " THEN "'" <= ',bin_values.iloc[0][1],'"',file=f)
                if j>=2:
                    for m in range(j-2):
                        if bin_values.iloc[m+1][0] == bin_values.iloc[m+1][1]:
                            print("WHEN ",numeric_cols[i]," = " ,bin_values.iloc[m+1][1] ," THEN "'" ',bin_values.iloc[m+1][1],'"',file=f)
                        else:
                            print("WHEN ",numeric_cols[i]," BETWEEN " ,bin_values.iloc[m+1][1] ," AND ", bin_values.iloc[m+1][0]," THEN "'"',bin_values.iloc[m+1][1],"-",bin_values.iloc[m+1][0],'"',file=f)
                    print("WHEN ",numeric_cols[i]," >=" ,bin_values.iloc[j-1][1]," THEN "'" >= ',bin_values.iloc[j-1][1],'+"',file=f)
                else:
                    print("WHEN ",numeric_cols[i]," >=" ,bin_values.iloc[1][1]," THEN "'" >= ',bin_values.iloc[1][1],'+"',file=f)
            print("END) AS ",dataset_name+str("_")+"bin_%s"%numeric_cols[i],",",file=f)             

In [25]:
# case statements for numeric datetime features are created separately here
# Logic is the same as for numeric features, with some additions (time duration difference)

if len(datetime_cols) == 0:
    print('No processed datetime columns available to be binned')
else:
    leaf_date_cols = []
    for col in processed_datetime_cols:
        leaf_date_cols.append(col+str('_Bins'))

    leaf_date = pd.DataFrame({'processed_datetime_cols' : processed_datetime_cols,
                             'tree_datetime_cols' : leaf_date_cols})

    leaf_date['type'] = leaf_date['processed_datetime_cols'].str.split('_since').str[0]
    leaf_date['type'] = [c.upper() for c in leaf_date['type']]

    with open(statement_filename+".txt","a") as f:
        for i in range(len(Tree_Bins)):
            for k in range(len(leaf_date)):
                if Tree_Bins[i].index.name == leaf_date['tree_datetime_cols'][k]:
                    bin_values = Tree_Bins[i]
                    j = len(bin_values)
                    print( "(CASE WHEN (IFNULL(",leaf_date['processed_datetime_cols'][k],", 'NA') = 'NA') THEN 'NA'",file=f)
                    if bin_values.columns[1][1] == 'amax':
                        print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") <= ",bin_values.iloc[0][1], " THEN "'" <= ',bin_values.iloc[0][1],'"',file=f)
                        if j >= 2:
                            for m in range(j-2):
                                if bin_values.iloc[m+1][0] == bin_values.iloc[m+1][1]:
                                    print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") = " ,bin_values.iloc[m+1][1] ," THEN "'"',bin_values.iloc[m+1][1],'"',file=f)
                                else:
                                    print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") BETWEEN " ,bin_values.iloc[m+1][0] ," AND ", bin_values.iloc[m+1][1]," THEN "'"',bin_values.iloc[m+1][0],"-",bin_values.iloc[m+1][1],'"',file=f)
                            print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") >=" ,bin_values.iloc[j-1][0]," THEN "'" >= ',bin_values.iloc[j-1][0],'"',file=f)
                        else:
                            print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") >=" ,bin_values.iloc[1][0]," THEN "'" >= ',bin_values.iloc[1][0],'"',file=f)
                    else:
                        print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") <= ",bin_values.iloc[0][1], " THEN "'" <= ',bin_values.iloc[0][1],'"',file=f)
                        if j>=2:
                            for m in range(j-2):
                                if bin_values.iloc[m+1][0] == bin_values.iloc[m+1][1]:
                                    print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") = " ,bin_values.iloc[m+1][1] ," THEN "'" ',bin_values.iloc[m+1][1],'"',file=f)
                                else:
                                    print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") BETWEEN " ,bin_values.iloc[m+1][1] ," AND ", bin_values.iloc[m+1][0]," THEN "'"',bin_values.iloc[m+1][1],"-",bin_values.iloc[m+1][0],'"',file=f)
                            print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") >=" ,bin_values.iloc[j-1][1]," THEN "'" >= ',bin_values.iloc[j-1][1],'+"',file=f)
                        else:
                            print("WHEN 1.0* TIMESTAMPDIFF(",leaf_date['type'][k],",",leaf_date['processed_datetime_cols'][k],") >=" ,bin_values.iloc[1][1]," THEN "'" >= ',bin_values.iloc[1][1],'+"',file=f)
                    print("END) AS ",dataset_name+str("_")+"bin_%s"%leaf_date['processed_datetime_cols'][k],",",file=f)             

#### Categorical Features

In [39]:
# categorical features are the combination of categorical and mixed features (treating both as categorical), with any duplicate feature removed
cat_cols = categorical_cols + mixed_cols

cate_cols=[]
for item in cat_cols:
    if item not in cate_cols:
        cate_cols.append(item)

# make new data table with just categroical cols, calltime and target metrics
cate_data = data[cate_cols+[target]+[time_column]]
cate_data[target] = cate_data[target].astype(float)
cate_data['Num'] = cate_data.index

In [None]:
# processing categorical features; change dtype to 'str', apply smalls threshold

smalls_thresh = 0.005

for f in cate_cols:
    cate_data[f] = cate_data[f].astype(str)
    cate_data[f] = cate_data[f].replace(nas_to_replace, np.nan)
    counts = cate_data[f].value_counts(normalize=True)
    mapping = cate_data[f].map(counts)
    cate_data[f] = cate_data[f].mask(mapping < smalls_thresh, 'smalls')
print('After processing')
print(cate_data.shape)

In [None]:
# describe hyperparametric values for gridsearch. Change/update these values according to the volume/kind of categoric features.

params ={ 
   'min_samples_leaf':range(1000,10000,1000),
   'max_depth':[2,3],
   'min_impurity_decrease':[0,0.001, 0.005, 0.01, 0.05],
   'min_samples_split' : range(500, 10000, 500),
   'max_features' : [1, 2, 3],
   'splitter' : ['best', 'random']
}

# feature binning
# usually takes more time than numeric feature binning. Depends on hyperparametric values as well

categorical_data = feature_bins('categorical', cate_data, params, cate_cols)

In [44]:
# create table containing categoric features with binned values using groupby function

Tree_Bins = []
for i in cate_cols:
    Tree_Bins = categorical_data[['Num', i , i+'_Tree_Leaf']]
#     Tree_Bins = Tree_Bins[~(Tree_Bins[i].isna()|Tree_Bins[i].isnull()|(Tree_Bins[i] == 'nan'))]
    Tree_Bins.dropna(inplace=True)
    Tree_Bins[i+'_bins'] = Tree_Bins.groupby(i+'_Tree_Leaf')[i].transform(lambda x: ','.join("'"+item+"'" for item in set(x)))
    categorical_data = pd.merge(categorical_data, Tree_Bins[['Num', i+'_bins']], on='Num', how='left')

In [None]:
# create table containing categorical features, number of bins created and bin values comma seperated

bins = []
for i in cate_cols:
    list1 = []
    list2 = []
    for j in range(categorical_data[i+'_Tree_Leaf'].dropna().nunique()):
        list1.append([categorical_data[i+'_bins'].dropna().unique()[j]])
    bins.append(tuple((i, categorical_data[i+'_Tree_Leaf'].dropna().nunique(), list1)))
    
Bins_Table = pd.DataFrame(bins, columns= ['Categorical_Feature','Bins_value', 'Bins'])

Bins_Table.set_index('Categorical_Feature', inplace=True)
Bins_Table.reset_index(inplace=True)

pd.set_option('display.max_colwidth', None)
Bins_Table[['Categorical_Feature','Bins_value', 'Bins']]

In [32]:
# create case statement for categorical features that will update with numerical features case statement
# if seperate file is needed, then change 'a' to 'w' in the first line below and change name of the file as needed
# case statement for each feature will be created according to the following logic
# first line is for NULL values given NA
# bin names are automatically given as "Bin_0", "Bin_1",... according to the number of bins created
# statement ends with dataset_name_bin_feature_name. Change if other naming convention is used. Also remember to update this change in create_cerise_json function as well

with open(statement_filename+".txt","a") as f:
    for i in range(len(Bins_Table)):
        bin_values = Bins_Table.iloc[i]
        k = bin_values['Bins_value']
#         bin_values['Bins_comma'].dropna().unique().to_list()
        print( "(CASE WHEN (IFNULL(",bin_values['Categorical_Feature'],", 'NA') = 'NA') THEN 'NA'",file=f)
        print("WHEN ",bin_values['Categorical_Feature']," in (",bin_values['Bins'][0][0], ") THEN '" , str('Bin_0'),"'",file=f)
        if k >= 2:
            for m in range(k-1):
                print("WHEN ",bin_values['Categorical_Feature']," in (",bin_values['Bins'][m+1][0], ") THEN '" , str('Bin_')+str(m+1),"'",file=f)
        print("else 'smalls' END) AS ",dataset_name+"_bin_%s"%bin_values['Categorical_Feature'], ",",file=f) 

In [33]:
# create "date" and "week" columns based on time_column for stability plots

# categorical_data[target] = categorical_data[target].astype(float)
# Derive the "date" and "week" columns from the configured time column.
# The column is addressed through `time_column` instead of the hard-coded
# `.calltime` attribute, so the cell keeps working when the column is renamed.
# Series.dt.week was removed in pandas 2.0; isocalendar().week is the replacement.
# It returns a nullable UInt32 column, cast back to plain int here (the time
# column is filtered on in the fetch query, so it is not expected to be null).
categorical_data[time_column] = pd.to_datetime(categorical_data[time_column])
categorical_data['date'] = categorical_data[time_column].dt.date
categorical_data['week'] = categorical_data[time_column].dt.isocalendar().week.astype(int)

# feature names to be used in creating stability plots
names = [feature_name + '_Tree_Leaf' for feature_name in Bins_Table['Categorical_Feature']]

# Plotting stability plots and saving in variable "figs"
# (appends to the figures already created for the numeric features)
figs = plotting(categorical_data, names, prefix, figs)

#### Stability Plots for both Numerical and Categorical binned features

In [34]:
# creating HTML file for combined stability plots of numerical and categorical features

figures_to_html(figs, './Stability_binned_features.html')