#### Load Packages

In [None]:
# Import Packages needed
 
import pandas as pd
import numpy as np
import sqlalchemy as sa
from sqlalchemy import create_engine as ce
from datetime import *
from tqdm import tqdm
import scipy.stats as ss
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly    

# Never truncate DataFrame output: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

### Set Parameters

In [2]:
# Features whose stability should be checked; edit this list as needed.
features = [
    "feature_1",
    "feature_2",
    "feature_3",
    "feature_4",
    "feature_5",
    "feature_6",
    "feature_7",
]

In [3]:
# All target metrics that might be used for further analysis (lower-case names).
targets = [
    "main_target_metric",
    "target_metric_1",
    "target_metric_2",
]

# Target metric that will be used by the decision tree (lower-case name).
target = "main_target_metric"
binary_metric = "binary_target_metric"

# Whether to clip the target metric values: 'yes' or 'no'.
clip_metric = "yes"
# [min, max] values used to limit the metric.
clip_metric_limit = [0, 3000]

In [4]:


# unexpected_numeric_values = {'original' : 999999999.0, 'replace' : 999999.0}
# unexpected_string_values = {'original' : '999999999.0', 'replace' : '999999.0'}
# special_char = ['$', '&', '%']

# Missing values can be spelled in many different ways in a dataset.
# Every spelling listed here will be converted to numpy.nan.
nas_to_replace = [
    "NA",
    "NULL",
    "NUL",
    "NaN",
    "[NA]",
    "nan",
    "NAN",
    " ",
]

# Prefixes/postfixes that identify datetime feature names, when they are
# already known and do not appear in any other type of feature name.
# The time_column itself is not covered by this list.
datetime_cols_id = ["dt_"]

In [5]:
# Date window of the extract. The inner single quotes are part of the value
# because both bounds are concatenated directly into the SQL text.
start_date = "'2023-05-01'"  # Date is inclusive
end_date = "'2023-08-15'"  # Date is inclusive

# Database connection settings -- replace the placeholders before running.
ip = 'xxx.xx.xxx.xxx'  # IP address
# Must be an integer. The previous placeholder was the bare name `xxxx`, which
# raised NameError as soon as this cell ran. 3306 is the MySQL default port;
# change it if your server listens elsewhere.
port = 3306
user = 'username'  # username
pass_ai = 'password'  # password of the user
db = 'dbname'  # name of schema

main_table = 'table_name'  # name of table
# Extra WHERE condition for the table.
# NOTE: this name shadows the built-in `filter`; it is kept because the
# data-fetch cell reads it under this name.
filter = "isrelevant=1 and on_off = 0"
time_column = 'calltime'  # column name in table for calltime
granularity = 'week'  # 'week' for week-wise or 'date' for daily stability charts

# Dates to remove from the data, written as a SQL tuple literal.
dates_to_filter = "('2022-05-01')"

### Data Fetch

In [6]:
# Fetch the time column, the features and the target metrics with one query.

# Columns to select, in the order: time column, features, targets.
select_columns = ",".join([time_column, *features, *targets])

# WHERE conditions, combined with a single " and " between each pair.
# (The previous string concatenation emitted "and and", which is invalid SQL.)
# NOTE(review): if the time column holds timestamps, "<= end_date" only keeps
# rows up to midnight of the end date, and "not in dates_to_filter" only
# matches rows at exactly midnight -- confirm against the column type.
conditions = [
    f"{time_column} >= {start_date}",
    f"{time_column} <= {end_date}",
    filter,
    f"{time_column} not in {dates_to_filter}",
]
where_clause = " and ".join(conditions)

query = f"Select {select_columns} from {main_table} where {where_clause} ;"
print(query)

# NOTE(review): user and password are placed in the URL unescaped, so
# characters such as '@' or '/' in them would break the connection string.
ai_conn = ce(f"mysql://{user}:{pass_ai}@{ip}:{port}/{db}")
data = pd.read_sql(query, ai_conn)

print("data fetched successfully : ", data.shape)

In [8]:
# Lower-case every column name.
data = data.rename(columns=str.lower)

# Turn each known NA spelling into a real NaN, one column at a time.
for column_name in data.columns:
    cleaned_column = data[column_name].replace(to_replace=nas_to_replace, value=np.nan)
    data[column_name] = cleaned_column

# Optionally limit the target metric to the configured [min, max] range.
if clip_metric == 'yes':
    lower_limit = clip_metric_limit[0]
    upper_limit = clip_metric_limit[1]
    target_as_float = data[target].astype(float)
    data[target] = target_as_float.clip(lower_limit, upper_limit)
    

### Feature Stability plots

In [17]:
# Work on a copy so the fetched and cleaned `data` frame is left untouched.
final_data = data.copy()

# Convert time column to datetime datatype
final_data[time_column] = pd.to_datetime(final_data[time_column])

# Columns that are not features: the time column and every target metric.
# Previously only `target` was dropped, so the remaining entries of `targets`
# ended up in `feature_list` and were plotted as if they were features.
# dict.fromkeys removes duplicates while keeping order (`target` is normally
# also listed in `targets`).
drop = list(dict.fromkeys([target, *targets, time_column]))
feature_list = final_data.columns[~final_data.columns.isin(drop)]

# Cast every feature column to object dtype.
final_data = final_data.astype({name: object for name in feature_list})

# Create the date and week columns used as plotting granularity.
# NOTE(review): the ISO week number carries no year, so a date range that
# spans more than one year would merge weeks from different years.
final_data['date'] = final_data[time_column].dt.date
final_data['week'] = final_data[time_column].dt.isocalendar().week

In [18]:
# Combine a list of plotly figures into a single HTML file

def figures_to_html(figs, filename = 'dashboard.html'):
    """Write several figures into one HTML page.

    Each figure is rendered with its own ``to_html()`` method; only the part
    between ``<body>`` and ``</body>`` is kept and appended to the page.

    Args:
        figs: Iterable of objects exposing ``to_html()`` that returns a full
            HTML document containing ``<body>`` and ``</body>`` tags.
        filename: Path of the HTML file to create (overwritten if present).
    """
    # The context manager guarantees the file is flushed and closed even if a
    # figure fails to render; UTF-8 is set explicitly so the output does not
    # depend on the platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as dashboard:
        dashboard.write('<html><head><meta charset="utf-8" /></head><body>' + "\n")
        for fig in figs:
            inner_html = fig.to_html().split('<body>')[1].split('</body>')[0]
            dashboard.write(inner_html)
        # Close the document properly; the closing </html> used to be missing.
        dashboard.write("</body></html>")


# Collects one figure per feature; filled by the plotting loop.
fig_list = []

In [None]:
# Creating stability plots.
# Each feature gets one figure with 3 subplots laid out side by side
# (1 row x 3 columns):
#   1 - proportion (%) of each distinct feature value per granularity period
#   2 - average target value of each feature value per granularity period
#   3 - rank of each feature value, by average target, per granularity period

# `prefix` was used in the subplot titles without ever being defined, which
# raised NameError on the first iteration. The titles now start with the
# chosen granularity, e.g. "week-wise proportion".
prefix = granularity + "-wise "

# going through list of features one by one
for feature in tqdm(feature_list):

    # create a subplot with 1 row and 3 columns
    fig = make_subplots(
        rows=1,
        cols=3,
        subplot_titles=[prefix + "proportion", prefix + "average target", prefix + "rank"],
    )

    # Proportion of each distinct value: counts per (period, value), divided
    # by the period total. Combinations that never occur become 0.
    counts = final_data.groupby([granularity, feature]).size().unstack()
    proportions = counts.div(counts.sum(axis=1), axis=0).mul(100).fillna(0)
    for col in proportions.columns:
        fig.add_trace(go.Bar(name=col, x=proportions.index, y=proportions[col]), row=1, col=1)

    # Average target metric for each distinct feature value; computed once
    # and reused for the rank plot. Missing combinations are plotted as 0.
    averages = final_data.groupby([granularity, feature])[target].mean().unstack().fillna(0)
    for col in averages.columns:
        fig.add_trace(go.Scatter(x=averages.index, y=averages[col], mode='lines+markers', name=col), row=1, col=2)

    # Rank of each feature value within a period, based on the average target.
    # Ties share the average rank, as scipy.stats.rankdata does by default.
    ranks = averages.rank(axis=1, method='average')
    for col in ranks.columns:
        fig.add_trace(go.Scatter(x=ranks.index, y=ranks[col], mode='lines+markers', name=col), row=1, col=3)

    fig.update_xaxes(title_text=granularity)
    # The grid has a single row, so each subplot is addressed by its column.
    # The old row=2 / row=3 selectors matched no subplot and left the second
    # and third y-axes without a title.
    fig.update_yaxes(title_text="Proportion", row=1, col=1)
    fig.update_yaxes(title_text="Average "+target, row=1, col=2)
    fig.update_yaxes(title_text="Rank", row=1, col=3)
    fig.update_layout(title=feature, barmode='stack')

    fig_list.append(fig)

In [20]:
# Save all stability plots as a single HTML file in the current working directory.
figures_to_html(fig_list, filename='./features_stability.html')