# env config

In [None]:
import os
import sys
import shutil
import json

# Per-user locations used by this notebook.
username = 'guxia'
repo_dir = f'/projects/gds-focus/data/{username}/task/'
secret_path = f'/projects/{username}/secret/'

# Work from the variable_check folder so the relative paths used in later cells resolve.
os.chdir(f'{repo_dir}/variable_check/')

# Put secret_path on sys.path exactly once (re-running this cell adds no duplicates).
for p in [secret_path]:
    if p not in sys.path:
        sys.path.append(p)

import pandas as pd
import numpy as np

import aml.cloud_v1 as cloud

# (Re)load the cloudmagics.bigquery IPython extension.
%reload_ext cloudmagics.bigquery
# Set the `domain` option of the PPMagics configurable.
%config PPMagics.domain="ccg24-hrzana-gds-focus"
# NOTE(review): autolimit=0 presumably disables the automatic row limit on query results — confirm.
%config PPMagics.autolimit=0
# NOTE(review): presumably selects the "horton" connection/cluster for later queries — confirm.
%url -c horton
# Render matplotlib figures inline in the cell output.
%matplotlib inline


In [None]:
def cross_entropy(p, q, epsilon=1e-10):
    """Return the cross entropy -sum(p * log(q)) of two binned distributions.

    Missing entries (NaN or None) in either input are replaced by ``epsilon``
    and every value is clipped to ``[epsilon, 1 - epsilon]`` so the logarithm
    is always finite. The inputs themselves are never modified.

    Args:
        p: array-like of bin probabilities for the reference distribution.
        q: array-like of bin probabilities for the compared distribution;
            must be broadcastable against ``p``.
        epsilon: floor used for missing / zero probabilities.

    Returns:
        The cross entropy as a numpy floating-point scalar.
    """
    # np.array(..., dtype=float) always copies, and also turns None (e.g. JSON
    # null) into NaN; a plain np.copy would produce an object array on which
    # np.isnan raises TypeError.
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)
    p[np.isnan(p)] = epsilon
    q[np.isnan(q)] = epsilon

    p = np.clip(p, epsilon, 1 - epsilon)
    q = np.clip(q, epsilon, 1 - epsilon)
    return -np.sum(p * np.log(q))


def psi(p, q, epsilon=1e-10):
    """Return the population stability index sum((q - p) * log(q / p)).

    Missing entries (NaN or None) in either input are replaced by ``epsilon``
    and every value is clipped to ``[epsilon, 1 - epsilon]`` so the ratio and
    its logarithm are always finite. The inputs themselves are never modified.

    Args:
        p: array-like of bin probabilities for the base distribution.
        q: array-like of bin probabilities for the compared distribution;
            must be broadcastable against ``p``.
        epsilon: floor used for missing / zero probabilities.

    Returns:
        The index as a numpy floating-point scalar (0 when p and q agree).
    """
    # np.array(..., dtype=float) always copies, and also turns None (e.g. JSON
    # null) into NaN; a plain np.copy would produce an object array on which
    # np.isnan raises TypeError.
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)
    p[np.isnan(p)] = epsilon
    q[np.isnan(q)] = epsilon

    p = np.clip(p, epsilon, 1 - epsilon)
    q = np.clip(q, epsilon, 1 - epsilon)
    # Named differently from the function to avoid shadowing `psi` locally.
    psi_value = np.sum((q - p) * np.log(q / p))
    return psi_value


def plot_bar(var, base, challenger, bin_boundary):
    """Draw a grouped bar chart comparing two per-bin series.

    Args:
        var: text used as the plot title.
        base: bar heights for the series labelled 'base', one per bin.
        challenger: bar heights for the series labelled 'challenger'.
        bin_boundary: x tick labels, one per bin.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    bar_width = 0.2
    half_width = bar_width / 2
    positions = np.arange(len(base))

    fig, ax = plt.subplots(figsize=(10, 4))

    # Base bars sit just left of each tick, challenger bars just right.
    series = (
        (-half_width, base, 'base'),
        (half_width, challenger, 'challenger'),
    )
    for offset, heights, label in series:
        ax.bar(positions + offset, heights, width=bar_width, label=label)

    ax.set_xticks(positions)
    ax.set_xticklabels(bin_boundary, rotation=45, ha='right')
    ax.legend()
    ax.grid()
    ax.set_title(var)
    fig.tight_layout()

    return fig, ax


# eval variable

In [None]:
# --- input data (paths are relative to the working directory) ---

# switch-score parquet for the walmart bopis segment
model_switch_score_data_path = 'data/switch_val/latam23_switch_score_walmart_bopis.parquet'

# driver data directory (parquet)
driver_data_dir = 'data/cbp_madmen_parquet_20230601_20230630'

# per-variable bin info (JSON)
bin_vec_save_path = 'cbp_202301_202307_bin_vec_qcut.json'

# per-variable feature importance (CSV)
var_fi_path = 'latam23_cbp_202306_model_var_fi.csv'

# list of categorical variable names, one per line
categorical_var_path = 'categorical.txt'

# --- model ---

# directory searched for the model file
guanjia_model_dir = 'guanjia_model_dir_latam23'

# name of the score column read from the model's predictions
score_name = 'model_811_is_cc_bad'

# --- segment filter ---

# receiver id used to select Walmart rows from the driver data
walmart_rcvr_id = '1704664984735735782'

In [None]:
%%time
import matplotlib.pyplot as plt

from pyScoring import UMEModel



# score df
print(f'loading variable switch score from {model_switch_score_data_path}')
score_df = pd.read_parquet(model_switch_score_data_path).reset_index(drop=True)
print(score_df.shape)

# variable bin distribution
print(f'loading variable bin distribution from {bin_vec_save_path}')
with open(bin_vec_save_path) as f:
    var_bin_vec_df = pd.DataFrame(json.load(f))

# variable fi
print(f'loading variable fi from {var_fi_path}')
var_fi_df = pd.read_csv(var_fi_path)

variables = var_fi_df['variable'].to_list()
print('total variable num:', len(variables))

# categorical variable names: a header-less, single-column file
categorical = pd.read_csv(categorical_var_path, names=['var'])['var'].to_list()
print('loaded categorical variable num:', len(categorical))

# model file: os.listdir() returns entries in arbitrary order, so sort the
# candidates to make the choice reproducible, and fail with a clear message
# instead of a bare IndexError when nothing matches.
# NOTE(review): "'.m' in name" is a substring match and also accepts names
# such as 'x.md' — confirm the real model file extension and tighten if possible.
model_candidates = sorted(name for name in os.listdir(guanjia_model_dir) if '.m' in name)
if not model_candidates:
    raise FileNotFoundError(f"no model file containing '.m' found in {guanjia_model_dir}")
ume_model_path = os.path.join(guanjia_model_dir, model_candidates[0])
print(f'loading model from {ume_model_path}')
ume_model = UMEModel(ume_model_path)

In [None]:
%%time

# Read the driver parquet data and normalise the placeholder "missing" markers to NaN.

print(f'loading driver from {driver_data_dir}')
driver_df = pd.read_parquet(driver_data_dir).replace(
    ['', 'None', 'NONE', 'NA', '-999.0', -999.0],
    np.nan,
)

print('loaded driver rec num', driver_df.shape)


# validate variable FI

In [None]:
var = 'v51_cc_bin_prod'

# Take up to 1000 driver rows for the Walmart receiver as an independent copy,
# so overriding `var` below does not touch driver_df.
t = (
    driver_df[driver_df['rcvr_id'] == walmart_rcvr_id]
    .head(1000)
    .copy(deep=True)
    .reset_index(drop=True)
)

# Score with the variable as-is.
score1 = ume_model.predict(t[ume_model.inputs])[score_name].rename(f'{var}_original')

# Force the variable to 'NO_RADD_DATA' (the "safe direction") and score again.
t[var] = 'NO_RADD_DATA'
score2 = ume_model.predict(t[ume_model.inputs])[score_name].rename(f'{var}_switch')

# Side-by-side scores and their per-row difference (original - switched).
score = pd.concat([score1, score2], axis=1)
diff = score[f'{var}_original'] - score[f'{var}_switch']

print('original score mean', score[f'{var}_original'].mean())

# Summary statistics of the score difference, one per line.
for stat_name, stat_value in (
    ('mean', diff.mean()),
    ('max', diff.max()),
    ('min', diff.min()),
    ('std', diff.std()),
    ('p50', diff.quantile(0.5)),
    ('p95', diff.quantile(0.95)),
):
    print(stat_name, stat_value)


In [None]:
# plot bar
import matplotlib.pyplot as plt

# Segments to compare:
#   base       = non-Walmart non-BOPIS
#   challenger = Walmart non-BOPIS
base_seg_val = '0*0'
challenger_seg_val = '1*0'

var = 'v51_cc_bin_prod'

# Per-bin distribution of `var` in the base segment; 'NaN' strings become real NaNs.
base_vec = var_bin_vec_df.loc[
    (var_bin_vec_df['variable'] == var) & (var_bin_vec_df['seg_val'] == base_seg_val),
    'bin_distribution',
].iloc[0]
base_vec = [np.nan if v == 'NaN' else v for v in base_vec]

# Same for the challenger segment.
challenger_vec = var_bin_vec_df.loc[
    (var_bin_vec_df['variable'] == var) & (var_bin_vec_df['seg_val'] == challenger_seg_val),
    'bin_distribution',
].iloc[0]
challenger_vec = [np.nan if v == 'NaN' else v for v in challenger_vec]

x = np.arange(len(base_vec))

# Bin labels come from the first row found for this variable (any segment).
bin_boundary = var_bin_vec_df.loc[var_bin_vec_df['variable'] == var, 'bin_boundary'].iloc[0]
print(bin_boundary)

plot_bar(var, base_vec, challenger_vec, bin_boundary)


# get top variables that have significant distribution difference

In [None]:
# Walmart non-BOPIS
base_seg_val = '1*0'

# Walmart BOPIS
challenger_seg_val = '1*1'

# NOTE(review): this cell reads the 'distribution' column while the bar-plot
# cell reads 'bin_distribution' from the same frame — confirm which column
# name the bin JSON actually contains.

# The segment filters do not depend on the variable, so apply them once.
base_rows = var_bin_vec_df[var_bin_vec_df['seg_val'] == base_seg_val]
challenger_rows = var_bin_vec_df[var_bin_vec_df['seg_val'] == challenger_seg_val]

# Collect one record per variable and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration.
ce_records = []
for var in variables:
    # Per-bin distributions for this variable; 'NaN' strings become real NaNs.
    base_vec = base_rows[base_rows['variable'] == var]['distribution'].iloc[0]
    base_vec = [np.nan if v == 'NaN' else v for v in base_vec]

    challenger_vec = challenger_rows[challenger_rows['variable'] == var]['distribution'].iloc[0]
    challenger_vec = [np.nan if v == 'NaN' else v for v in challenger_vec]

    ce = cross_entropy(base_vec, challenger_vec)
    ce_records.append({'variable': var, 'ce': ce})

# Explicit columns keep the frame well-formed even when `variables` is empty.
var_ce_df = pd.DataFrame(ce_records, columns=['variable', 'ce'])

# Top 20 variables with the largest cross entropy between the two segments.
var_ce_df.sort_values(by='ce', ascending=False).head(20)