# env config

In [None]:
# binning variable
# categorical binning

In [None]:
import os
import sys
import shutil

# User-specific locations: the task repository and the directory added to sys.path.
username = 'guxia'
repo_dir = f'/projects/gds-focus/data/{username}/task/'
secret_path = f'/projects/{username}/secret/'

# Work from the variable_check folder; the relative file paths configured
# further down are resolved against the current working directory.
os.chdir(f'{repo_dir}/variable_check/')

# Extend the import search path, skipping entries that are already present.
for p in (secret_path,):
    if p not in sys.path:
        sys.path.append(p)

import pandas as pd
import numpy as np

# (Re)load the cloudmagics.bigquery IPython extension.
%reload_ext cloudmagics.bigquery
# Set the PPMagics configuration values.
# NOTE(review): autolimit=0 presumably removes the row limit on query results - confirm.
%config PPMagics.domain="ccg24-hrzana-gds-focus"
%config PPMagics.autolimit=0
# NOTE(review): %url and %ppauth are presumably magics provided by the extension
# above (pick the 'horton' connection, then authenticate) - confirm against its docs.
%url -c horton
%ppauth

# variable bin info

In [None]:


# ---- inputs ---------------------------------------------------------------
# text file holding the full list of variable names
variable_save_path = 'all_variables.txt'
# location of the driver data
driver_data_dir = 'data/cbp_madmen_parquet_202301_202307'
# receiver id that identifies Walmart rows
walmart_rcvr_id = '1704664984735735782'

# ---- outputs --------------------------------------------------------------
# where the names of the categorical variables are written
categorical_var_save_path = 'categorical.txt'
# where the per-variable bin info is written
bin_vec_save_path = 'cbp_202301_202307_bin_vec_qcut.json'

# ---- segmentation ---------------------------------------------------------
# Columns to segment the bin statistics by, joined with '*'.
# For example, 'is_walmart*is_bopis' computes the bin statistics for every
# combination of is_walmart and is_bopis values.
segment_expr = 'is_walmart*is_bopis'
# name of the default segment that covers all rows
seg_overall = '__OVERALL__'

# ---- binning --------------------------------------------------------------
# number of bins per variable
bin_num = 10

In [None]:
%%time

import json
from py_dpu import rename_df, loadByDriver, load_pig, save_pig, load_parquet


# Load the variable names: the file has no header row, one name per line.
variables = pd.read_csv(variable_save_path, names=['col'])['col'].to_list()


# Load the driver data set.
print('loading driver from', driver_data_dir)
df = pd.read_parquet(driver_data_dir)

# Normalise the placeholder values listed here to NaN so they count as missing.
df = df.replace(['', 'None', 'NONE', 'NA', '-999.0', -999.0], np.nan)

# 1 when the receiver id equals the configured Walmart id, otherwise 0.
# A vectorised comparison avoids calling a Python lambda once per row.
df['is_walmart'] = (df['rcvr_id'] == walmart_rcvr_id).astype(int)

# Print the shape tuple itself (the previous version wrapped it in a set literal).
print('driver shape', df.shape)



In [None]:
%%time
# Classify variables: any column that cannot be cast to float is categorical.

categorical = []

for var in variables:
    try:
        df[var].astype(float)
    except (ValueError, TypeError) as err:
        # ValueError: values such as non-numeric strings.
        # TypeError: values float() does not accept at all (e.g. list-like
        # objects); without this the scan would abort on the first such column.
        print(f'{var}', err)
        categorical.append(var)

print('total categorical num', len(categorical))

# The file is only written, never read back here, so plain 'w' is enough.
with open(categorical_var_save_path, 'w') as f:
    f.write('\n'.join(sorted(categorical)))

In [None]:
def get_var_bin_info_v2(var,
                        df,
                        bin_num,
                        categorical,
                        numerical_cut='qcut',
                        ):
    """
    Assign every row of df to a bin for variable var.

    Numerical variables (those not listed in categorical) are cast to float
    and binned with pandas: 'qcut' uses the quantiles 0, 1/bin_num, ..., 1
    (duplicate edges dropped), 'cut' uses bin_num equal-width bins.

    Categorical variables keep their (bin_num - 1) most frequent values
    (a missing value counts as a value when ranking), every other value is
    mapped to '__OTHER__', and missing values stay NaN.

    :param var: variable name, should be found in df
    :param df: pandas dataframe of driver.
    :param bin_num: binning bucket num, for categorical, will be used to get most frequent categorical value.
    :param categorical: list of categorical variables.
    :param numerical_cut: numerical column binning method, 'qcut' or 'cut'.
    :return: pandas Series aligned with df holding the bin of every row.
    :raises ValueError: if var is numerical and numerical_cut is not 'qcut' or 'cut'.
    :raises Exception: any error raised while binning is printed and re-raised.
    """

    def _convert_cate(x, top_cate):
        if pd.isna(x):
            return np.nan
        if x in top_cate.index:
            return x
        return '__OTHER__'

    is_categorical = var in categorical

    # Reject an unknown method up front; previously this fell through both
    # branches and failed later with an unrelated UnboundLocalError.
    if not is_categorical and numerical_cut not in ('qcut', 'cut'):
        raise ValueError(
            f"unsupported numerical_cut {numerical_cut!r}, expected 'qcut' or 'cut'"
        )

    try:
        if is_categorical:
            top_cate = (df[var].value_counts(dropna=False)
                               .sort_values(ascending=False)
                               .head(bin_num - 1))
            bin_boundary = df[var].map(lambda x: _convert_cate(x, top_cate))
        elif numerical_cut == 'qcut':
            bins = [i * 1.0 / bin_num for i in range(bin_num)] + [1.0]
            bin_boundary = pd.qcut(df[var].astype(float), q=bins, duplicates='drop')
        else:
            bin_boundary = pd.cut(df[var].astype(float), bins=bin_num, duplicates='drop')
    except Exception as e:
        print(f'exception for {var}')
        print(e)
        # Surface the real failure instead of an UnboundLocalError at return.
        raise

    return bin_boundary


In [None]:
%%time

# Constant-valued column: grouping by it yields a single group covering every row.
df[seg_overall] = '1'
# Split the '*'-separated segment expression into the individual column names.
segments = segment_expr.split('*')

# process and get variable bin info

def _wrap_data(var, seg_name, seg_val, bin_boundary, bin_cnt):
    """
    Build the output record for one variable within one segment value.

    :param var: variable name.
    :param seg_name: name of the segment the counts belong to.
    :param seg_val: value of that segment.
    :param bin_boundary: labels of the bins.
    :param bin_cnt: array of per-bin counts, NaN where a bin has no count.
    :return: dict with the inputs plus 'bin_cnt' (counts as floats) and
             'bin_distribution' (counts divided by their NaN-ignoring sum);
             NaN entries are written as the string 'NaN' in both lists.
    """
    shares = bin_cnt / np.nansum(bin_cnt)

    counts_out = [1.0 * cnt if not pd.isna(cnt) else 'NaN' for cnt in bin_cnt]
    shares_out = [share if not pd.isna(share) else 'NaN' for share in shares]

    return {
        'variable': var,
        'seg_name': seg_name,
        'seg_val': seg_val,
        'bin_boundary': bin_boundary,
        'bin_cnt': counts_out,
        'bin_distribution': shares_out,
    }
    

def _count_bins_by(frame, keys):
    """Count rows per bin (NaN bin included) for each group of keys; one row per group, one column per bin."""
    return frame.groupby(keys)['bin_boundary']\
                .value_counts(dropna=False)\
                .sort_index()\
                .unstack(-1)


# One record per (variable, segment name, segment value).
ret_json = []

for var in variables:
    print(f'process {var}')
    bin_df = df[segments + [seg_overall] + [var]]

    bin_boundary = get_var_bin_info_v2(var, df, bin_num, categorical)

    bin_df = bin_df.join(bin_boundary.rename('bin_boundary'))

    # get bin counter
    seg_bin_cnt = _count_bins_by(bin_df, segments)
    overall_bin_cnt = _count_bins_by(bin_df, seg_overall)

    # Give the overall table the same bin columns, in the same order, as the
    # segmented table so that one label list describes both.
    overall_bin_cnt = overall_bin_cnt.reindex(seg_bin_cnt.columns, axis=1)

    bin_label = ['NaN' if pd.isna(c) else str(c) for c in seg_bin_cnt.columns]

    # ensemble data
    for seg_name, bin_vec in zip([seg_overall, segment_expr],
                                 [overall_bin_cnt, seg_bin_cnt]):
        for index, row in bin_vec.iterrows():
            # Several grouping columns give a tuple label, a single one gives a
            # scalar. Wrap the scalar so it is neither iterated character by
            # character (strings) nor rejected as non-iterable (numbers).
            index_parts = index if isinstance(index, tuple) else (index,)
            seg_val = '*'.join([str(v) for v in index_parts])
            data = _wrap_data(var, seg_name, seg_val, bin_label, row.values)
            ret_json.append(data)


# The file is only written, so plain 'w' is enough.
with open(bin_vec_save_path, 'w') as f:
    f.write(json.dumps(ret_json, indent=4))

    