In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from package.utils import DotDict, timer, get_config, load_data, merge_data

# from package.clustering import CONFIG_PATH

# conf = get_config(CONFIG_PATH)

from sklearn.preprocessing import StandardScaler

In [26]:
from package.clustering.get_model import get_model
from package.clustering.predict import predict_model, add_labels

# @timer
def train_model(model, X):
    """Fit ``model`` on ``X`` and return the result of the ``fit`` call.

    Args:
        model: Any object exposing a ``fit(X)`` method.
        X: Training data, passed to ``fit`` unchanged.

    Returns:
        Whatever ``model.fit(X)`` returns.
    """
    fitted = model.fit(X)
    return fitted

In [27]:
def run_test_data(raw_df, scaled_df, k_values=range(2, 5), model_name='kmeans', random_state=0):
    """Cluster ``scaled_df`` once per ``k`` and attach the labels to ``raw_df``.

    For every ``k`` in ``k_values`` a model is built with ``get_model``, fitted
    on a copy of ``scaled_df``, and its predictions are turned into a frame by
    ``add_labels``. That frame's ``labels`` column is renamed to ``k{k}`` and
    the frame is indexed by ``party_rk``. All per-k frames are then
    concatenated column-wise next to ``raw_df``.

    Args:
        raw_df: Frame whose index aligns with ``scaled_df``'s index; its
            columns come first in the output.
        scaled_df: Feature frame the models are trained and predicted on.
        k_values: Iterable of cluster counts to try. Defaults to 2, 3, 4,
            the values that were previously hard-coded.
        model_name: Method name forwarded to ``get_model`` and
            ``predict_model``. Defaults to ``'kmeans'``.
        random_state: Seed placed in the model parameters. Defaults to 0.

    Returns:
        A frame with the index reset into a ``party_rk`` column, followed by
        the columns of ``raw_df`` and of every per-k result frame.
    """
    X = scaled_df.copy()

    # Collect per-k frames and concatenate once at the end instead of
    # re-concatenating a growing frame on every iteration.
    label_frames = []
    for k in k_values:
        model_param = {'n_clusters': k, 'random_state': random_state}

        model = get_model(method=model_name, params=model_param)
        model = train_model(model=model, X=X)
        labels = predict_model(model=model, method=model_name, X=X)

        # add_labels must return a frame with 'labels' and 'party_rk' columns.
        # NOTE(review): if it also carries the feature columns of X, those are
        # repeated once per k in the output (unchanged from before) - confirm
        # whether only the k{k} column is wanted.
        result = (
            add_labels(X=X, labels=labels)
            .rename(columns={'labels': f'k{k}'})
            .set_index('party_rk')
        )
        label_frames.append(result)

    # A single column-wise concat; with an empty k_values this is just raw_df.
    main_df = pd.concat(objs=[raw_df, *label_frames], axis=1)
    main_df.index.name = 'party_rk'

    return main_df.reset_index()

In [28]:
# Values of each segmentation dimension; a group name joins one value from
# each dimension with underscores.
groupbehavior_list = ['99pct', '1hold', '0anp', 'other']
groupfamily_list = ['family', 'nofamily']
grouprider_list = ['rider', 'norider']
grouplifestage_list = ['61+']

# Every combination of the dimensions, with behavior varying slowest and
# lifestage fastest.
group_list = [
    f'{behavior}_{family}_{rider}_{lifestage}'
    for behavior in groupbehavior_list
    for family in groupfamily_list
    for rider in grouprider_list
    for lifestage in grouplifestage_list
]

In [29]:

# Record the row count of each group's feature file.
df_dict = {'group': [], 'size': []}
for group in group_list:
    path = f's3://tli-crm-segmentation/data-zones/neutral/grouped/feature_{group}.csv'
    raw_df = load_data(path=path).set_index('party_rk')
    n_rows = raw_df.shape[0]
    df_dict['group'] += [group]
    df_dict['size'] += [n_rows]

# One row per group, plus each group's share of the overall row count.
size_df = pd.DataFrame(data=df_dict)
total_cus = size_df['size'].sum()
size_df['percent'] = size_df['size'].div(total_cus)

function: load_data is starting...
loading data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_99pct_family_rider_61+.csv
Succesfully loaded data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_99pct_family_rider_61+.csv
function: load_data successfully executed at 0.10896706581115723s
function: load_data is starting...
loading data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_99pct_family_norider_61+.csv
Succesfully loaded data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_99pct_family_norider_61+.csv
function: load_data successfully executed at 0.039394378662109375s
function: load_data is starting...
loading data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_99pct_nofamily_rider_61+.csv
Succesfully loaded data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_99pct_nofamily_rider_61+.csv
function: load_data successfully executed at 0.07498455047607422s
function: loa

In [30]:
# Display each group's row count and its share of the total.
size_df

Unnamed: 0,group,size,percent
0,99pct_family_rider_61+,886,0.000991
1,99pct_family_norider_61+,880,0.000984
2,99pct_nofamily_rider_61+,1580,0.001767
3,99pct_nofamily_norider_61+,4532,0.005069
4,1hold_family_rider_61+,16272,0.0182
5,1hold_family_norider_61+,54007,0.060406
6,1hold_nofamily_rider_61+,120549,0.134831
7,1hold_nofamily_norider_61+,366390,0.409798
8,0anp_family_rider_61+,1,1e-06
9,0anp_family_norider_61+,12378,0.013844


In [31]:
# Groups left out of the clustering run. In the size table,
# '0anp_family_rider_61+' holds a single row.
# NOTE(review): the size of '0anp_nofamily_rider_61+' is not visible in the
# displayed table - confirm why it is excluded.
excluded_groups = {'0anp_family_rider_61+', '0anp_nofamily_rider_61+'}

# Filter in place (same list object) rather than calling list.remove, so
# re-running this cell does not raise ValueError once the entries are gone.
group_list[:] = [group for group in group_list if group not in excluded_groups]

In [32]:
# Destination prefix for the per-group experiment files written below.
landing_path = 's3://tli-crm-segmentation/data-test/load_dt=20250606'

for group in group_list:
    # Load the group's feature file and its normalized counterpart, both
    # indexed by party_rk.
    path = f's3://tli-crm-segmentation/data-zones/neutral/grouped/feature_{group}.csv'
    raw_df = load_data(path=path).set_index('party_rk')

    path = f's3://tli-crm-segmentation/data-zones/neutral/grouped/feature_normalized_{group}.csv'
    scaled_df = load_data(path=path).set_index('party_rk')

    if 'norider' in group:
        # For 'norider' groups, drop every normalized column whose name
        # contains '_health' or '_ci' before clustering.
        col_list = [
            col
            for col in scaled_df.columns
            if not ('_health' in col or '_ci' in col)
        ]
        scaled_df = scaled_df[col_list]

    # Cluster on the normalized features and attach the labels to the
    # unnormalized frame.
    proxy = run_test_data(raw_df, scaled_df)

    # One CSV per group, named after the upper-cased group.
    file_name = f'EXPERIMENT_CRM_SEGMENTATION_{group.upper()}_KMEANS'
    proxy.to_csv(f'{landing_path}/{file_name}.csv', index=False)

function: load_data is starting...
loading data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_99pct_family_rider_61+.csv
Succesfully loaded data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_99pct_family_rider_61+.csv
function: load_data successfully executed at 0.09837150573730469s
function: load_data is starting...
loading data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_normalized_99pct_family_rider_61+.csv
Succesfully loaded data from: s3://tli-crm-segmentation/data-zones/neutral/grouped/feature_normalized_99pct_family_rider_61+.csv
function: load_data successfully executed at 0.0695030689239502s
function: get_model is starting...
function: get_model successfully executed at 3.266334533691406e-05s
function: predict_model is starting...
function: predict_model successfully executed at 0.0003948211669921875s
function: get_model is starting...
function: get_model successfully executed at 3.0994415283203125e-05s
function: