### Data Addition
Compile all additions to the data here

Update the main dataset (currently `BankChurners_more.csv`; rename it here once the final dataset name is decided) with the new data.

In [None]:
# libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity

In [2]:
# load datasets

# main dataset
bank_churners = pd.read_csv(
    '../data/processed/BankChurners_more.csv'
)

# dataset with different products and services
bank_products = pd.read_csv(
    '../data/raw/banking_product_services.csv'
)

### Data Addition:

Add dataset name, link to dataset, and other relevant details

##### Subgroup B Q1: [Bank Services and Products](https://www.kaggle.com/datasets/akhilups/insurance-product-purchase-prediction?resource=download)


**Product Options** (`A`, `B`, `C`, `D`, `E`, `F`, `G`)

Each product has 7 customizable options selected by customers; each option takes one of 2, 3, or 4 possible ordinal values.

A product is simply a vector with length 7 whose values are chosen from each of the options listed above. The cost of a product is a function of both the product options and customer characteristics.

`A`: credit cards\
`B`: savings accounts\
`C`: investments\
`D`: personal insurance\
`E`: commercial insurance\
`F`: personal loans\
`G`: commercial loans


Columns Added: `A`, `B`, `C`, `D`, `E`, `F`, `G`, `risk_factor`

In [4]:
def generate_new_data(df_target, df_source, colname, lowerlim=0, kernel='gaussian',
                      upperlim=None, bandwidth=1, random_state=None):
    '''
    Returns dataframe with new data added to it from the source.

    A kernel density estimate is fitted on df_source[colname]; one value per
    row of df_target is then sampled from it, rounded to the nearest integer,
    clamped to [lowerlim, upperlim] and stored in df_target[colname].

    Inputs:
    df_target: dataframe where data will be added (modified in place)
    df_source: dataframe where data is sourced from
    colname: column name to be added
    Optional inputs:
    lowerlim: lower limit of data added (None disables the lower clamp)
    kernel: type of distribution used; KernelDensity.sample only supports
            'gaussian' and 'tophat'
    upperlim: upper limit of data added (None, the default, disables the upper clamp)
    bandwidth: bandwidth of the kernel density estimate
    random_state: seed or RandomState forwarded to KernelDensity.sample for
                  reproducible draws (None uses NumPy's global random state)
    Returns:
    df_target itself, with the new column added
    Raises:
    ValueError: if df_target has rows to fill but df_source has none
    '''
    n_new = len(df_target)
    if n_new == 0:
        # Nothing to generate: still add the (empty) column so the schema is consistent
        df_target[colname] = np.empty(0, dtype=int)
        return df_target

    source_values = df_source[colname].to_numpy().reshape(-1, 1)
    if source_values.shape[0] == 0:
        raise ValueError(
            f"Cannot generate '{colname}': the source dataframe has no rows to fit a distribution on"
        )

    kde_new = KernelDensity(bandwidth=bandwidth, kernel=kernel)
    kde_new.fit(source_values)

    new_samples = kde_new.sample(n_new, random_state=random_state).reshape(-1)
    new_values = np.round(new_samples).astype(int)

    # The KDE smooths beyond the observed range, so clamp back to the allowed limits
    if lowerlim is not None:
        new_values = np.maximum(new_values, lowerlim)
    if upperlim is not None:
        new_values = np.minimum(new_values, upperlim)

    df_target[colname] = new_values

    return df_target

In [None]:
# Merge datasets by getting cluster distribution
# Both datasets are reduced to the same two segment keys:
# 'Dependent_count' and 'Transaction_Category'.
df_target = bank_churners.copy()
join_cols = ['Dependent_count', 'Transaction_Category']
cols_to_merge = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'risk_factor']

# Dependent counts above this value are pooled into it in the main dataset.
# NOTE(review): presumably chosen so it matches the largest group_size - 1 in the source - confirm.
MAX_DEPENDENTS = 3


def _transaction_category(amounts, low_cut, high_cut):
    '''Label each amount 'low' (<= low_cut), 'high' (>= high_cut) or 'medium' otherwise.'''
    return np.where(
        amounts <= low_cut, 'low',
        np.where(amounts >= high_cut, 'high', 'medium')
    )


# Obtain categories in source dataset (products and services)
# assign/fillna return new frames, so the raw bank_products frame is left untouched.
# risk_factor gaps get the median; every other missing value becomes 0.
products_df = (
    bank_products
    .assign(risk_factor=bank_products['risk_factor'].fillna(bank_products['risk_factor'].median()))
    .fillna(0)
)

tenth_source = products_df['cost'].quantile(0.10)
ninetieth_source = products_df['cost'].quantile(0.90)

# group_size is shifted down by one and renamed so it can be matched against Dependent_count
products_df = (
    products_df
    .assign(
        Transaction_Category=_transaction_category(products_df['cost'], tenth_source, ninetieth_source),
        group_size=products_df['group_size'] - 1,
    )
    .rename(columns={'group_size': 'Dependent_count'})
)

# Obtain categories in main dataset
tenth_target = df_target['Total_Trans_Amt'].quantile(0.10)
ninetieth_target = df_target['Total_Trans_Amt'].quantile(0.90)

df_target['Transaction_Category'] = _transaction_category(
    df_target['Total_Trans_Amt'], tenth_target, ninetieth_target
)
df_target['Dependent_count'] = df_target['Dependent_count'].clip(upper=MAX_DEPENDENTS)

# Independent copy: later cells add columns to this frame
df_target = df_target[['CLIENTNUM'] + join_cols].copy()


In [None]:
# Generate new data by using cluster distribution

# KernelDensity.sample draws from NumPy's global random state when it is not
# given a random_state, so seeding it here makes the generated columns (and
# therefore the saved dataset) identical on every run.
SEED = 42
np.random.seed(SEED)

# Detach from the frame df_target was sliced from so the .loc writes below
# always land on this frame (avoids SettingWithCopyWarning).
df_target = df_target.copy()

dependentcount = df_target['Dependent_count'].unique()
transcat = df_target['Transaction_Category'].unique()

for c in dependentcount:
    for t in transcat:
        # Rows of the main dataset in the current segment (computed once per segment)
        target_mask = (
            (df_target['Dependent_count'] == c) &
            (df_target['Transaction_Category'] == t)
        )
        if not target_mask.any():
            # This combination does not occur in the main dataset: nothing to generate
            continue

        # Filter the source dataset to match the current segment
        source = products_df[
            (products_df['Dependent_count'] == c) &
            (products_df['Transaction_Category'] == t)
        ]
        if source.empty:
            raise ValueError(
                f"No source rows for segment Dependent_count={c!r}, "
                f"Transaction_Category={t!r}; cannot fit a distribution"
            )

        # generate_new_data adds the column to the frame it receives, so give it
        # an independent copy of the segment rather than a slice of df_target.
        data = df_target.loc[target_mask, ['CLIENTNUM']].copy()

        for col in cols_to_merge:
            # Use cluster distribution to get new data
            data = generate_new_data(data, source, col)

            # Update main dataset (values align on the row index)
            df_target.loc[target_mask, col] = data[col]

# Obtain dataset with 'CLIENTNUM' as primary key
bank_data = df_target[['CLIENTNUM']+cols_to_merge]


In [None]:
# Attach the generated columns to the main dataset, matching rows on CLIENTNUM
# (default inner join).
# NOTE(review): assumes CLIENTNUM is unique in both frames; repeated keys would
# multiply rows in the result - confirm against the data.
df = pd.merge(bank_churners, bank_data, on='CLIENTNUM')
display(df)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Month_with_bank,...,Total_Trans_Count,Avg_Utilization_Ratio,A,B,C,D,E,F,G,risk_factor
0,768805383,1,45,1,3,High School,1,2,1,39,...,42,0.061,0.0,0.0,3.0,2.0,1.0,0.0,0.0,1.0
1,818770008,1,49,0,5,Graduate,0,4,1,44,...,33,0.105,1.0,0.0,4.0,2.0,0.0,0.0,0.0,3.0
2,713982108,1,51,1,3,Graduate,1,3,1,36,...,20,0.000,2.0,0.0,0.0,3.0,0.0,0.0,3.0,3.0
3,709106358,1,40,1,3,Uneducated,1,2,1,21,...,28,0.000,1.0,0.0,0.0,3.0,0.0,0.0,3.0,1.0
4,713061558,1,44,1,2,Graduate,1,1,1,36,...,24,0.311,0.0,0.0,1.0,2.0,1.0,0.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110119,739115251,1,32,0,1,Graduate,0,4,1,16,...,79,0.627,2.0,0.0,2.0,2.0,1.0,0.0,3.0,0.0
110120,531828724,1,37,0,2,Uneducated,0,4,1,36,...,86,0.599,2.0,1.0,1.0,2.0,2.0,3.0,3.0,4.0
110121,916155331,1,42,1,5,Graduate,1,3,1,28,...,89,0.000,0.0,0.0,0.0,4.0,0.0,1.0,1.0,5.0
110122,447168581,1,47,0,4,College,1,4,1,26,...,86,0.007,0.0,1.0,0.0,5.0,0.0,2.0,2.0,4.0


### Save dataset generated

In [18]:
# Write the merged dataset to the processed-data folder.
# NOTE(review): the row index is written as an unnamed first column (to_csv default);
# confirm downstream readers expect it before switching to index=False.
output_path = '../data/processed/product_services_data.csv'
df.to_csv(output_path)