In [88]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cluster import KMeans
from sdv.single_table import GaussianCopulaSynthesizer
from sklearn.preprocessing import MinMaxScaler

## Data cleaning

In [431]:
original = pd.read_csv("../data/BankChurners.csv")
balance_df = pd.read_csv("../data/botswana_bank_customer_churn.csv")

In [433]:
# removing irrelevent columns
original = original.drop(original.columns[[-1, -2]], axis=1)
original = original.drop(columns=['Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1','Contacts_Count_12_mon','Total_Ct_Chng_Q4_Q1'])

# renaming the datasets
original = original.rename(columns={'Months_on_book' : 'Month_with_bank',
                                    'Total_Relationship_Count' : 'No_of_product',
                                    'Total_Trans_Ct' : 'Total_Trans_Count'})

# removing Na from the dataset
original_Unknown = original[original.isin(['Unknown']).any(axis=1)] # someone handle the unknown please
original = original[~original.isin(['Unknown']).any(axis=1)]


In [435]:
# this function will remove the k,$ and + sign in the income category column
def clean_col(x):
        if 'K' in x:
            return x.replace('K','').replace('$','')
        elif '+' in x:
            return x.replace('+','')
        elif x =='Less than 40':
            return x.split()[2]
        return x
    
original['Income_Category']=original['Income_Category'].apply(clean_col)

In [437]:
# Converting object into category
categorical_features = ['Attrition_Flag','Gender','Education_Level','Marital_Status','Income_Category','Card_Category']
for category in categorical_features:
    original[category] = original[category].astype('category')

## Generating synthetic data

### Feature selection

In [417]:
# subsetting from the original data
required_features = ["CLIENTNUM","Customer_Age","Gender","Income_Category","No_of_product"]
subset_original = original.loc[:,required_features]

# subsetting from the income data
required_features2 = ['Date of Birth','Gender','Income','NumOfProducts']
subset_balance = balance_df.loc[:,required_features2]

### Data Cleaning

In [420]:
# Changing date of birth to date time and them convert it to age
subset_balance['Date of Birth'] = pd.to_datetime(subset_balance['Date of Birth'])
reference_date = pd.Timestamp('2024-01-01')
subset_balance['Date of Birth'] = reference_date.year - subset_balance['Date of Birth'].dt.year

# Changing income into income category
bins = [0, 40000, 60000, 80000, 120000, float('inf')]
labels = ['Less than 40', '40 - 60', '60 - 80', '80 - 120', '120 +']
subset_balance['Income'] = pd.cut(subset_balance['Income'], bins=bins, labels=labels, right=False)

# Recoding Male to M...
subset_balance['Gender'] = subset_balance['Gender'].replace({'Male':'M','Female':'F'})

# Renaming the balance_subset dataframe
subset_balance = subset_balance.rename(columns = {'Date of Birth' : "Customer_Age", 
                                        'Income' : "Income_Category", 'NumOfProducts':"No_of_product"})

# Converting object to category
subset_balance['Gender'] = subset_balance['Gender'].astype('category')

# Scaling 
scaler = MinMaxScaler()
subset_balance[['Customer_Age','No_of_product']] = scaler.fit_transform(subset_balance[['Customer_Age','No_of_product']])
subset_original[['Customer_Age','No_of_product']] = scaler.fit_transform(subset_original[['Customer_Age','No_of_product']])


### Savings and Investment

In [451]:
#Savings
from sklearn.cluster import KMeans

original_copy = original
sav = pd.read_csv("../data/credit_score.csv") 
# https://www.kaggle.com/datasets/conorsully1/credit-score?resource=download

# convert income into income category then one-hot encoding
bins = [0, 40000, 60000, 80000, 120000, float('inf')]
labels = ['Less than 40', '40 - 60', '60 - 80', '80 - 120', '120 +']
sav['Income_Category'] = pd.cut(sav['INCOME'], bins = bins, labels = labels, right = False)
sav = pd.get_dummies(sav, columns=['Income_Category'], drop_first=True)

# clustering
features = sav[['Income_Category_40 - 60', 'Income_Category_60 - 80', 'Income_Category_80 - 120', 'Income_Category_120 +']]

kmeans = KMeans(n_clusters=3)
sav['Cluster'] = kmeans.fit_predict(features)

# fitting normal distribution
stats = sav.groupby('Cluster')['SAVINGS'].agg(['mean', 'std']).reset_index()
stats.columns = ['Cluster', 'Mean', 'SD']

def sample(cluster):
    mean = stats.loc[stats['Cluster'] == cluster, 'Mean'].values[0]
    sd = stats.loc[stats['Cluster'] == cluster, 'SD'].values[0]
    return round(max(np.random.normal(mean, sd), 0), 2) #avoid savings being negative & change to 2 decimal places

sav['Savings'] = sav['Cluster'].apply(sample)

# fitting into original
original_copy = pd.get_dummies(original_copy, columns=['Income_Category'], drop_first = False)
original_copy['Cluster'] = kmeans.predict(original_copy[['Income_Category_40 - 60', 'Income_Category_60 - 80', 'Income_Category_80 - 120', 'Income_Category_120 +']])
original_copy['Savings'] = original_copy['Cluster'].apply(sample)
original['Savings'] = original_copy['Savings']

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Month_with_bank,No_of_product,Months_Inactive_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Count,Avg_Utilization_Ratio,Savings
0,768805383,Existing Customer,45,M,3,High School,Married,60 - 80,Blue,39,5,1,12691.0,777,1144,42,0.061,382432.76
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than 40,Blue,44,6,1,8256.0,864,1291,33,0.105,113478.31
2,713982108,Existing Customer,51,M,3,Graduate,Married,80 - 120,Blue,36,4,1,3418.0,0,1887,20,0.0,279771.55
4,709106358,Existing Customer,40,M,3,Uneducated,Married,60 - 80,Blue,21,5,1,4716.0,0,816,28,0.0,176644.77
5,713061558,Existing Customer,44,M,2,Graduate,Married,40 - 60,Blue,36,3,1,4010.0,1247,1088,24,0.311,0.0


In [501]:
# Investment
from faker import Faker

faker = Faker()
# building fake investment based on their income level and savings
def investment(row):
    base = faker.random_int(1200, 12000) #invest $100-$1000 per month
    if row['Income_Category'] == 'Less than 40':
        factor = 1
    elif row['Income_Category'] == '40 - 60':
        factor = 1.5
    elif row['Income_Category'] == '60 - 80':
        factor = 2
    elif row['Income_Category'] == '80 - 120':
        factor = 2
    elif row['Income_Category'] == '120 +':
        factor = 2.5
    # take into account their savings
    save = min(row['Savings']/10000, 5)
    if row['Savings'] == 0.00:
        save = 1 #they can still invest with no savings
    # if their savings is small: using a basing investment off a portion of that
    investment = base * factor * save
    return round(investment, 2)

original['Investment'] = original.apply(investment, axis = 1)

### Fitting model, KMEANS

In [515]:
original.iloc[4]

CLIENTNUM                         713061558
Attrition_Flag            Existing Customer
Customer_Age                             44
Gender                                    M
Dependent_count                           2
Education_Level                    Graduate
Marital_Status                      Married
Income_Category                     40 - 60
Card_Category                          Blue
Month_with_bank                          36
No_of_product                             3
Months_Inactive_12_mon                    1
Credit_Limit                         4010.0
Total_Revolving_Bal                    1247
Total_Trans_Amt                        1088
Total_Trans_Count                        24
Avg_Utilization_Ratio                 0.311
Savings                                 0.0
Investment                           9669.0
Name: 5, dtype: object

In [483]:
min(113.31 / 10000, 2.0)


0.011331