In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

In [4]:
# Cleaned BankChurners customer table; the simulated Savings column is attached to it later.
original = pd.read_csv(filepath_or_buffer="../../data/processed/BankChurners_cleaned.csv")

# Reference credit-score data; its INCOME and SAVINGS columns drive the simulation.
# Source: https://www.kaggle.com/datasets/conorsully1/credit-score?resource=download
sav = pd.read_csv(filepath_or_buffer="../../data/credit_score.csv")

In [6]:
# Work on a copy so `original` stays untouched until the Savings column is attached.
original_copy = original.copy()

# --- Bucket INCOME into categories, then one-hot encode ------------------------
# right=False makes every bin left-closed / right-open: [0, 40000), [40000, 60000), ...
bins = [0, 40000, 60000, 80000, 120000, float('inf')]
labels = ['Less than 40', '40 - 60', '60 - 80', '80 - 120', '120 +']

# Drop dummy columns left over from a previous run of this cell. Without this,
# re-running the cell appends a second set of identically named columns, the
# feature selection below returns 8 columns instead of 4, and the later
# kmeans.predict() on the 4-column BankChurners frame fails.
stale_dummy_cols = [col for col in sav.columns if str(col).startswith('Income_Category_')]
sav = sav.drop(columns=stale_dummy_cols)

sav['Income_Category'] = pd.cut(sav['INCOME'], bins = bins, labels = labels, right = False)
# drop_first=True removes the 'Less than 40' dummy, so that category is encoded as
# all zeros. Rows whose INCOME falls outside the bins (negative or NaN) also end
# up all-zero and are therefore indistinguishable from 'Less than 40'.
sav = pd.get_dummies(sav, columns=['Income_Category'], drop_first=True)

# --- Clustering ----------------------------------------------------------------
income_feature_cols = [
    'Income_Category_40 - 60',
    'Income_Category_60 - 80',
    'Income_Category_80 - 120',
    'Income_Category_120 +',
]
features = sav[income_feature_cols]

# random_state pins the centroid initialisation so cluster labels are the same
# on every Restart & Run All.
kmeans = KMeans(n_clusters = 5, n_init = 10, random_state = 42)
sav['Cluster'] = kmeans.fit_predict(features)

# --- Per-cluster normal-distribution parameters for SAVINGS --------------------
# SD is the sample standard deviation; pandas reports NaN for a cluster that
# contains a single observation.
stats = sav.groupby('Cluster')['SAVINGS'].agg(['mean', 'std']).reset_index()
stats.columns = ['Cluster', 'Mean', 'SD']

def sample(cluster):
    """Draw one simulated savings amount for the given cluster.

    The cluster's ``Mean`` and ``SD`` are read from the module-level ``stats``
    table, a value is drawn from ``Normal(Mean, SD)``, clamped at zero so that
    savings are never negative, and rounded to 2 decimal places.

    Parameters
    ----------
    cluster : label matched against ``stats['Cluster']``.

    Returns
    -------
    A non-negative number rounded to 2 decimals.

    Raises
    ------
    IndexError
        If ``stats`` has no row for ``cluster``.
    """
    # One lookup for both parameters instead of filtering the table twice.
    mean, sd = stats.loc[stats['Cluster'] == cluster, ['Mean', 'SD']].values[0]

    # A single-observation cluster has an undefined (NaN) sample SD; drawing with
    # it would yield NaN, so fall back to the cluster mean in that case.
    draw = mean if np.isnan(sd) else np.random.normal(mean, sd)

    return round(max(draw, 0), 2) #avoid savings being negative & change to 2 decimal places

# Seed NumPy's global RNG (used by sample) so the simulated savings, and hence
# the exported CSV, are identical on every Restart & Run All.
np.random.seed(42)

sav['Savings'] = sav['Cluster'].apply(sample)

# fitting into original
# Same four dummy columns, in the same order, that the KMeans model is fitted on.
income_feature_cols = [
    'Income_Category_40 - 60',
    'Income_Category_60 - 80',
    'Income_Category_80 - 120',
    'Income_Category_120 +',
]
# drop_first=False keeps every category column; only the four listed above are
# passed to the model, so the first category is still represented as all zeros.
original_copy = pd.get_dummies(original_copy, columns=['Income_Category'], drop_first = False)

# Fail fast with a readable message if the category labels in the BankChurners
# file do not produce the expected dummy column names.
missing_cols = [col for col in income_feature_cols if col not in original_copy.columns]
if missing_cols:
    raise KeyError(
        f"Income_Category dummies missing from original data: {missing_cols}; "
        "check that its category labels match those used for the credit-score data"
    )

# NOTE(review): rows with any other Income_Category value (e.g. an 'Unknown'
# label, if present) are all-zero here and land in the lowest-income cluster - confirm.
original_copy['Cluster'] = kmeans.predict(original_copy[income_feature_cols])
original_copy['Savings'] = original_copy['Cluster'].apply(sample)
original['Savings'] = original_copy['Savings']

In [7]:
# Write the table, including the simulated Savings column, to disk without the index.
original.to_csv(path_or_buf='../../data/processed/original (2).csv', index=False)