In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cluster import KMeans
from sdv.single_table import GaussianCopulaSynthesizer
from sklearn.preprocessing import MinMaxScaler

## Data cleaning

In [3]:
original = pd.read_csv("../data/BankChurners.csv")
balance_df = pd.read_csv("../data/botswana_bank_customer_churn.csv")

In [4]:
# removing irrelevent columns
original = original.drop(original.columns[[-1, -2]], axis=1)
original = original.drop(columns=['Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1','Contacts_Count_12_mon','Total_Ct_Chng_Q4_Q1'])

# renaming the datasets
original = original.rename(columns={'Months_on_book' : 'Month_with_bank',
                                    'Total_Relationship_Count' : 'No_of_product',
                                    'Total_Trans_Ct' : 'Total_Trans_Count'})

# removing Na from the dataset
original_Unknown = original[original.isin(['Unknown']).any(axis=1)] # someone handle the unknown please
original = original[~original.isin(['Unknown']).any(axis=1)]


In [5]:
# this function will remove the k,$ and + sign in the income category column
def clean_col(x):
        if 'K' in x:
            return x.replace('K','').replace('$','')
        elif '+' in x:
            return x.replace('+','')
        elif x =='Less than 40':
            return x.split()[2]
        return x
    
original['Income_Category']=original['Income_Category'].apply(clean_col)

In [6]:
# Converting object into category
categorical_features = ['Attrition_Flag','Gender','Education_Level','Marital_Status','Income_Category','Card_Category']
for category in categorical_features:
    original[category] = original[category].astype('category')

## Generating synthetic data

### Feature selection

In [7]:
# subsetting from the original data
required_features = ["CLIENTNUM","Customer_Age","Gender","Income_Category","No_of_product"]
subset_original = original.loc[:,required_features]

# subsetting from the income data
required_features2 = ['Date of Birth','Gender','Income','NumOfProducts']
subset_balance = balance_df.loc[:,required_features2]

### Data Cleaning

In [8]:
# Changing date of birth to date time and them convert it to age
subset_balance['Date of Birth'] = pd.to_datetime(subset_balance['Date of Birth'])
reference_date = pd.Timestamp('2024-01-01')
subset_balance['Date of Birth'] = reference_date.year - subset_balance['Date of Birth'].dt.year

# Changing income into income category
bins = [0, 40000, 60000, 80000, 120000, float('inf')]
labels = ['Less than 40', '40 - 60', '60 - 80', '80 - 120', '120 +']
subset_balance['Income'] = pd.cut(subset_balance['Income'], bins=bins, labels=labels, right=False)

# Recoding Male to M...
subset_balance['Gender'] = subset_balance['Gender'].replace({'Male':'M','Female':'F'})

# Renaming the balance_subset dataframe
subset_balance = subset_balance.rename(columns = {'Date of Birth' : "Customer_Age", 
                                        'Income' : "Income_Category", 'NumOfProducts':"No_of_product"})

# Converting object to category
subset_balance['Gender'] = subset_balance['Gender'].astype('category')

# Scaling 
scaler = MinMaxScaler()
subset_balance[['Customer_Age','No_of_product']] = scaler.fit_transform(subset_balance[['Customer_Age','No_of_product']])
subset_original[['Customer_Age','No_of_product']] = scaler.fit_transform(subset_original[['Customer_Age','No_of_product']])


### Savings and Investment

In [9]:
#Savings
from sklearn.cluster import KMeans

original_copy = original
sav = pd.read_csv("../data/credit_score.csv") 
# https://www.kaggle.com/datasets/conorsully1/credit-score?resource=download

# convert income into income category then one-hot encoding
bins = [0, 40000, 60000, 80000, 120000, float('inf')]
labels = ['Less than 40', '40 - 60', '60 - 80', '80 - 120', '120 +']
sav['Income_Category'] = pd.cut(sav['INCOME'], bins = bins, labels = labels, right = False)
sav = pd.get_dummies(sav, columns=['Income_Category'], drop_first=True)

# clustering
features = sav[['Income_Category_40 - 60', 'Income_Category_60 - 80', 'Income_Category_80 - 120', 'Income_Category_120 +']]

kmeans = KMeans(n_clusters=3)
sav['Cluster'] = kmeans.fit_predict(features)

# fitting normal distribution
stats = sav.groupby('Cluster')['SAVINGS'].agg(['mean', 'std']).reset_index()
stats.columns = ['Cluster', 'Mean', 'SD']

def sample(cluster):
    mean = stats.loc[stats['Cluster'] == cluster, 'Mean'].values[0]
    sd = stats.loc[stats['Cluster'] == cluster, 'SD'].values[0]
    return round(max(np.random.normal(mean, sd), 0), 2) #avoid savings being negative & change to 2 decimal places

sav['Savings'] = sav['Cluster'].apply(sample)

# fitting into original
original_copy = pd.get_dummies(original_copy, columns=['Income_Category'], drop_first = False)
original_copy['Cluster'] = kmeans.predict(original_copy[['Income_Category_40 - 60', 'Income_Category_60 - 80', 'Income_Category_80 - 120', 'Income_Category_120 +']])
original_copy['Savings'] = original_copy['Cluster'].apply(sample)
original['Savings'] = original_copy['Savings']

In [10]:
# Investment
from faker import Faker

faker = Faker()
# building fake investment based on their income level and savings
def investment(row):
    base = faker.random_int(1200, 12000) #invest $100-$1000 per month
    if row['Income_Category'] == 'Less than 40':
        factor = 1
    elif row['Income_Category'] == '40 - 60':
        factor = 1.5
    elif row['Income_Category'] == '60 - 80':
        factor = 2
    elif row['Income_Category'] == '80 - 120':
        factor = 2
    elif row['Income_Category'] == '120 +':
        factor = 2.5
    # take into account their savings
    save = min(row['Savings']/10000, 5)
    if row['Savings'] == 0.00:
        save = 1 #they can still invest with no savings
    # if their savings is small: using a basing investment off a portion of that
    investment = base * factor * save
    return round(investment, 2)

original['Investment'] = original.apply(investment, axis = 1)

### Fitting model, KMEANS

In [11]:
original.iloc[4]

CLIENTNUM                         713061558
Attrition_Flag            Existing Customer
Customer_Age                             44
Gender                                    M
Dependent_count                           2
Education_Level                    Graduate
Marital_Status                      Married
Income_Category                     40 - 60
Card_Category                          Blue
Month_with_bank                          36
No_of_product                             3
Months_Inactive_12_mon                    1
Credit_Limit                         4010.0
Total_Revolving_Bal                    1247
Total_Trans_Amt                        1088
Total_Trans_Count                        24
Avg_Utilization_Ratio                 0.311
Savings                           248451.92
Investment                          76650.0
Name: 5, dtype: object

In [12]:
min(113.31 / 10000, 2.0)


0.011331

In [None]:
### Fee Sensitivity

In [13]:
fake = Faker()

def generate_fake_data(num_samples):
    fake_data = []
    for _ in range(num_samples):
        loan_amount = np.random.randint(100, 10000)  # Random loan amount between 100 and 10,000
        account_balance = np.random.randint(0, 20000)  # Random balance between 0 and 20,000
        fee_sensitivity = np.random.uniform(1.0, 5.0)  # Random fee sensitivity between 1 and 5
        fake_data.append({
            'fee_sensitivity': fee_sensitivity,
            'outstanding_loan': loan_amount,
            'account_balance': account_balance
        })
    return pd.DataFrame(fake_data)

new_data = generate_fake_data(100)

# Clustering
kmeans = KMeans(n_clusters=3)  # I chose 3 clusters
new_data['cluster'] = kmeans.fit_predict(new_data[['fee_sensitivity']])

# Normal distributions for each cluster
clusters_stats = {}
for cluster in new_data['cluster'].unique():
    cluster_data = new_data[new_data['cluster'] == cluster]
    mean_loan = cluster_data['outstanding_loan'].mean()
    std_loan = cluster_data['outstanding_loan'].std()
    mean_balance = cluster_data['account_balance'].mean()
    std_balance = cluster_data['account_balance'].std()
    clusters_stats[cluster] = {
        'outstanding_loan': (mean_loan, std_loan),
        'account_balance': (mean_balance, std_balance)
    }


def sample_fake_data(row):
    # Assume a placeholder for fee sensitivity since it's not in original
    cluster_label = kmeans.predict([[row.get('fee_sensitivity', np.nan)]])[0]
    mean_loan, std_loan = clusters_stats[cluster_label]['outstanding_loan']
    mean_balance, std_balance = clusters_stats[cluster_label]['account_balance']
    
    # Sample from the normal distribution
    sampled_loan = np.random.normal(mean_loan, std_loan)
    sampled_balance = np.random.normal(mean_balance, std_balance)
    
    return pd.Series([sampled_loan, sampled_balance])

# Add fake data to the original dataset
original[['sampled_outstanding_loan', 'sampled_account_balance']] = original.apply(sample_fake_data, axis=1)

print(original.head())



ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# Campaign Effectiveness

In [None]:
campaign = pd.read_csv("../data/campaign_data.csv", sep = ';') 
# https://www.kaggle.com/datasets/prakharrathi25/banking-dataset-marketing-targets?select=test.csv
# training data was used since it was much larger than test data (randomly selected rows from training data)

# data that will be added
# campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
# pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
# previous: number of contacts performed before this campaign and for this client (numeric)
# poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")
# y - has the client subscribed a term deposit? (binary: "yes","no")

# campaign (using normal d/b)
np.random.seed(10) # setting seed for reproducibility
mean_campaign = campaign['campaign'].mean()
sd_campaign = campaign['campaign'].std()
n = len(original)
original['Num_of_Contacts_Made'] = np.random.normal(mean_campaign, sd_campaign, n)
original['Num_of_Contacts_Made'] = np.round(np.maximum(original['Num_of_Contacts_Made'], 0), 0)

# pdays (using normal d/b)
np.random.seed(10) # setting seed for reproducibility
mean_pdays = campaign['pdays'].mean()
sd_pdays = campaign['pdays'].std()
original['Last_Contacted'] = np.random.normal(mean_pdays, sd_pdays, n)
original['Last_Contacted'] = np.round(np.maximum(original['Last_Contacted'], -1), 0)

# previous (using normal d/b)
np.random.seed(10) # setting seed for reproducibility
mean_previous = campaign['previous'].mean()
sd_previous = campaign['previous'].std()
original['Last_Campaign_Contact'] = np.random.normal(mean_previous, sd_previous, n)
original['Last_Campaign_Contact'] = np.round(np.maximum(original['Last_Campaign_Contact'], 0), 0)

# poutcomes
poutcome_counts = campaign['poutcome'].value_counts()
poutcomes = poutcome_counts.index
n_campaign = len(campaign)
p_poutcome = (poutcome_counts/n_campaign).tolist()
original['Last_Campaign_Outcome'] = np.random.choice(poutcomes, size = n, p = p_poutcome)

# y 
y_counts = campaign['y'].value_counts()
y_outcomes = y_counts.index
p_y = (y_counts/n_campaign).tolist()
original['Outcome'] = np.random.choice(y_outcomes, size = n, p = p_y)

Digital Banking Behaviour

Dataset: https://www.kaggle.com/datasets/mikhail1681/user-churn

relevant features: PhoneService, InternetService, TechSupport

In [18]:
original.info()


<class 'pandas.core.frame.DataFrame'>
Index: 7081 entries, 0 to 10126
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   CLIENTNUM               7081 non-null   int64   
 1   Attrition_Flag          7081 non-null   category
 2   Customer_Age            7081 non-null   int64   
 3   Gender                  7081 non-null   category
 4   Dependent_count         7081 non-null   int64   
 5   Education_Level         7081 non-null   category
 6   Marital_Status          7081 non-null   category
 7   Income_Category         7081 non-null   category
 8   Card_Category           7081 non-null   category
 9   Month_with_bank         7081 non-null   int64   
 10  No_of_product           7081 non-null   int64   
 11  Months_Inactive_12_mon  7081 non-null   int64   
 12  Credit_Limit            7081 non-null   float64 
 13  Total_Revolving_Bal     7081 non-null   int64   
 14  Total_Trans_Amt         7081

In [21]:
print(original.head())

   CLIENTNUM     Attrition_Flag  Customer_Age Gender  Dependent_count  \
0  768805383  Existing Customer            45      M                3   
1  818770008  Existing Customer            49      F                5   
2  713982108  Existing Customer            51      M                3   
4  709106358  Existing Customer            40      M                3   
5  713061558  Existing Customer            44      M                2   

  Education_Level Marital_Status Income_Category Card_Category  \
0     High School        Married         60 - 80          Blue   
1        Graduate         Single    Less than 40          Blue   
2        Graduate        Married        80 - 120          Blue   
4      Uneducated        Married         60 - 80          Blue   
5        Graduate        Married         40 - 60          Blue   

   Month_with_bank  No_of_product  Months_Inactive_12_mon  Credit_Limit  \
0               39              5                       1       12691.0   
1             

In [17]:
technical = pd.read_csv("../data/User churn.csv")

# print(original.head())
# original.info()
technical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [20]:
# Define common features
common_features = ['gender', 'SeniorCitizen', 'Partner', 'Churn', 'Dependents']

# Calculate the distribution of PhoneService
phone_service_distribution = (
    technical.groupby(common_features)['PhoneService']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
)

# Calculate the distribution of InternetService
internet_service_distribution = (
    technical.groupby(common_features)['InternetService']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
)

# Calculate the distribution of TechSupport
tech_support_distribution = (
    technical.groupby(common_features)['TechSupport']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
)

print(phone_service_distribution)
print(internet_service_distribution)
print(tech_support_distribution)


PhoneService                                         No       Yes
gender SeniorCitizen Partner Churn Dependents                    
Female 0             No      No    No          0.112211  0.887789
                                   Yes         0.154545  0.845455
                             Yes   No          0.076923  0.923077
                                   Yes         0.062500  0.937500
                     Yes     No    No          0.076763  0.923237
                                   Yes         0.096503  0.903497
                             Yes   No          0.131579  0.868421
                                   Yes         0.072581  0.927419
       1             No      No    No          0.075472  0.924528
                                   Yes         0.000000  1.000000
                             Yes   No          0.101266  0.898734
                                   Yes         0.000000  1.000000
                     Yes     No    No          0.044118  0.955882
          

In [28]:
original['gender'] = original['Gender'].apply(lambda x: 'Male' if x == 'M' else 'Female')
original['SeniorCitizen'] = (original['Customer_Age'] > 60).astype(int)
original['Partner'] = original['Marital_Status'].apply(lambda x: 'Yes' if x == 'Married' else 'No')
original['Dependents'] = original['Dependent_count'].apply(lambda x: 'Yes' if x > 0 else 'No')
original['Churn'] = original['Attrition_Flag'].apply(lambda x: 'Yes' if x == 'Existing Customer' else 'No')

original.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Month_with_bank,...,Total_Trans_Count,Avg_Utilization_Ratio,Savings,Investment,gender,SeniorCitizen,Partner,Dependents,churn,Churn
0,768805383,Existing Customer,45,M,3,High School,Married,60 - 80,Blue,39,...,42,0.061,395572.03,32990.0,Male,0,Yes,Yes,Yes,Yes
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than 40,Blue,44,...,33,0.105,451608.84,11750.0,Female,0,No,Yes,Yes,Yes
2,713982108,Existing Customer,51,M,3,Graduate,Married,80 - 120,Blue,36,...,20,0.0,453635.75,111510.0,Male,0,Yes,Yes,Yes,Yes
4,709106358,Existing Customer,40,M,3,Uneducated,Married,60 - 80,Blue,21,...,28,0.0,529301.53,14500.0,Male,0,Yes,Yes,Yes,Yes
5,713061558,Existing Customer,44,M,2,Graduate,Married,40 - 60,Blue,36,...,24,0.311,248451.92,76650.0,Male,0,Yes,Yes,Yes,Yes


In [30]:
def generate_synthetic_features(df, distribution, feature):
    synthetic_feature = []
    
    for _, row in df.iterrows():
        features = tuple(row[common_features])
        
            # Use the distribution to sample
        sample = np.random.choice(
            distribution.columns,
            p=distribution.loc[features].values
        )
        synthetic_feature.append(sample)
            
    return synthetic_feature

# Generate synthetic features
original['PhoneService'] = generate_synthetic_features(original, phone_service_distribution, 'PhoneService')
original['InternetService'] = generate_synthetic_features(original, internet_service_distribution, 'InternetService')
original['TechSupport'] = generate_synthetic_features(original, tech_support_distribution, 'TechSupport')

original.head()


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Month_with_bank,...,Investment,gender,SeniorCitizen,Partner,Dependents,churn,Churn,PhoneService,InternetService,TechSupport
0,768805383,Existing Customer,45,M,3,High School,Married,60 - 80,Blue,39,...,32990.0,Male,0,Yes,Yes,Yes,Yes,No,Fiber optic,No
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than 40,Blue,44,...,11750.0,Female,0,No,Yes,Yes,Yes,Yes,DSL,Yes
2,713982108,Existing Customer,51,M,3,Graduate,Married,80 - 120,Blue,36,...,111510.0,Male,0,Yes,Yes,Yes,Yes,Yes,DSL,No
4,709106358,Existing Customer,40,M,3,Uneducated,Married,60 - 80,Blue,21,...,14500.0,Male,0,Yes,Yes,Yes,Yes,Yes,Fiber optic,No
5,713061558,Existing Customer,44,M,2,Graduate,Married,40 - 60,Blue,36,...,76650.0,Male,0,Yes,Yes,Yes,Yes,Yes,Fiber optic,No
