# Synthetic data generation for campaign effectiveness

We will be making use of the data from https://www.kaggle.com/datasets/prakharrathi25/banking-dataset-marketing-targets?select=test.csv for campaign effectiveness.

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.stats import norm

In [2]:
original = pd.read_csv("../data/processed/BankChurners_cleaned.csv")
campaign = pd.read_csv("../data/raw/campaign_data.csv", sep = ';')

In [3]:
# Checking for duplicates
campaign_duplicates = campaign.duplicated()

# Display the number of duplicate rows, if any
print(f"Number of duplicate rows in 'campaign': {campaign_duplicates.sum()}")

Number of duplicate rows in 'campaign': 0


# 1. Campaign data

Data that will be added:

- `duration`: last contact duration, in seconds (numeric) --> `Duration_of_Contact`
- `campaign`: number of contacts performed during this campaign and for this client (numeric, includes last contact) --> `Number_of_Contacts_Made`
- `pdays`: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted) --> `Last_Contact_Made`
- `y`: has the client subscribed a term deposit? (binary: "yes","no") --> `Outcome`

## 1.1 Feature selection

In [4]:
# similar columns: education, marital, age
common_features = ['education', 'marital', 'age']

ori = original.copy()
ori['education'] = ori['Education_Level']
ori['education'] = ori['education'].replace(
    {'Doctorate': 'tertiary',
     'Post-Graduate': 'tertiary',
     'Graduate': 'tertiary',
     'College': 'secondary',
     'High School': 'secondary', 
     'Uneducated': 'primary'})

ori['marital'] = ori['Marital_Status']
ori['marital'] = ori['marital'].replace(
    {'Married': 'married',
     'Single': 'single',
     'Divorced': 'divorced'})

ori['age'] = ori['Customer_Age']

In [5]:
# d/b of duration
duration_db = (
    campaign.groupby(common_features)['duration']
    .value_counts(normalize = True)
    .unstack(fill_value = 0)
)

# d/b of campaign
campaign_db = (
    campaign.groupby(common_features)['campaign']
    .value_counts(normalize = True)
    .unstack(fill_value = 0)
)

# d/b of pdays
pdays_db = (
    campaign.groupby(common_features)['pdays']
    .value_counts(normalize = True)
    .unstack(fill_value = 0)
)

# d/b of y
campaign['y'] = campaign['y'].replace({'yes': 1, 'no': 0})
y_db = (
    campaign.groupby(common_features)['y']
    .mean()
    .reset_index()
)

  campaign['y'] = campaign['y'].replace({'yes': 1, 'no': 0})


## 1.2 Sampling

In [6]:
# generating synthetic features
def generate_synthetic_features(df, distribution, feature):
    synthetic_feature = []
    
    for _, row in df.iterrows():
        features = tuple(row[common_features])
        try:
            sample = np.random.choice(
                distribution.columns,
                p=distribution.loc[features].values
            )
        except KeyError: #incase the feature combination is missing
            sample = np.random.choice(distribution.columns)
            
        synthetic_feature.append(sample)
            
    return synthetic_feature

def generate_synthetic_outcome(df, distribution):
    synthetic_outcome = []
    
    for _, row in df.iterrows():
        features = tuple(row[common_features])
        matching_row = distribution.loc[(distribution[common_features] == features).all(axis=1)]
        
        if not matching_row.empty:
            prob_yes = matching_row['y'].values[0]
            sample = np.random.choice([0, 1], p=[1 - prob_yes, prob_yes])
        else:
            sample = np.random.choice([0, 1])
        
        synthetic_outcome.append(sample)
    
    return synthetic_outcome

# generating the data
np.random.seed(3101)
original['Duration_of_Contact'] = generate_synthetic_features(ori, duration_db, 'duration')
original['Num_of_Contacts_Made'] = generate_synthetic_features(ori, campaign_db, 'campaign')
original['Last_Contacted'] = generate_synthetic_features(ori, pdays_db, 'pdays')
original['Outcome'] = generate_synthetic_outcome(ori, y_db)
original.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Month_with_bank,...,Months_Inactive_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Count,Avg_Utilization_Ratio,Duration_of_Contact,Num_of_Contacts_Made,Last_Contacted,Outcome
0,768805383,Existing Customer,45,M,3,High School,Married,60 - 80,Blue,39,...,1,12691.0,777,1144,42,0.061,73,2,-1,0
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than 40,Blue,44,...,1,8256.0,864,1291,33,0.105,70,1,-1,0
2,713982108,Existing Customer,51,M,3,Graduate,Married,80 - 120,Blue,36,...,1,3418.0,0,1887,20,0.0,206,3,-1,0
3,709106358,Existing Customer,40,M,3,Uneducated,Married,60 - 80,Blue,21,...,1,4716.0,0,816,28,0.0,977,1,-1,0
4,713061558,Existing Customer,44,M,2,Graduate,Married,40 - 60,Blue,36,...,1,4010.0,1247,1088,24,0.311,15,1,9,0


# 2. Exporting to data/processed

In [7]:
features = ['CLIENTNUM', 'Duration_of_Contact', 'Num_of_Contacts_Made','Last_Contacted','Outcome']
final_df = original.loc[:,features]
final_df.to_csv('../data/processed/Campaign.csv', index=False)