### Demographic Synthetic Data Generated for Campaign System

In [1]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

In [None]:

## For simplicity, take a reproducible random sample of 4000 customers and reuse
## their recommendation results from subgroup_B qn 2. The 'Cluster' column is
## not needed for picking the recommended product, so it is dropped.
product = (
    pd.read_csv('../../data/predictions/BQ1.csv')
    .sample(4000, random_state=3101)
    .drop(columns='Cluster')
)
def get_highest_probability_product(df):
    """
    Pick, for every client, the product column that holds the largest value.

    Parameters:
    df (DataFrame): Must contain a 'CLIENTNUM' column; every other column is
        treated as a product and its values are compared row by row.

    Returns:
    DataFrame: Two columns, 'CLIENTNUM' and 'product' (the name of the column
        with the row-wise maximum), carrying the same index as ``df``.
    """
    # Everything except the client identifier takes part in the comparison.
    product_scores = df.drop(columns='CLIENTNUM')

    # idxmax(axis=1) yields the column label of the maximum in each row.
    best_product = product_scores.idxmax(axis=1)

    return df[['CLIENTNUM']].assign(product=best_product)
# Reduce the sampled predictions to one recommended product per client.
product = get_highest_probability_product(product)
df = product.reset_index(drop=True)

## We generate a campaign dataset in which every client is targeted by the
## campaign type that matches their recommended product.
product_to_campaign_type = {
    'credit_cards': 'Consideration',
    'savings_accounts': 'Consideration',
    'investments': 'Conversion',
    'personal_insurance': 'Conversion',
    'commercial_insurance': 'Conversion',
    'personal_loans': 'Conversion',
    'commercial_loans': 'Conversion'
}

# Assign campaign IDs for each campaign type
campaign_type_to_id = {
    'Conversion': 'C1',
    'Retention': 'C2',
    'Consideration': 'C3'
}

# Different campaigns have different durations (in days). Previously the
# 30-day 'Consideration' branch was unreachable because the preceding
# condition already matched 'Consideration'; a lookup table makes each
# duration explicit. Campaign types not listed here get a zero-length
# campaign (End_Date == Start_Date), as before.
campaign_duration_days = {
    'Conversion': 60,
    'Consideration': 30,
}

# Per campaign type, the weights used to draw ResponseStatus from
# ['Success', 'Rejected', 'Unknown'] (in that order).
response_weight = {'Conversion': [0.05, 0.10, 0.85],
                  'Retention': [0.15, 0.20, 0.65],
                  'Consideration': [0.10, 0.15, 0.75]}

# Take a single launch timestamp so that every row shares the same Start_Date
# and End_Date - Start_Date is exactly the configured duration.
campaign_start = datetime.now()

# Create a campaign database DataFrame
campaign_data = []
# Iterate over the two needed columns directly; this is cheaper than iterrows()
# and keeps the name `product` bound to the DataFrame above.
for clientnum, recommended_product in zip(df['CLIENTNUM'], df['product']):
    campaign_type = product_to_campaign_type[recommended_product]
    campaign_id = campaign_type_to_id[campaign_type]
    end_date = campaign_start + timedelta(days=campaign_duration_days.get(campaign_type, 0))

    campaign_entry = {
        'CLIENTNUM': clientnum,
        'Campaign_ID': campaign_id,
        'Campaign_Type': campaign_type,
        'Product': recommended_product,
        'Start_Date': campaign_start,
        'End_Date': end_date,
        'ResponseStatus': random.choices(['Success', 'Rejected', 'Unknown'], weights=response_weight[campaign_type])[0],
        'NumberOfImpressions': random.randint(100, 1000),
        'NumberOfClicks': random.randint(10, 100)
    }
    campaign_data.append(campaign_entry)

campaign_df = pd.DataFrame(campaign_data)
def create_campaign_log(campaign_database):
    """
    Creates a campaign log by grouping the campaign database by 'Campaign_ID' and
    calculating total impressions, total clicks, successful responses and the
    derived rates for each campaign.

    Parameters:
    campaign_database (DataFrame): The original campaign database. Must contain the
        columns 'Campaign_ID', 'NumberOfImpressions', 'NumberOfClicks' and
        'ResponseStatus'.

    Returns:
    DataFrame: One row per campaign with the columns 'Campaign_ID',
        'TotalImpressions', 'TotalClicks', 'TotalResponses' (count of rows whose
        ResponseStatus is 'Success'), 'ConversionRate'
        (TotalResponses / TotalImpressions), 'ClickThroughRate'
        (TotalClicks / TotalImpressions), plus the default settings
        'ChosenChannel' ('Email'), 'ChosenFrequency' (1) and 'ChosenTiming' (6).
        Both rates are 0 for a campaign with zero total impressions.
    """
    # Group by Campaign_ID and aggregate the number of impressions, clicks, and responses
    campaign_log = campaign_database.groupby('Campaign_ID').agg(
        TotalImpressions=('NumberOfImpressions', 'sum'),
        TotalClicks=('NumberOfClicks', 'sum'),
        TotalResponses=('ResponseStatus', lambda status: (status == 'Success').sum())
    ).reset_index()

    # Mask zero denominators with NaN before dividing. A plain division followed by
    # fillna(0) only catches 0/0; a positive numerator over 0 yields inf, which
    # fillna does not replace. With the mask every zero-impression campaign
    # produces NaN, which is then turned into a rate of 0.
    impressions = campaign_log['TotalImpressions']
    nonzero_impressions = impressions.where(impressions != 0)

    # Calculate conversion rate and click-through rate (CTR)
    campaign_log['ConversionRate'] = (campaign_log['TotalResponses'] / nonzero_impressions).fillna(0)
    campaign_log['ClickThroughRate'] = (campaign_log['TotalClicks'] / nonzero_impressions).fillna(0)

    # Add default channel, frequency, and timing (scalars broadcast to every row)
    campaign_log['ChosenChannel'] = 'Email'
    campaign_log['ChosenFrequency'] = 1
    campaign_log['ChosenTiming'] = 6

    return campaign_log

# Aggregate the campaign rows built so far into one summary row per Campaign_ID.
campaign_log = create_campaign_log(campaign_df)
campaign_log  # bare expression: shows the aggregated log when run as a notebook cell
# Function to generate synthetic campaign data
def generate_synthetic_campaign_data(num_rows, campaign_type, campaign_id):
    """
    Generate random campaign rows for a single campaign.

    Parameters:
    num_rows (int): Number of rows to generate.
    campaign_type (str): Campaign type; must be a key of the module-level
        ``response_weight`` mapping. 'Consideration' campaigns last 90 days,
        every other type lasts 30 days.
    campaign_id (str): Identifier written to the 'Campaign_ID' column of every row.

    Returns:
    DataFrame: ``num_rows`` rows with the columns 'CLIENTNUM', 'Campaign_ID',
        'Campaign_Type', 'Product', 'Start_Date', 'End_Date', 'ResponseStatus',
        'NumberOfImpressions' and 'NumberOfClicks'.

    Raises:
    KeyError: If ``campaign_type`` has no entry in ``response_weight``.
    """
    # Loop-invariant values are computed once up front.
    weights = response_weight[campaign_type]
    duration_days = 90 if campaign_type == 'Consideration' else 30

    # Use one timestamp for the whole campaign. Calling datetime.now() separately
    # for the start and end of every row made End_Date - Start_Date drift from the
    # intended duration and gave each row a slightly different start.
    start_date = datetime.now()
    end_date = start_date + timedelta(days=duration_days)

    # The random draws inside the dict literal happen in the same order as before
    # (client number, product, response status, impressions, clicks), so a seeded
    # `random` module produces the same values.
    data = [
        {
            'CLIENTNUM': random.randint(100000000, 999999999),  # Random 9-digit client number
            'Campaign_ID': campaign_id,
            'Campaign_Type': campaign_type,
            'Product': random.choice(['credit_cards', 'savings_accounts', 'personal_loans']),  # Random product example
            'Start_Date': start_date,
            'End_Date': end_date,
            'ResponseStatus': random.choices(['Success', 'Rejected', 'Unknown'], weights=weights)[0],
            'NumberOfImpressions': random.randint(100, 1000),
            'NumberOfClicks': random.randint(10, 100)
        }
        for _ in range(num_rows)
    ]
    return pd.DataFrame(data)

# Synthesize extra rows for the Retention (C2) and Consideration (C3) campaigns.
retention_df = generate_synthetic_campaign_data(2000, 'Retention', 'C2')
consideration_df = generate_synthetic_campaign_data(4000, 'Consideration', 'C3')

# Append the synthetic rows to the existing campaign rows and re-aggregate the log.
combined_df = pd.concat([retention_df, consideration_df], ignore_index=True)
campaign_df = pd.concat([campaign_df, combined_df], ignore_index=True)
campaign_log = create_campaign_log(campaign_df)
campaign_log

# Reproducible sample of 10000 customers from the processed behaviour data.
demographic = pd.read_csv(
    "../../data/processed/banking_behaviour_preference.csv"
).sample(10000, random_state=3101)

### Generating transaction data to compute a real RFM score takes too long,
### so recency is drawn at random instead.
# Recency: random days since last interaction, scaled by the maximum and inverted
# so that more recent interactions score higher.
days_since_last_interaction = np.random.randint(1, 365, 10000)
demographic['Recency'] = 1 - (days_since_last_interaction / days_since_last_interaction.max())

# Frequency and Monetary: each source column divided by its own maximum.
for score_column, source_column in (
    ('Frequency', 'Total_Trans_Count'),
    ('Monetary', 'Total_Trans_Amt'),
):
    demographic[score_column] = demographic[source_column] / demographic[source_column].max()

# Calculate RFM engagement score as a weighted sum of the three components
w_R, w_F, w_M = 0.4, 0.3, 0.3
recency_term = w_R * demographic['Recency']
frequency_term = w_F * demographic['Frequency']
monetary_term = w_M * demographic['Monetary']
demographic['engagement_score'] = recency_term + frequency_term + monetary_term

# Repeat each campaign's summary row so the expanded log has one row per sampled
# customer: 4000 x C1, 2000 x C2, 4000 x C3 (10000 rows in total).
c1_df, c2_df, c3_df = (
    pd.concat([campaign_log[campaign_log['Campaign_ID'] == campaign_id]] * copies, ignore_index=True)
    for campaign_id, copies in (('C1', 4000), ('C2', 2000), ('C3', 4000))
)

# Combine the expanded DataFrames
campaign_log = pd.concat([c1_df, c2_df, c3_df], ignore_index=True)

# Copy the campaign metrics and default settings onto the customers by position
# (.values drops the index, so rows are matched purely by order).
for campaign_column in (
    'ClickThroughRate',
    'ConversionRate',
    'ChosenTiming',
    'ChosenFrequency',
    'ChosenChannel',
):
    demographic[campaign_column] = campaign_log[campaign_column].values


In [None]:
# Write the sampled customers, now carrying the RFM/engagement columns and the
# per-campaign metrics and settings, to CSV without the DataFrame index.
demographic.to_csv("../../data/processed/demographic.csv", index = False)