# 1. Business Understanding

1. What relevant key metrics are provided to evaluate the CTA combinations? And which CTA Copy and CTA Placement did best/worst based on the key metrics? - The main metric provided to evaluate the CTA combinations is click-through rate (CTR): the share of impressions on which the user clicked the CTA and visited the website. A higher CTR means a combination is more effective at getting users to click, so it lets us compare the CTA combinations directly. Other key metrics are submittedForm, scheduledAppointment, and revenue, as these let us evaluate the CTA combinations by what the clicks lead to (a submitted form, a scheduled appointment, and revenue), not just by how many clicks occur. The best- and worst-performing combinations for each metric are computed in the cell below.

In [None]:
import pandas as pd
import numpy as np

# CTA performance summary: one row of metrics per (ctaCopy, ctaPlacement)
# combination, followed by a best/worst report for each metric.

GROUP_KEYS = ['ctaCopy', 'ctaPlacement']

# (metrics column, title printed in the report, tag printed inside the
#  parentheses, format applied to the value)
REPORT_METRICS = [
    ('CTR', 'CTR', 'CTR', '{:.4f}'),
    ('Appointment_Rate', 'Appointment Rate', 'Rate', '{:.4f}'),
    ('Revenue_per_Impression', 'Revenue per Impression', 'Revenue', '${:.2f}'),
    ('Revenue_per_Click', 'Revenue per Click', 'Revenue', '${:.2f}'),
]


def compute_cta_metrics(df):
    """Return one row of performance metrics per CTA combination.

    Args:
        df: DataFrame with the columns ``ctaCopy``, ``ctaPlacement``,
            ``clickedCTA``, ``revenue`` and ``scheduledAppointment`` and/or
            ``submittedForm``; every row is treated as one impression.

    Returns:
        DataFrame with the columns ctaCopy, ctaPlacement, CTR,
        Appointment_Rate, Total_Revenue, Mean_Revenue, Impressions,
        Revenue_per_Impression, Clicks and Revenue_per_Click.
        ``Revenue_per_Click`` is NaN for combinations without any click.
    """
    # Prefer the dedicated appointment column; fall back to submittedForm
    # (the earlier proxy) only when the data set does not provide it.
    appointment_col = (
        'scheduledAppointment'
        if 'scheduledAppointment' in df.columns
        else 'submittedForm'
    )

    # Flag clicks once so the click count comes out of the same aggregation
    # pass as the other metrics instead of a separate filter + merge.
    flagged = df.assign(_is_click=df['clickedCTA'].eq(1))

    summary = (
        flagged.groupby(GROUP_KEYS)
        .agg(
            CTR=('clickedCTA', 'mean'),
            Appointment_Rate=(appointment_col, 'mean'),
            Total_Revenue=('revenue', 'sum'),
            Mean_Revenue=('revenue', 'mean'),
            Impressions=('clickedCTA', 'size'),  # row count, NaNs included
            Clicks=('_is_click', 'sum'),
        )
        .reset_index()
    )

    summary['Revenue_per_Impression'] = (
        summary['Total_Revenue'] / summary['Impressions']
    )
    # A zero click count becomes NaN so the ratio is "undefined", not inf.
    summary['Revenue_per_Click'] = (
        summary['Total_Revenue'] / summary['Clicks'].replace(0, np.nan)
    )

    return summary[GROUP_KEYS + [
        'CTR', 'Appointment_Rate', 'Total_Revenue', 'Mean_Revenue',
        'Impressions', 'Revenue_per_Impression', 'Clicks', 'Revenue_per_Click',
    ]]


def _extreme_row(metrics, column, highest):
    """Return the row with the largest/smallest non-NaN ``column`` value.

    Returns ``None`` when the table is empty or the column holds only NaN.
    Ties resolve to the first row, matching ``idxmax``/``idxmin``.
    """
    values = metrics[column].to_numpy(dtype=float)
    if np.isnan(values).all():  # also true for an empty table
        return None
    position = np.nanargmax(values) if highest else np.nanargmin(values)
    return metrics.iloc[position]


def extreme_lines(metrics, highest=True):
    """Build the report lines naming the best (or worst) combination.

    Args:
        metrics: table produced by ``compute_cta_metrics``.
        highest: ``True`` for the "Highest ..." lines, ``False`` for the
            "Lowest ..." lines.

    Returns:
        One string per entry of ``REPORT_METRICS``. A metric without any
        usable value yields an "n/a (no data)" line instead of raising.
    """
    prefix = 'Highest' if highest else 'Lowest'
    lines = []
    for column, title, tag, fmt in REPORT_METRICS:
        row = _extreme_row(metrics, column, highest)
        if row is None:
            lines.append(f"{prefix} {title}: n/a (no data)")
            continue
        lines.append(
            f"{prefix} {title}: {row['ctaCopy']} - {row['ctaPlacement']} "
            f"({tag}: {fmt.format(row[column])})"
        )
    return lines


# In a notebook kernel __name__ is "__main__", so this runs as a normal cell
# and still leaves `train_df` and `metrics` defined for later cells.
if __name__ == "__main__":
    # Load the data
    train_df = pd.read_csv('train.csv')

    # Compute metrics for each CTA combination
    metrics = compute_cta_metrics(train_df)

    # Display the metrics
    print("Metrics for each CTA combination:")
    print("=" * 80)
    print(metrics[['ctaCopy', 'ctaPlacement', 'CTR', 'Appointment_Rate',
                   'Revenue_per_Impression', 'Revenue_per_Click']].to_string(index=False))
    print("\n")

    # Identify best and worst performing combinations
    print("Best-performing combination(s):")
    print("=" * 80)
    print("\n".join(extreme_lines(metrics, highest=True)))

    print("\n")
    print("Worst-performing combination(s):")
    print("=" * 80)
    print("\n".join(extreme_lines(metrics, highest=False)))

2. Which groups of people tend to be more correlated or less correlated with our key metrics?

3. In what ways can you manipulate the columns/dataset to create features that increase predictive power for our key metric?

4. Besides Log Loss, what other metrics will you use to evaluate the model's performance, and why?

# 2. Exploratory Data Analysis

# 3. Baseline Model

# 4. Iteration 1: Feature Engineering

# 5. Iteration 2: Model Improvement

# 6. Final Model Selection

# 7. Test Predictions