# 🎯 Synthetic Influencer Marketing Data Generator

**📚 Introduction**
This Jupyter Notebook script creates four realistic CSV datasets—`influencers.csv`, `posts.csv`, `tracking_data.csv`, and `payouts.csv`—to simulate an end-to-end influencer marketing campaign.

- **Realistic Distributions**: Uneven campaign weights and platform splits
- **Performance Mix**: 90% high performers (ROAS 4×–6×) + 10% loss-makers (ROAS 0.5×–0.9×)
- **Scaled Volumes**: Reach, order, and revenue figures are scaled down (to roughly one-tenth) to keep the dataset lean
- **Timeframe**: March 1 → June 30, 2025

**🚀 Purpose & Workflow**
1. **Configure** core parameters (counts, date range, weights)  
2. **Generate** influencer master table  
3. **Create** posts table (reach & engagement)  
4. **Simulate** tracking events (orders & revenue) with 10% sampling  
5. **Calculate** payouts to match target ROAS ranges  
6. **Export** as CSVs for downstream analytics

In [None]:
# 1️⃣ SETUP: Imports & Random Seeds
import pandas as pd
import numpy as np
from faker import Faker
import random

# Create the Faker instance, then pin Faker, NumPy and the stdlib `random`
# module to one shared seed for reproducibility
SEED = 42
fake = Faker()
for seed_source in (Faker.seed, np.random.seed, random.seed):
    seed_source(SEED)

# 2️⃣ CONFIGURATION: Parameters & Performance Mix
# Keep original counts but scale numeric volumes
n_influencers = 100  # total influencers
n_posts = 1000  # total posts
dates = ("2025-03-01", "2025-06-30")  # range for posts & orders

# Campaigns & their realistic uneven weights (paired by position)
campaigns = ['HKFitBlast', 'MBProteinPush', 'GritzoKidsRun']
campaign_wts = [0.50, 0.30, 0.20]
# Product catalog
products = ['Whey Protein', 'Multivitamin', 'Creatine', 'Omega 3']
# Influencer attributes
platforms = ['Instagram', 'YouTube', 'Twitter']
categories = ['Fitness', 'Lifestyle', 'Beauty', 'Tech']
genders = ['Male', 'Female', 'Other']
tiers = ['Micro', 'Macro', 'Celebrity']

# 10% of influencers randomly chosen as loss-makers 🎭
poor_ids = set(random.sample(range(1, n_influencers + 1), int(n_influencers * 0.10)))

# Convert date strings to pandas timestamps.
# The assignment must live on its own line: the posts and tracking sections
# both pass start_date / end_date to fake.date_between.
start_date, end_date = pd.to_datetime(dates[0]), pd.to_datetime(dates[1])

# 3️⃣ INFLUENCER MASTER TABLE
# One row per influencer: ID, name, category, gender, follower_count, platform, tier.
# Columns are drawn one at a time, in table order, so the random streams are
# consumed in a fixed sequence.
influencer_ids = range(1, n_influencers + 1)
influencer_names = [fake.name() for _ in influencer_ids]
influencer_categories = np.random.choice(categories, n_influencers)
influencer_genders = np.random.choice(genders, n_influencers)
# Followers are uniform integers in [10_000, 1_000_000)
influencer_followers = np.random.randint(10_000, 1_000_000, n_influencers)
influencer_platforms = np.random.choice(platforms, n_influencers)
# Tier is drawn independently of follower_count
influencer_tiers = np.random.choice(tiers, n_influencers)

influencers = pd.DataFrame({
    'ID': influencer_ids,
    'name': influencer_names,
    'category': influencer_categories,
    'gender': influencer_genders,
    'follower_count': influencer_followers,
    'platform': influencer_platforms,
    'tier': influencer_tiers,
})

# 4️⃣ POSTS TABLE (Reach & Engagement)
# One row per post. Each post is assigned to a random influencer; its platform
# is drawn independently of that influencer's own platform column.
post_columns = {}
post_columns['influencer_id'] = np.random.choice(influencers['ID'], n_posts)
post_columns['platform'] = np.random.choice(platforms, n_posts)
post_columns['date'] = [fake.date_between(start_date, end_date) for _ in range(n_posts)]
post_columns['URL'] = [fake.url() for _ in range(n_posts)]
post_columns['caption'] = [fake.sentence(nb_words=8) for _ in range(n_posts)]
# Engagement metrics, scaled down by ~1/10th
post_columns['reach'] = np.random.randint(1_500, 10_000, n_posts)
post_columns['likes'] = np.random.randint(10, 500, n_posts)
post_columns['comments'] = np.random.randint(1, 20, n_posts)
posts = pd.DataFrame(post_columns)

# Sum reach per influencer; the tracking simulation sizes order volume from it.
# Influencers that received no posts do not appear in this table.
total_reach = (
    posts.groupby('influencer_id', as_index=False)['reach']
    .sum()
    .rename(columns={'reach': 'total_reach'})
)

# 5️⃣ TRACKING DATA (Orders & Revenue)
# Simulate 1.5%–5% conversion, then sample 10% of orders
sample_frac = 0.10

# Loop-invariant lookups, built once instead of once per simulated order
price_multiplier = {'Whey Protein': 1.2, 'Creatine': 1.1, 'Multivitamin': 1.0, 'Omega 3': 0.9}
sources = ['SwipeUp', 'BioLink', 'PromoCode']
# Explicit schema so tracking_data keeps its columns even when no rows are produced
tracking_cols = ['source', 'campaign', 'influencer_id', 'user_id',
                 'product', 'date', 'orders', 'revenue']

tracking = []
# itertuples preserves each column's dtype; iterrows coerces a row to one common dtype
reach_pairs = total_reach[['influencer_id', 'total_reach']].itertuples(index=False, name=None)
for inf_id, reach_val in reach_pairs:
    conv_rate = np.random.uniform(0.015, 0.05)
    # Every influencer with at least one post gets at least one order
    orders_est = max(1, int(reach_val * conv_rate * sample_frac))

    for _ in range(orders_est):
        camp = random.choices(campaigns, weights=campaign_wts)[0]
        prod = random.choice(products)
        # Price drawn from 800–3000, then scaled to one-tenth
        base_price = np.random.uniform(800, 3000) * 0.1
        rev = round(base_price * price_multiplier[prod], 2)

        # One row per order, so 'orders' is always 1
        tracking.append({
            'source': random.choice(sources),
            'campaign': camp,
            'influencer_id': inf_id,
            'user_id': fake.uuid4(),
            'product': prod,
            'date': fake.date_between(start_date, end_date),
            'orders': 1,
            'revenue': rev
        })

tracking_data = pd.DataFrame(tracking, columns=tracking_cols)

# 6️⃣ PAYOUTS (Cost Calculation)
# For each influencer, back out a payout from tracked revenue and a target ROAS:
# loss-makers (poor_ids) target 0.5–0.9, everyone else 4–6.
payout_rows = []
for inf in influencers['ID']:
    inf_orders = tracking_data[tracking_data['influencer_id'] == inf]
    ord_count = inf_orders.shape[0]
    rev_sum = inf_orders['revenue'].sum()
    post_count = (posts['influencer_id'] == inf).sum()

    # Target ROAS depends only on membership in poor_ids
    if inf in poor_ids:
        roas = np.random.uniform(0.5, 0.9)
    else:
        roas = np.random.uniform(4, 6)

    # Payout = revenue / ROAS; influencers without tracked orders are paid nothing
    if ord_count:
        cost = rev_sum / roas
    else:
        cost = 0

    # Pick the payout basis at random. A per-order rate needs at least one order;
    # otherwise the rate is spread over posts (guarding against zero posts).
    basis = random.choice(['order', 'post'])
    if basis == 'order' and ord_count:
        rate = cost / ord_count
    else:
        rate = cost / max(1, post_count)

    payout_rows.append({
        'influencer_id': inf,
        'basis': basis,
        'rate': round(rate, 2),
        'orders': ord_count,
        'total_payout': round(cost, 2),
    })

payouts = pd.DataFrame(payout_rows)

# 7️⃣ EXPORT CSVs
# Write each table to the working directory, omitting the pandas index column
csv_exports = {
    'influencers.csv': influencers,
    'posts.csv': posts,
    'tracking_data.csv': tracking_data,
    'payouts.csv': payouts,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)

print('✅ All CSVs generated successfully! 🎉')