In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Size of the synthetic customer population
n_customers = 30_000

# Pin NumPy's global random state so the random draws repeat across runs
np.random.seed(42)


# Generate demographic data
def generate_demographic_data(n):
    """Build a DataFrame of synthetic demographic attributes for ``n`` customers.

    Values are drawn from NumPy's global random state, in this order: age,
    gender, marital status, education, income. The order matters for
    reproducibility under a fixed seed.

    Columns returned: ``age``, ``gender``, ``marital_status``,
    ``education``, ``annual_income``.
    """
    columns = {}

    # Age: normal(mean=35, sd=12), truncated to whole years, limited to 18..80
    raw_age = np.random.normal(loc=35, scale=12, size=n)
    columns['age'] = raw_age.astype(int).clip(18, 80)

    # Gender: both labels equally likely
    columns['gender'] = np.random.choice(['Male', 'Female'], size=n)

    # Marital status: 40% single, 60% married
    columns['marital_status'] = np.random.choice(
        ['Single', 'Married'], size=n, p=[0.4, 0.6]
    )

    # Education level with fixed category weights
    education_levels = ['High School', 'Bachelors', 'Masters', 'PhD']
    education_weights = [0.3, 0.4, 0.2, 0.1]
    columns['education'] = np.random.choice(
        education_levels, size=n, p=education_weights
    )

    # Annual income (described as "in thousands" by the original comment):
    # log-normal draw scaled by 10, then truncated to an integer
    log_income = np.random.normal(loc=4, scale=0.5, size=n)
    columns['annual_income'] = (np.exp(log_income) * 10).astype(int)

    return pd.DataFrame(columns)


# Generate purchase data with Poisson distribution
def generate_purchase_data(n, reference_date=None):
    """Build a DataFrame of synthetic purchase behaviour for ``n`` customers.

    Parameters
    ----------
    n : int
        Number of customer rows to generate.
    reference_date : datetime-like, optional
        The "today" that ``last_purchase_date`` is counted back from.
        Defaults to the current time, which matches the previous behaviour
        but makes the date column differ between runs even with a fixed
        random seed. Pass a fixed date to get fully reproducible output.

    Returns
    -------
    pandas.DataFrame
        Columns: ``annual_purchases``, ``avg_purchase_amount``,
        ``online_purchase_ratio``, ``last_purchase_date``,
        ``loyalty_score``, ``return_rate``.
    """
    # Annual purchase frequency - Poisson with mean 12 (avg one purchase per month)
    purchase_frequency = np.random.poisson(12, n)

    # Average purchase amount (in dollars): log-normal draw scaled by 10,
    # truncated to an integer
    avg_purchase_amount = (np.exp(np.random.normal(5, 0.6, n)) * 10).astype(int)

    # Online purchase ratio: symmetric Beta(5, 5), values in [0, 1]
    online_ratio = np.random.beta(5, 5, n)

    # Days since last purchase - Poisson with mean 30 days
    last_purchase = np.random.poisson(30, n)
    # Anchor on the caller's date when given so the column is reproducible;
    # otherwise fall back to "now" as before.
    anchor = pd.Timestamp(datetime.now() if reference_date is None else reference_date)
    last_purchase_date = anchor - pd.to_timedelta(last_purchase, unit='D')

    # Loyalty score (0-100) based on purchase frequency and recency.
    # NOTE(review): with the means used above (30 days, 12 purchases) the
    # expected raw score is 100 - 15 + 24 = 109, so the upper clip applies
    # to a typical customer. Confirm whether this saturation is intended.
    loyalty_score = np.clip(100 - last_purchase * 0.5 + purchase_frequency * 2, 0, 100)

    # Return rate (%): Poisson(1) returns relative to purchases.
    # A customer with no purchases cannot have returned anything, so the
    # rate is 0 there instead of dividing by a floored divisor of 1.
    returns = np.random.poisson(1, n)
    return_share = np.where(
        purchase_frequency > 0,
        returns / np.maximum(purchase_frequency, 1),
        0.0,
    )
    return_rate = np.clip(return_share * 100, 0, 100)

    return pd.DataFrame({
        'annual_purchases': purchase_frequency,
        'avg_purchase_amount': avg_purchase_amount,
        'online_purchase_ratio': online_ratio.round(2),
        'last_purchase_date': last_purchase_date,
        'loyalty_score': loyalty_score.astype(int),
        'return_rate': return_rate.round(1)
    })


# Generate website behavior data with Poisson
def _percent_of_visits(events, visits):
    """Return ``events`` as a percentage of ``visits``, limited to [0, 100].

    Rows with zero visits get 0: dividing by a floored divisor of 1 would
    otherwise report a rate for customers who never visited.
    """
    share = np.where(visits > 0, events / np.maximum(visits, 1), 0.0)
    return np.clip(share * 100, 0, 100)


def generate_website_behavior(n):
    """Build a DataFrame of synthetic website-usage metrics for ``n`` customers.

    Values are drawn from NumPy's global random state in this order: visit
    duration, monthly visits, ad clicks, conversions.

    Returns
    -------
    pandas.DataFrame
        Columns: ``avg_visit_duration``, ``monthly_visits``,
        ``conversion_rate``, ``ad_click_rate``. Both rates are percentages
        of monthly visits, rounded to one decimal, and are 0 for customers
        with no visits.
    """
    # Average visit duration (minutes) - Poisson with mean 15
    avg_visit_duration = np.random.poisson(15, n)

    # Monthly visits - Poisson with mean 8
    monthly_visits = np.random.poisson(8, n)

    # Ad clicks - Poisson with mean 3, expressed relative to visits
    ad_clicks = np.random.poisson(3, n)
    ad_click_rate = _percent_of_visits(ad_clicks, monthly_visits)

    # Conversions - Poisson with mean 2, expressed relative to visits
    conversions = np.random.poisson(2, n)
    conversion_rate = _percent_of_visits(conversions, monthly_visits)

    return pd.DataFrame({
        'avg_visit_duration': avg_visit_duration,
        'monthly_visits': monthly_visits,
        'conversion_rate': conversion_rate.round(1),
        'ad_click_rate': ad_click_rate.round(1)
    })


# Produce the three feature tables, one row per customer
demographic_data = generate_demographic_data(n_customers)
purchase_data = generate_purchase_data(n_customers)
website_data = generate_website_behavior(n_customers)

# Zero-padded identifiers: CUST_00000, CUST_00001, ...
customer_id = pd.Series(
    [f'CUST_{i:05d}' for i in range(n_customers)],
    name='customer_id',
).to_frame()

# Place the tables side by side, with the ID column first
final_df = pd.concat(
    [customer_id, demographic_data, purchase_data, website_data],
    axis='columns',
)

# Write the combined table to disk without the row index
final_df.to_csv('customer_data_poisson.csv', index=False)

# Report the row count and descriptive statistics for the Poisson columns
print("\nDataset Summary Statistics:")
print("\nTotal customers:", len(final_df))
for _heading, _column in (
    ("\nAnnual Purchases Statistics:", 'annual_purchases'),
    ("\nMonthly Visits Statistics:", 'monthly_visits'),
    ("\nAverage Visit Duration Statistics:", 'avg_visit_duration'),
):
    print(_heading)
    print(final_df[_column].describe().round(1))


Dataset Summary Statistics:

Total customers: 30000

Annual Purchases Statistics:
count    30000.0
mean        12.0
std          3.5
min          1.0
25%         10.0
50%         12.0
75%         14.0
max         31.0
Name: annual_purchases, dtype: float64

Monthly Visits Statistics:
count    30000.0
mean         8.0
std          2.8
min          0.0
25%          6.0
50%          8.0
75%         10.0
max         21.0
Name: monthly_visits, dtype: float64

Average Visit Duration Statistics:
count    30000.0
mean        15.0
std          3.9
min          2.0
25%         12.0
50%         15.0
75%         18.0
max         33.0
Name: avg_visit_duration, dtype: float64
