In [2]:
from pathlib import Path

import numpy as np
import pandas as pd


In [3]:

# Simulate a regression-discontinuity style employee sales dataset.
# The order of the random draws below (skill, Year 1 noise, part-time coin
# flips, Year 2 noise) determines the generated values, so keep it stable.

# Seed NumPy's global RNG so every run reproduces the same draws.
np.random.seed(42)

# Number of simulated employees
n = 1000

# Step 1: Latent skill, which feeds both the Year 1 and the Year 2 outcome.
# Drawn from a normal with mean 60 and sd 20, then clamped to [0, 100].
skill = np.clip(np.random.normal(loc=60, scale=20, size=n), 0, 100)

# Step 2: Year 1 sales = half of skill plus normal noise (sd 5),
# clamped to [0, 50].
year1_noise = np.random.normal(0, 5, n)
year1_sales = np.clip(0.5 * skill + year1_noise, 0, 50)

# Step 2.5: Part-time status. A fair coin is flipped for every employee (the
# RNG stream advances by n draws regardless), but the flip only counts for
# employees with Year 1 sales below 30; everyone else is full-time (0).
part_time_flips = np.random.choice([0, 1], size=n, p=[0.5, 0.5])
part_time = np.where(year1_sales >= 30, 0, part_time_flips)

# Step 3: Treatment assignment. Employees at or above the Year 1 sales cutoff
# receive training (1); the rest do not (0).
cutoff = 40
treatment = np.greater_equal(year1_sales, cutoff).astype(int)

# Step 4: Year 2 sales = skill-driven component + training bump + noise.
# The bump added for trained employees is `treatment_effect` (20 units).
treatment_effect = 20
year2_noise = np.random.normal(0, 5, n)
year2_sales = 0.5 * skill + treatment * treatment_effect + year2_noise

# Probation rule: employees whose Year 1 sales fell below the probation
# cutoff keep only half of the Year 2 sales they would otherwise have had.
probation_cutoff = 20
on_probation = year1_sales < probation_cutoff
year2_sales = np.where(on_probation, 0.5, 1.0) * year2_sales

# Assemble one row per employee; EmployeeID runs from 1 to n.
employee_columns = {
    "EmployeeID": np.arange(n) + 1,
    "Skill": skill,
    "Year1_Sales": year1_sales,
    "Training": treatment,
    "Year2_Sales": year2_sales,
    "Part_Time": part_time,
}
df = pd.DataFrame(employee_columns)


# Save the DataFrame to a CSV file (without the index column).
# Create the target directory first: pandas raises OSError when asked to
# write into a directory that does not exist, which would break a fresh
# Restart & Run All on a checkout that has no ../data folder yet.
output_path = Path("../data/rd_employee_sales_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)