In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import uuid

# -----------------------------
# Configuration
# -----------------------------
# Fix the global NumPy random state so every stochastic draw below is repeatable.
np.random.seed(42)

# Simulation size: how many workers, and which calendar window they work in.
N_WORKERS = 50
N_DAYS = 30
START_DATE = datetime(2024, 1, 1)

# Worker identifiers W_001 .. W_050 (1-based, zero-padded to three digits).
WORKERS = [f"W_{i:03d}" for i in range(1, N_WORKERS + 1)]

# One datetime per simulated day, starting at START_DATE.
DATES = [START_DATE + timedelta(days=offset) for offset in range(N_DAYS)]

# Task complexity distribution: each level is paired with its sampling
# probability, then split into the two parallel lists used when sampling.
COMPLEXITIES, COMPLEXITY_PROBS = (
    list(column)
    for column in zip(
        ("Low", 0.5),
        ("Medium", 0.3),
        ("High", 0.2),
    )
)

# -----------------------------
# Helper functions
# -----------------------------
def generate_task_volume(lam=15, min_tasks=1):
    """
    Number of tasks completed by a worker in a day.

    Draws a single value from a Poisson distribution using the global
    NumPy random state, then floors it at ``min_tasks`` so a worker-day
    never ends up with fewer tasks than that.

    Parameters
    ----------
    lam : float, default 15
        Mean of the Poisson distribution (average tasks per day).
    min_tasks : int, default 1
        Lower bound applied to the drawn value.

    Returns
    -------
    int
        Task count for one worker-day, always >= ``min_tasks``.
    """
    # Exactly one Poisson draw per call, so the seeded random stream is
    # consumed the same way regardless of the arguments.
    drawn = int(np.random.poisson(lam=lam))
    return max(min_tasks, drawn)

def generate_task_metrics(complexity):
    """
    Sample (time taken, accuracy) for a single task of the given complexity.

    Both values are drawn from normal distributions (global NumPy random
    state) whose mean and standard deviation depend on the complexity.
    Any complexity other than "Low" or "Medium" uses the hardest-task
    parameters. The time is floored at 1 and the accuracy is clamped to
    the range [0.6, 1.0].
    """
    # (time mean, time std, accuracy mean, accuracy std) per complexity level.
    easier_profiles = {
        "Low": (5, 1, 0.97, 0.02),
        "Medium": (8, 1.5, 0.93, 0.03),
    }
    hardest_profile = (12, 2, 0.88, 0.04)
    time_mean, time_std, acc_mean, acc_std = easier_profiles.get(
        complexity, hardest_profile
    )

    # Draw order matters for reproducibility: duration first, then accuracy.
    minutes = np.random.normal(time_mean, time_std)
    score = np.random.normal(acc_mean, acc_std)

    floored_minutes = max(1, minutes)
    clamped_score = min(max(score, 0.6), 1.0)
    return floored_minutes, clamped_score

# -----------------------------
# Data generation
# -----------------------------
# Task IDs are drawn from a dedicated, seeded generator so the dataset is
# reproducible end to end. uuid.uuid4() takes its randomness from the OS and
# is not affected by np.random.seed, so it produced different IDs on every run.
# Using a separate RandomState (instead of the global one) leaves the random
# stream behind task volumes, complexities and metrics untouched.
TASK_ID_SEED = 42
task_id_rng = np.random.RandomState(TASK_ID_SEED)

# One dict per simulated task; the DataFrame is built once at the end.
records = []

for worker in WORKERS:
    for date in DATES:
        # Number of tasks this worker completes on this day.
        n_tasks = generate_task_volume()

        for _ in range(n_tasks):
            complexity = np.random.choice(COMPLEXITIES, p=COMPLEXITY_PROBS)
            time_taken, accuracy = generate_task_metrics(complexity)

            # version=4 sets the UUID version/variant bits on the 16 seeded
            # random bytes, so IDs keep the same format uuid4() produced.
            task_id = uuid.UUID(bytes=task_id_rng.bytes(16), version=4)

            records.append({
                "task_id": str(task_id),
                "worker_id": worker,
                "task_date": date,
                "task_complexity": complexity,
                "time_taken_minutes": round(time_taken, 2),
                "accuracy": round(accuracy, 3)
            })

# Create DataFrame
df = pd.DataFrame(records)

# Save dataset
df.to_csv("simulated_worker_tasks.csv", index=False)

print("Synthetic dataset generated")
print(df.head())
print(f"Total tasks generated: {len(df)}")


Synthetic dataset generated
                                task_id worker_id  task_date task_complexity  \
0  77016b53-3aa3-497a-b042-4246e5a9583e     W_001 2024-01-01             Low   
1  7f65a185-9675-4d6d-974c-c0b27805270a     W_001 2024-01-01          Medium   
2  a78aa64d-4a37-4105-baab-125b5d293c4e     W_001 2024-01-01             Low   
3  2d08d6c7-c351-40f4-af67-5a62f3a250ce     W_001 2024-01-01          Medium   
4  c236a814-63f0-4035-8e30-b4642282a1d8     W_001 2024-01-01          Medium   

   time_taken_minutes  accuracy  
0                5.28     0.990  
1                7.30     0.946  
2                4.43     0.952  
3                5.41     0.913  
4                7.36     0.908  
Total tasks generated: 22249


In [3]:
# Number of task rows recorded for each (worker, day) pair.
(
    df
    .groupby(["worker_id", "task_date"])["task_id"]
    .count()
)

worker_id  task_date 
W_001      2024-01-01    18
           2024-01-02    17
           2024-01-03    13
           2024-01-04    12
           2024-01-05    21
                         ..
W_050      2024-01-26     9
           2024-01-27    14
           2024-01-28    13
           2024-01-29    22
           2024-01-30    10
Name: task_id, Length: 1500, dtype: int64