In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

In [2]:
NUM_WORKERS = 50
TOTAL_TASKS = 6000
START_DATE = pd.to_datetime("2024-01-01")

In [3]:
workers = [f"W{str(i).zfill(3)}" for i in range(1, NUM_WORKERS + 1)]
task_complexities = ["Low", "Medium", "High"]

In [4]:
# Worker availability (some work more than others)
worker_weights = np.random.dirichlet(np.ones(NUM_WORKERS), size=1)[0]

# Complexity-based parameters
complexity_time = {
    "Low": (4, 1),
    "Medium": (7, 1.5),
    "High": (12, 2)
}

complexity_accuracy = {
    "Low": 0.97,
    "Medium": 0.93,
    "High": 0.88
}

# Worker skill variation
worker_skill = {
    w: np.random.uniform(0.85, 1.1) for w in workers
}

In [5]:
# Track experience per worker (for learning curve)
worker_task_count = {w: 0 for w in workers}

# -----------------------------
# Generate Tasks
# -----------------------------
rows = []

for task_num in range(1, TOTAL_TASKS + 1):
    task_id = f"T{task_num:06d}"

    # Assign worker probabilistically
    worker = np.random.choice(workers, p=worker_weights)
    worker_task_count[worker] += 1

    task_date = START_DATE + pd.Timedelta(days=np.random.randint(0, 180))

    complexity = np.random.choice(
        task_complexities, p=[0.4, 0.4, 0.2]
    )

    # Learning curve effect
    experience = worker_task_count[worker]
    learning_factor = 1 / (1 + np.log1p(experience))

    base_time, time_std = complexity_time[complexity]
    time_taken = np.random.normal(
        base_time * worker_skill[worker] * (1 + learning_factor),
        time_std
    )

    base_accuracy = complexity_accuracy[complexity]
    accuracy = np.clip(
        np.random.normal(
            base_accuracy + (0.05 * (1 - learning_factor)),
            0.02
        ),
        0.75,
        1.0
    )

    rows.append([
        task_id,
        worker,
        task_date,
        complexity,
        round(max(time_taken, 1), 2),
        round(accuracy, 3)
    ])

In [6]:
df = pd.DataFrame(
    rows,
    columns=[
        "task_id",
        "worker_id",
        "task_date",
        "task_complexity",
        "time_taken_minutes",
        "accuracy"
    ]
)

df

Unnamed: 0,task_id,worker_id,task_date,task_complexity,time_taken_minutes,accuracy
0,T000001,W002,2024-02-21,Medium,10.45,0.959
1,T000002,W034,2024-05-22,Medium,8.43,0.940
2,T000003,W046,2024-02-14,Low,6.09,0.971
3,T000004,W010,2024-03-24,High,20.52,0.904
4,T000005,W044,2024-05-08,Low,7.57,0.985
...,...,...,...,...,...,...
5995,T005996,W012,2024-02-26,High,13.87,0.925
5996,T005997,W044,2024-04-24,Low,6.18,1.000
5997,T005998,W026,2024-02-08,Medium,7.17,0.979
5998,T005999,W010,2024-04-12,Medium,7.72,0.980


In [7]:
df.to_csv("simulated_worker_tasks.csv", index=False)