In [7]:
import pandas as pd
import numpy as np

# -------------------------------------------------
# CONFIG
# -------------------------------------------------
SOURCE_FILE = "fraunhofer_dashboard_data.csv"
OUTPUT_FILE = "courses_timeseries.csv"

# Kh√≥a c√≥ anomaly m·∫°nh ƒë·ªÉ demo
ANOMALY_COURSE = "The Complete SQL Bootcamp 2020: Go from Zero to Hero"

# -------------------------------------------------
# LOAD SNAPSHOT DATA
# -------------------------------------------------
df = pd.read_csv(SOURCE_FILE)

# -------------------------------------------------
# Generate last 12 months ending in THIS MONTH
# -------------------------------------------------

today = pd.Timestamp.today()
end_month = today.replace(day=1) + pd.offsets.MonthEnd(0)    # ALWAYS month-end
months = pd.date_range(end=end_month, periods=12, freq="M")

synthetic_rows = []

for _, row in df.iterrows():
    cid = row["course_id"]
    cname = row["course_name"]
    total_part = row["participants"]
    price = row["price"]
    base_rating = row["satisfaction"]

    # -------------------------------------------------
    # 1. Base smooth randomness
    # -------------------------------------------------
    base_dist = np.abs(np.random.normal(1, 0.12, 12))

    # -------------------------------------------------
    # 2. Trend (slight up/down)
    # -------------------------------------------------
    trend_factor = np.linspace(
        1,
        np.random.uniform(0.92, 1.10),
        12
    )

    # -------------------------------------------------
    # 3. Seasonality
    # -------------------------------------------------
    seasonality = np.array([
        1.10, 1.05, 1.00, 0.95, 0.90,
        0.85, 0.90, 0.95, 1.00,
        1.08, 1.15, 1.25
    ])

    signal = base_dist * trend_factor * seasonality
    signal = signal / signal.sum()

    # -------------------------------------------------
    # 4. Participants & Revenue
    # -------------------------------------------------
    monthly_participants = (signal * total_part).astype(int)
    monthly_revenue = monthly_participants * price

    # -------------------------------------------------
    # 5. Satisfaction variation
    # -------------------------------------------------
    monthly_satisfaction = np.clip(
        base_rating + np.random.normal(0, 0.05, 12),
        0, 5
    )

    # -------------------------------------------------
    # 6. Inject anomalies (improved logic)
    # -------------------------------------------------

    # ‚≠ê ALWAYS anomaly in last month (for demo)
    anomaly_factor_last = np.random.uniform(0.30, 1.70)
    monthly_revenue[-1] = int(monthly_revenue[-1] * anomaly_factor_last)
    monthly_participants[-1] = int(monthly_participants[-1] * anomaly_factor_last)

    # Extra strong anomalies for chosen course
    if cname == ANOMALY_COURSE:

        # Strong CRASH on month -2
        monthly_revenue[-2] = int(monthly_revenue[-2] * 0.25)
        monthly_participants[-2] = int(monthly_participants[-2] * 0.25)

        # A random SPIKE earlier
        spike_idx = np.random.randint(0, 10)
        monthly_revenue[spike_idx] = int(monthly_revenue[spike_idx] * 1.8)
        monthly_participants[spike_idx] = int(monthly_participants[spike_idx] * 1.7)

    else:
        # Some courses get minor anomalies
        if np.random.rand() < 0.20:
            idx = np.random.randint(0, 10)
            anomaly_factor = np.random.uniform(0.80, 1.25)
            monthly_revenue[idx] = int(monthly_revenue[idx] * anomaly_factor)
            monthly_participants[idx] = int(monthly_participants[idx] * anomaly_factor)

    # -------------------------------------------------
    # 7. Build rows
    # -------------------------------------------------
    for i in range(12):
        synthetic_rows.append({
            "course_id": cid,
            "course_name": cname,
            "month": months[i],
            "participants_monthly": monthly_participants[i],
            "revenue_monthly": monthly_revenue[i],
            "satisfaction_monthly": monthly_satisfaction[i],
        })

# -------------------------------------------------
# EXPORT
# -------------------------------------------------
ts_df = pd.DataFrame(synthetic_rows)
ts_df.to_csv(OUTPUT_FILE, index=False)

print("üéâ Synthetic time series generated SUCCESSFULLY!")
print(ts_df.tail())


  months = pd.date_range(end=end_month, periods=12, freq="M")


üéâ Synthetic time series generated SUCCESSFULLY!
        course_id                                 course_name  \
163279    2935720  Acabou a Previd√™ncia e agora? -  Volume 03   
163280    2935720  Acabou a Previd√™ncia e agora? -  Volume 03   
163281    2935720  Acabou a Previd√™ncia e agora? -  Volume 03   
163282    2935720  Acabou a Previd√™ncia e agora? -  Volume 03   
163283    2935720  Acabou a Previd√™ncia e agora? -  Volume 03   

                            month  participants_monthly  revenue_monthly  \
163279 2025-07-31 22:22:19.124674                     0              0.0   
163280 2025-08-31 22:22:19.124674                     0              0.0   
163281 2025-09-30 22:22:19.124674                     0              0.0   
163282 2025-10-31 22:22:19.124674                     0              0.0   
163283 2025-11-30 22:22:19.124674                     0              0.0   

        satisfaction_monthly  
163279              0.000000  
163280              0.117840  
163