In [3]:
# ==========================================================
# 🧪 Notebook: 04_generate_test_data.ipynb
# Purpose: Generate CSV test files matching pipeline columns
# ==========================================================

import pandas as pd
import numpy as np
import os, joblib

# ---- Load the exact training column order ----
# The generated CSVs take their column names and order from this object, so it
# presumably holds the pipeline's input column names (verify against the
# notebook that writes the pickle).
# NOTE(review): joblib.load unpickles arbitrary objects; only load files
# produced by this project.
cols = joblib.load('../notebooks/models/base_input_columns.pkl')
print(f"✅ Loaded {len(cols)} columns")

# ---- Ensure dashboard folder exists ----
# exist_ok=True makes re-running the cell safe when the folder is already there.
os.makedirs('../dashboard', exist_ok=True)

# ---- Generator function ----
def generate_test_data(n_rows=10, seed=42, columns=None):
    """Build a DataFrame of synthetic rows, one column per requested name.

    The kind of value generated for a column is chosen from substrings of
    its name (case-sensitive, first match wins):

    * ``'ID'``      -> sequential integers ``1..n_rows``
    * ``'Budget'``  -> random integers in ``[5000, 200000)``
    * ``'Month'``   -> random integers in ``[1, 12]``
    * ``'Count'``, ``'Level'``, ``'Index'``, ``'Score'``, ``'Complexity'``,
      ``'Volatility'``, ``'Risk'`` -> random integers in ``[1, 9]``
    * anything else -> one of ``'Low'``, ``'Medium'``, ``'High'``

    Args:
        n_rows: Number of rows to generate.
        seed: Seed passed to ``np.random.seed`` so repeated calls with the
            same arguments return identical data. Note that this reseeds
            NumPy's global random state.
        columns: Column names, in output order. When ``None`` (the default)
            the module-level ``cols`` list is used.

    Returns:
        A ``pandas.DataFrame`` with ``n_rows`` rows and the requested
        columns in the given order.
    """
    if columns is None:
        # Fall back to the column order loaded at the top of the notebook.
        columns = cols
    columns = list(columns)

    np.random.seed(seed)
    df = pd.DataFrame(index=range(n_rows), columns=columns)

    # Name fragments that all map to the same small-integer range.
    small_int_markers = (
        'Count', 'Level', 'Index', 'Score',
        'Complexity', 'Volatility', 'Risk',
    )

    for c in columns:
        if 'ID' in c:
            df[c] = np.arange(1, n_rows + 1)
        elif 'Budget' in c:
            df[c] = np.random.randint(5000, 200000, n_rows)
        elif 'Month' in c:
            # randint's upper bound is exclusive, so 13 is required for
            # December (12) to be a possible value.
            df[c] = np.random.randint(1, 13, n_rows)
        elif any(marker in c for marker in small_int_markers):
            # Upper bound exclusive: values are 1..9.
            df[c] = np.random.randint(1, 10, n_rows)
        else:
            df[c] = np.random.choice(['Low', 'Medium', 'High'], n_rows)
    return df

# ---- Generate and save multiple CSVs ----
# Five files of increasing size (10, 20, ..., 50 rows). Each file gets its own
# seed (101..105) so the files differ from one another but are reproducible.
for i in range(1, 6):
    df_sample = generate_test_data(n_rows=10 * i, seed=100 + i)
    file_name = f'../dashboard/test_projects_{i}.csv'
    # index=False: write only the data columns, not the pandas row index.
    df_sample.to_csv(file_name, index=False)
    print(f"✅ Saved {file_name} ({len(df_sample)} rows)")

print("\n🎯 All CSVs created with correct structure.")


✅ Loaded 55 columns
✅ Saved ../dashboard/test_projects_1.csv (10 rows)
✅ Saved ../dashboard/test_projects_2.csv (20 rows)
✅ Saved ../dashboard/test_projects_3.csv (30 rows)
✅ Saved ../dashboard/test_projects_4.csv (40 rows)
✅ Saved ../dashboard/test_projects_5.csv (50 rows)

🎯 All CSVs created with correct structure.
