### Importing Modules

In [2]:
import sys
from pathlib import Path

# Locate the project root so the `src` package can be imported.
# The original hard-coded location is tried first (unchanged behaviour on the
# machine it was written for); if it has no `src/` directory, fall back to the
# current working directory and its ancestors so the notebook also runs from
# other checkouts. If nothing matches, the hard-coded path is kept as-is.
_hardcoded_root = Path("/home/vansh/projects/project-nova")
_candidate_roots = (_hardcoded_root, Path.cwd(), *Path.cwd().parents)
project_root = next(
    (root for root in _candidate_roots if (root / "src").is_dir()),
    _hardcoded_root,
).resolve()

# Put the root at the front of sys.path once; re-running the cell is a no-op.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))


import numpy as np
import pandas as pd
from src.data import save_csv
from src.config import DATA_DIR, TARGET_COLUMN
from src.utils import set_seed

### 1. Ensure reproducibility

In [3]:
# Fixed seed so that repeated runs produce the same synthetic data.
# NOTE(review): set_seed is a project helper (src.utils); presumably it seeds
# NumPy's global RNG, which the generation cells rely on — confirm.
SEED = 42
set_seed(SEED)

### 2. Number of rows


In [4]:
# Number of synthetic records to generate; every feature array and the
# partner_id column are sized from this value.
n_rows: int = 10_000


### 3. Generate basic features


In [5]:
# Draw one value per row for each raw feature. All draws come from NumPy's
# global RNG, so the order of these calls must stay fixed to keep the same
# values for a given seed.

# Earnings: normal with mean 30k and sd 8k. An unbounded normal occasionally
# produces negative values, which are not meaningful earnings, so floor at 0
# (clipping does not consume random numbers, so later draws are unaffected).
monthly_earnings = np.clip(np.random.normal(30000, 8000, n_rows), 0, None)

# Trips: Poisson with mean 30 trips/week (non-negative integers).
trips_per_week = np.random.poisson(30, n_rows)

# Rating: normal around 4.6 (sd 0.3), limited to the 1–5 scale.
avg_rating = np.clip(np.random.normal(4.6, 0.3, n_rows), 1, 5)

# Cancellation rate: Beta(1, 10) proportion, skewed towards low values.
# Beta samples already lie in [0, 1]; the clip is kept as a defensive bound.
cancellation_rate = np.clip(np.random.beta(1, 10, n_rows), 0, 1)

# Active days: integers 5–30 inclusive (randint's upper bound is exclusive).
active_days_per_month = np.random.randint(5, 31, n_rows)

### 4. Create ground-truth Nova Score


In [6]:
# Ground-truth score = weighted sum of the features plus noise.
# Each component is scaled so that its reference value earns the stated points:
#   earnings        40 points at 50,000
#   trips per week  20 points at 60
#   rating          20 points at 5.0
#   cancellations   10 points at a rate of 0 (fewer cancellations = higher score)
#   active days     10 points at 30 days
# Individual components are not capped; only the final score is clipped.
earnings_points = monthly_earnings / 50000 * 40
trips_points = trips_per_week / 60 * 20
rating_points = avg_rating / 5 * 20
reliability_points = (1 - cancellation_rate) * 10
activity_points = active_days_per_month / 30 * 10

score_raw = (
    earnings_points
    + trips_points
    + rating_points
    + reliability_points
    + activity_points
)

# Zero-mean Gaussian noise with a standard deviation of 5 points.
noise = np.random.normal(loc=0, scale=5, size=n_rows)

# Keep the final score within the 0–100 range.
noisy_score = score_raw + noise
nova_score = np.clip(noisy_score, 0, 100)

### 5. Build DataFrame


In [7]:
# Sequential, zero-padded identifiers: P00001, P00002, ...
partner_ids = [f"P{i+1:05d}" for i in range(n_rows)]

# Insertion order of this mapping is the column order of the frame.
# Continuous features are rounded for readability; counts are stored as drawn.
columns = {
    "partner_id": partner_ids,
    "monthly_earnings": np.round(monthly_earnings, 2),
    "trips_per_week": trips_per_week,
    "avg_rating": np.round(avg_rating, 2),
    "cancellation_rate": np.round(cancellation_rate, 3),
    "active_days_per_month": active_days_per_month,
    TARGET_COLUMN: np.round(nova_score, 2),
}
df = pd.DataFrame(columns)

### 6. Save to CSV

In [8]:
# Decide where the file goes, make sure the data directory exists
# (creating missing parents, tolerating an existing directory), then
# hand the frame to the project's CSV writer.
csv_path = DATA_DIR / "synthetic_basic_10k.csv"
DATA_DIR.mkdir(exist_ok=True, parents=True)
save_csv(df, csv_path)


### 7. Quick check


In [9]:
# Report where the file was written and the frame's dimensions, then show
# the first five rows as the cell's rich output.
print(
    f"Saved dataset to: {csv_path}",
    f"Shape: {df.shape}",
    sep="\n",
)
df.head(5)

Saved dataset to: /home/vansh/projects/project-nova/data/synthetic_basic_10k.csv
Shape: (10000, 7)


Unnamed: 0,partner_id,monthly_earnings,trips_per_week,avg_rating,cancellation_rate,active_days_per_month,Nova_Score
0,P00001,33973.71,27,4.4,0.05,23,71.01
1,P00002,28893.89,20,4.72,0.052,16,63.45
2,P00003,35181.51,31,4.66,0.069,23,81.22
3,P00004,42184.24,27,4.69,0.022,15,73.58
4,P00005,28126.77,26,4.71,0.019,5,66.21
