In [1]:
import pandas as pd
from pathlib import Path

# Directory layout, resolved relative to the parent of the current working
# directory (presumably the project root when run from a notebooks folder —
# confirm against how this notebook is launched).
BASE = Path.cwd().parents[0]
RAW = BASE.joinpath("data", "raw")
PROCESSED = BASE.joinpath("data", "processed")
# Create the output directory (and any missing parents) up front; a directory
# that already exists is not an error.
PROCESSED.mkdir(parents=True, exist_ok=True)

# Load the raw CSV into a DataFrame.
raw_path = RAW.joinpath("programs_detailed_boostcamp_kaggle.csv")
df = pd.read_csv(
    raw_path,
    encoding="utf-8",
    low_memory=False,
)

# Basic cleaning
# 1) Strip whitespace from string columns
# Only genuine ``str`` cells are stripped. Missing values (NaN/None) and any
# other non-string objects are passed through untouched: casting the whole
# column with ``astype(str)`` would turn NaN into the literal text "nan",
# which makes the later ``notna()`` title filter ineffective and writes
# "nan" into the exported CSV.
str_cols = df.select_dtypes(include=["object"]).columns
for c in str_cols:
    df[c] = df[c].map(lambda v: v.strip() if isinstance(v, str) else v)

# 2) Normalize list-like columns stored as strings (e.g. "['X','Y']")
import ast

def parse_list_like(val):
    """Convert a stringified list such as ``"['X', 'Y']"`` into a list of strings.

    Parsing happens in two stages:

    1. ``ast.literal_eval`` handles well-formed literals. If the result is a
       list or tuple, each element is converted with ``str`` and stripped.
    2. A lenient fallback handles bracketed text that is not a valid literal
       (e.g. ``"[Beginner, Advanced]"``): the text between the brackets is
       split on commas, blank items are dropped, and one pair of matching
       surrounding quotes is removed from each item.

    Args:
        val: The cell value to normalise. Non-string values (NaN, None,
            numbers, already-parsed lists, ...) are returned unchanged.

    Returns:
        A list of strings when ``val`` looks like a list; otherwise ``val``
        itself, unchanged.
    """
    if not isinstance(val, str):
        # Nothing to parse. Returning early also avoids raising and catching
        # an exception inside literal_eval for every missing cell.
        return val

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval documents for malformed input.
        # Anything else indicates a real bug and should surface.
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(x).strip() for x in parsed]

    # Fallback: split on commas and remove surrounding quotes.
    if val.startswith("[") and val.endswith("]"):
        items = []
        for x in val[1:-1].split(','):
            x = x.strip()
            if not x:
                continue
            # Require at least two characters so that a lone quote character
            # is not mistaken for an (empty) quoted string.
            if len(x) >= 2 and x[0] == x[-1] and x[0] in "'\"":
                x = x[1:-1]
            items.append(x.strip())
        return items
    return val

import json

# Of the columns expected to hold stringified lists, keep those that are
# actually present in this dataset.
list_like_cols = [name for name in ('level', 'goal') if name in df.columns]

# Parse each cell into a real Python list where possible.
for name in list_like_cols:
    df[name] = df[name].apply(parse_list_like)


def _to_json_if_sequence(cell):
    """Serialise lists/tuples to a JSON string; return other values as-is."""
    if isinstance(cell, (list, tuple)):
        return json.dumps(cell, ensure_ascii=False)
    return cell


# Convert list-like columns to JSON strings to allow hashing for duplicate removal
for name in list_like_cols:
    df[name] = df[name].apply(_to_json_if_sequence)

# 3) Dates and 4) numeric coercion, table-driven.
# Each entry pairs a pandas converter with the columns it applies to; the
# date columns are processed first, then the numeric ones. Columns missing
# from the dataset are skipped, and values that cannot be converted become
# NaT / NaN (errors='coerce') rather than raising.
type_coercions = [
    (pd.to_datetime, ('created', 'last_edit')),
    (pd.to_numeric, ('program_length', 'time_per_workout', 'total_exercises')),
]
for convert, column_names in type_coercions:
    for name in column_names:
        if name not in df.columns:
            continue
        df[name] = convert(df[name], errors='coerce')

# 5) Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# 6) Keep only rows whose title is present and non-empty.
if 'title' in df.columns:
    titles = df['title']
    has_title = titles.notna() & titles.str.len().gt(0)
    df = df[has_title]

# 7) Write the cleaned frame to the processed-data directory, without the
# index column, then report where it went.
out_path = PROCESSED.joinpath('programs_detailed_cleaned.csv')
df.to_csv(
    path_or_buf=out_path,
    index=False,
    encoding='utf-8',
)

print('Saved cleaned file to', out_path)



Saved cleaned file to /Users/corin44/Documents/myofitness/myo_datas/data/processed/programs_detailed_cleaned.csv
