In [1]:
# Cell 1 — Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import yaml
import os

# ------------------------------
# Load params.yaml
with open("params.yaml") as f:
    params = yaml.safe_load(f)

RAW_DATA_PATH = params["data"].get("raw", "data/raw/bank-full.csv")  # default if not in params
PROCESSED_PATH = params["data"].get("processed", "data/processed/")

# Ensure processed directory exists
os.makedirs(PROCESSED_PATH, exist_ok=True)

FileNotFoundError: [Errno 2] No such file or directory: 'params.yaml'

In [None]:
# Cell 2 — Functions

def load_data(path=RAW_DATA_PATH):
    """Load raw dataset"""
    df = pd.read_csv(path, sep=";")  # UCI dataset is semicolon-separated
    return df

def clean_data(df):
    """Basic cleaning + encoding"""
    # Drop duplicates
    df = df.drop_duplicates()

    # Encode categorical features
    categorical_cols = df.select_dtypes(include=["object"]).columns
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    return df, label_encoders

def split_data(df, strat_col="y"):
    """Split into train, validation, test"""
    train, temp = train_test_split(df, test_size=0.3, random_state=42, stratify=df[strat_col])
    val, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp[strat_col])
    return train, val, test

def save_splits(train, val, test, out_dir=PROCESSED_PATH):
    train.to_csv(os.path.join(out_dir, "train.csv"), index=False)
    val.to_csv(os.path.join(out_dir, "val.csv"), index=False)
    test.to_csv(os.path.join(out_dir, "test.csv"), index=False)
    print(f"✅ Data saved in {out_dir}")


In [None]:
# Cell 3 — Run preprocessing

df = load_data()
df_clean, encoders = clean_data(df)
train, val, test = split_data(df_clean)
save_splits(train, val, test)