In [2]:
import sys, os
# Make the parent directory importable so `src.*` modules resolve from this
# notebook. The path is relative to the current working directory.
# Guarded so that re-running the cell does not keep appending duplicates.
_project_root = os.path.abspath("..")
if _project_root not in sys.path:
    sys.path.append(_project_root)




In [3]:
# HW06 — Data Preprocessing
import os, glob
import pandas as pd
import numpy as np
from src.config import load_env
from src.cleaning import fill_missing_median, drop_missing, normalize_data

# Project paths object; this cell reads its `.raw` and `.processed` attributes.
paths = load_env()

# Pick newest AAPL raw file from Stage 04; else fallback to starter.
# "Newest" relies on the lexicographic sort of the filename suffix
# (presumably a sortable timestamp — confirm against the Stage 04 writer).
# NOTE(review): the fallback path is relative to the working directory,
# unlike the paths returned by load_env() — confirm it resolves from where
# this notebook is run.
candidates = sorted(glob.glob(os.path.join(paths.raw, "api_yfinance_AAPL_*.csv")))
raw_path = candidates[-1] if candidates else "../data/starter_data.csv"
print("Using raw dataset:", raw_path)

df = pd.read_csv(raw_path)


def _is_float_or_int(series):
    """Return True when the column's dtype name starts with "float" or "int"."""
    return str(series.dtype).startswith(("float", "int"))


# Introduce some NA to demonstrate cleaning (safe: only on numeric cols).
# Only the first two columns are considered; every 15th row is blanked.
for c in df.columns[:2]:
    if _is_float_or_int(df[c]):
        if str(df[c].dtype).startswith("int"):
            # NaN cannot live in an integer column. Upcast explicitly instead
            # of relying on pandas' implicit upcast-on-assignment, which is
            # deprecated (FutureWarning) in recent pandas versions.
            df[c] = df[c].astype("float64")
        df.loc[df.index[::15], c] = np.nan

# Apply cleaning functions (defined in src.cleaning; their exact semantics are
# not visible here — presumably median imputation, sparse-data dropping at a
# 0.9 threshold, and scaling of the numeric columns; verify against the module).
num_cols = [c for c in df.columns if _is_float_or_int(df[c])]
df1 = fill_missing_median(df, num_cols)
df2 = drop_missing(df1, thresh=0.9)
df3 = normalize_data(df2, num_cols)

# Save processed
out_path = os.path.join(paths.processed, "cleaned_dataset.csv")
# Make sure the target directory exists so a fresh checkout does not fail
# with FileNotFoundError on write.
_out_dir = os.path.dirname(out_path)
if _out_dir:
    os.makedirs(_out_dir, exist_ok=True)
df3.to_csv(out_path, index=False)
print("Saved cleaned dataset to:", out_path)

# Compare original vs cleaned.
# Note: `df` here already contains the NaNs injected above, so "Nulls before"
# counts the demonstration NaNs, not only those in the raw file.
print("Original shape:", df.shape)
print("Cleaned shape:", df3.shape)
print("Nulls before:", df.isna().sum().sum(), "after:", df3.isna().sum().sum())


Using raw dataset: data/raw/api_yfinance_AAPL_20250821-0051.csv
Saved cleaned dataset to: data/processed/cleaned_dataset.csv
Original shape: (125, 7)
Cleaned shape: (125, 7)
Nulls before: 9 after: 0
