# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [None]:
import sys, os
# Resolve the directory one level above the current working directory and put
# it at the front of sys.path (only once) so that `src.*` imports resolve.
PROJECT_ROOT = os.path.abspath("..")
already_on_path = PROJECT_ROOT in sys.path
if not already_on_path:
    sys.path[:0] = [PROJECT_ROOT]
print("Project root added to sys.path:", PROJECT_ROOT)

# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


Project root added to sys.path: /Users/wenshan/Downloads/homework3


In [3]:
import pandas as pd
from src.cleaning import fill_missing_median, drop_missing, normalize_data


In [4]:
import os

# Print the working directory and whether ../src and ../src/cleaning.py exist
# relative to it (a quick check before importing from `src`).
src_dir_path = os.path.abspath("../src")
cleaning_py_path = os.path.abspath("../src/cleaning.py")

print("CWD:", os.getcwd())
print("Has ../src ? ->", os.path.isdir(src_dir_path))
print("Has ../src/cleaning.py ? ->", os.path.isfile(cleaning_py_path))


CWD: /Users/wenshan/Downloads/homework3/notebooks
Has ../src ? -> True
Has ../src/cleaning.py ? -> True


In [5]:
import pandas as pd
from src.cleaning import fill_missing_median

# Smoke test: run fill_missing_median on a tiny frame with one missing value
# per column and print the frame before and after.
# The demo objects deliberately use their own names: a later cell builds the
# real `df_clean` and saves it to disk, and reusing that name here would let an
# out-of-order run save this toy frame instead.
demo_df = pd.DataFrame({"a":[1, None, 3], "b":[10, None, 30]})
print("Before:\n", demo_df)
demo_filled = fill_missing_median(demo_df)
print("After:\n", demo_filled)


Before:
      a     b
0  1.0  10.0
1  NaN   NaN
2  3.0  30.0
After:
      a     b
0  1.0  10.0
1  2.0  20.0
2  3.0  30.0


In [6]:
from pathlib import Path

# Input and output folders, given relative to the current working directory.
RAW_DIR, PROC_DIR = Path("../data/raw"), Path("../data/processed")

# Create the output folder (including missing parents); no error if it exists.
PROC_DIR.mkdir(exist_ok=True, parents=True)

raw_dir_present = RAW_DIR.exists()
proc_dir_present = PROC_DIR.exists()
print("RAW_DIR exists:", raw_dir_present, "| PROC_DIR exists:", proc_dir_present)


RAW_DIR exists: True | PROC_DIR exists: True


## Load Raw Dataset

## Apply Cleaning Functions

In [None]:
import pandas as pd

# Collect every CSV directly inside RAW_DIR, order the paths, and load the
# first one; the cell fails fast when the folder contains no CSV at all.
raw_files = list(RAW_DIR.glob("*.csv"))
raw_files.sort()
assert raw_files, "No CSV files found in ../data/raw/"

raw_path = raw_files[0]
print("Reading:", raw_path.name)

df_raw = pd.read_csv(raw_path)

# Last expression of the cell: preview of the first rows.
df_raw.head()


Reading: sample_data.csv


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [None]:
import numpy as np
import pandas as pd

def _describe_or_empty(frame):
    """Return ``frame.describe()``, or an empty DataFrame if ``frame`` has no columns.

    ``DataFrame.describe()`` raises ``ValueError`` on a frame with zero
    columns, which would abort this cell whenever the raw data has no numeric
    (or no object) columns.
    """
    if frame.shape[1] == 0:
        return pd.DataFrame()
    return frame.describe()


# Baseline snapshot of the raw data. `baseline_shape` and `baseline_na` are
# reused by the raw-vs-clean comparison cell further down.
baseline_shape = df_raw.shape
baseline_na = df_raw.isna().sum().sort_values(ascending=False)

num_desc = _describe_or_empty(df_raw.select_dtypes(include=[np.number]))

obj_desc = _describe_or_empty(df_raw.select_dtypes(include=["object"]))

# Only columns that already have a datetime dtype are summarized here; date
# strings loaded as plain objects are reported in the object summary instead.
from pandas.api import types as pdt
dt_cols = [c for c in df_raw.columns if pdt.is_datetime64_any_dtype(df_raw[c])]
dt = df_raw[dt_cols]

if dt_cols:
    dt_summary = pd.DataFrame({
        "count": dt.notna().sum(),
        "min": dt.min(numeric_only=False),   # allow datetime min/max
        "max": dt.max(numeric_only=False),
        "nunique": dt.nunique(dropna=True)
    })
else:
    dt_summary = pd.DataFrame()

print("Raw shape:", baseline_shape)
# Show the ten columns with the most missing values.
display(baseline_na.head(10).to_frame("raw_NA"))

print("\n--- Numeric describe() ---")
display(num_desc)

print("\n--- Object describe() ---")
display(obj_desc)

if not dt_summary.empty:
    print("\n--- Datetime summary ---")
    display(dt_summary)
else:
    print("\n(No datetime columns detected.)")


Raw shape: (10, 3)


Unnamed: 0,raw_NA
category,0
value,0
date,0



--- Numeric describe() ---


Unnamed: 0,value
count,10.0
mean,17.6
std,7.381659
min,10.0
25%,12.25
50%,14.5
75%,23.25
max,30.0



--- Object describe() ---


Unnamed: 0,category,date
count,10,10
unique,3,10
top,A,2025-08-01
freq,4,1



(No datetime columns detected.)


In [None]:
from src.cleaning import fill_missing_median, drop_missing, normalize_data

# ---- 1) Impute numeric NaNs ----
group_by_cols = None  # e.g., ["region"] or ["store_id"]; otherwise leave as None
df1 = fill_missing_median(df_raw, by=group_by_cols)

# ---- 2) Drop rows that still miss MUST-have fields ----
# With an empty list the drop step is skipped entirely and df1 is passed through.
critical_cols = []  # e.g., ["id", "timestamp"]
df2 = drop_missing(df1, how="any", subset=critical_cols) if critical_cols else df1

# ---- 3) Choose which numeric columns to scale (avoid IDs/codes) ----
import numpy as np

num_cols = df2.select_dtypes(include=[np.number]).columns.tolist()

# Substring match on the lower-cased column label. str() keeps this working
# for non-string labels (e.g. integer column names).
# NOTE(review): because this is a substring test, "id" also excludes names
# such as "width" or "paid"; adjust the tokens if that is not wanted.
avoid_tokens = ("id", "zip", "code", "ssn")
avoid_cols = [c for c in num_cols if any(tok in str(c).lower() for tok in avoid_tokens)]

# Numeric columns with at most this many distinct non-null values are left unscaled.
LOW_CARD_MAX = 5
low_card_cols = [c for c in num_cols if df2[c].nunique(dropna=True) <= LOW_CARD_MAX]

scale_cols = sorted(set(num_cols) - set(avoid_cols) - set(low_card_cols))

print("Numeric columns:", num_cols)
print("Avoid scaling:", sorted(set(avoid_cols + low_card_cols)))
print("Will scale:", scale_cols)

# ---- 4) Normalize (z-score) with gentle winsorization to reduce outlier impact ----
if scale_cols:
    df_clean = normalize_data(df2, columns=scale_cols, method="zscore", clip_outliers=(0.01, 0.99))
else:
    # Nothing qualified for scaling. How normalize_data treats an empty column
    # list is not visible from this notebook, so skip the call and keep the
    # data as-is, matching the "Will scale: []" message printed above.
    df_clean = df2.copy()

df_clean.head()


Numeric columns: ['value']
Avoid scaling: []
Will scale: ['value']


Unnamed: 0,category,value,date
0,A,-1.077517,2025-08-01
1,B,-0.372196,2025-08-02
2,A,-0.803146,2025-08-03
3,B,0.058753,2025-08-04
4,C,1.064301,2025-08-05


In [13]:
from pathlib import Path

# Make sure the processed-data folder exists (re-declared here so this cell
# does not rely on the earlier path-setup cell).
PROC_DIR = Path("../data/processed")
PROC_DIR.mkdir(exist_ok=True, parents=True)

# Output file is named after the raw file's stem, written without the index.
cleaned_name = f"{raw_path.stem}__cleaned.csv"
out_path = PROC_DIR / cleaned_name
df_clean.to_csv(out_path, index=False)
print("✅ Saved cleaned data to:", out_path)


✅ Saved cleaned data to: ../data/processed/sample_data__cleaned.csv


In [14]:
import pandas as pd

# Post-cleaning snapshot, compared against the baseline captured from the raw data.
clean_shape = df_clean.shape
clean_na = df_clean.isna().sum().sort_values(ascending=False)

print("Shapes: raw -> clean:", baseline_shape, "->", clean_shape)

# Side-by-side missing-value counts per column; columns absent on one side
# count as 0, and rows are ordered by remaining missing values in the clean data.
raw_na_col = baseline_na.rename("raw_NA")
clean_na_col = clean_na.rename("clean_NA")
na_compare = pd.concat([raw_na_col, clean_na_col], axis=1)
na_compare = na_compare.fillna(0).astype(int)
na_compare = na_compare.sort_values("clean_NA", ascending=False)
display(na_compare.head(15))

print("\n--- Cleaned: numeric describe() ---")
numeric_clean = df_clean.select_dtypes("number")
display(numeric_clean.describe())

print("\nColumns scaled:", scale_cols)


Shapes: raw -> clean: (10, 3) -> (10, 3)


Unnamed: 0,raw_NA,clean_NA
category,0,0
value,0,0
date,0,0



--- Cleaned: numeric describe() ---


Unnamed: 0,value
count,10.0
mean,-1.665335e-16
std,1.054093
min,-1.077517
25%,-0.7672333
50%,-0.4440214
75%,0.8129139
max,1.756693



Columns scaled: ['value']


## Save Cleaned Dataset

In [None]:
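# NOTE: the cleaned data is already written by the earlier cell that saves
# df_clean to PROC_DIR / f"{raw_path.stem}__cleaned.csv".
# Original template line, kept for reference (`df` is not defined for the real data):
# df.to_csv('../data/processed/sample_data_cleaned.csv', index=False)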