# Setup: Generate Sample Dataset

This cell creates the required folder structure (`data/raw/` and `data/processed/`) one level above the notebook's folder and builds a small sample dataset that contains missing values.
The dataset is saved to `data/raw/sample_data.csv` so that it is ready for the cleaning functions; if that file already exists, it is left untouched.

In [6]:
import sys, os
# Append the absolute path of the parent of the current working directory to
# the module search path, so the `src` package imported below can be resolved
# (presumably the project root that holds `src/` — confirm against repo layout).
sys.path.append(os.path.abspath(".."))

from src import cleaning

In [7]:
import os
import pandas as pd
import numpy as np

# Folder locations, expressed relative to the notebook's working directory
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Make sure both folders are present (a no-op when they already exist)
for folder in (raw_dir, processed_dir):
    os.makedirs(folder, exist_ok=True)

# Sample records, one list per column; np.nan marks a missing value
data = dict(
    age=[34, 45, 29, 50, 38, np.nan, 41],
    income=[55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    score=[0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    zipcode=['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    city=['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    extra_data=[np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan],
)
df = pd.DataFrame(data)

# Write the CSV into the raw folder, but never clobber an existing file
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if os.path.exists(csv_path):
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')
else:
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')


Sample dataset created and saved to ../data/raw\sample_data.csv


# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [8]:
import pandas as pd
from src import cleaning

## Load Raw Dataset

In [9]:
# Load the raw dataset.
# Zip codes are identifiers, not quantities: the sample data defines them as
# strings, but a CSV round trip with default type inference turns them into
# integers (dropping any leading zeros). Read the column back as text so it
# keeps its original dtype and is not picked up as a numeric column later.
df_raw = pd.read_csv('../data/raw/sample_data.csv', dtype={'zipcode': str})
display(df_raw.head())

# Quick overview of missing values per column
print("Missing values (raw):")
display(df_raw.isna().sum())

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,


Missing values (raw):


age           1
income        3
score         1
zipcode       0
city          0
extra_data    5
dtype: int64

## Apply Cleaning Functions

In [10]:
# Each stage gets its own name so the lineage raw -> cleaned stays inspectable.

# Stage 1: drop columns that are mostly missing (threshold=0.6 keeps columns
# with >=60% non-missing values; `income`, with 4 of 7 values present, falls
# below that and is removed here).
cols_pruned = cleaning.drop_missing(df_raw, threshold=0.6, axis='columns')

# Stage 2: fill numeric NaNs with the median (cols=None => all numeric columns)
imputed = cleaning.fill_missing_median(cols_pruned, cols=None)

# Stage 3: drop any rows that are still incomplete (require 100% non-missing)
complete_rows = cleaning.drop_missing(imputed, threshold=1.0, axis='rows')

# Stage 4: min-max normalize whichever of the candidate columns survived
scale_candidates = ['age', 'income', 'score']
to_scale = [c for c in scale_candidates if c in complete_rows.columns]
df = cleaning.normalize_data(complete_rows, cols=to_scale, method='minmax')

display(df.head())

Unnamed: 0,age,score,zipcode,city
0,0.238095,0.653846,90210,Beverly
1,0.761905,1.0,10001,New York
2,0.0,0.596154,60614,Chicago
3,1.0,0.423077,94103,SF
4,0.428571,0.884615,73301,Austin


In [11]:
# Shape before and after cleaning
print("Shapes: raw -> cleaned:", df_raw.shape, "->", df.shape)

# Per-column non-null counts for both stages. Columns removed during cleaning
# have no entry on the cleaned side, so they come out as NaN after alignment
# and are reported as 0.
non_null_by_stage = {
    'raw_non_null': df_raw.notna().sum(),
    'clean_non_null': df.notna().sum(),
}
comp = pd.DataFrame(non_null_by_stage)
comp = comp.fillna(0).astype(int)
display(comp)

# Remaining missing values per column in the cleaned frame
print("Missing values (cleaned):")
display(df.isna().sum())

Shapes: raw -> cleaned: (7, 6) -> (7, 4)


Unnamed: 0,raw_non_null,clean_non_null
age,6,7
city,7,7
extra_data,2,0
income,4,0
score,6,7
zipcode,7,7


Missing values (cleaned):


age        0
score      0
zipcode    0
city       0
dtype: int64

## Save Cleaned Dataset

In [12]:
# Persist the cleaned frame (without the index column) to the processed folder
out_path = '../data/processed/sample_data_cleaned.csv'
df.to_csv(path_or_buf=out_path, index=False)
print(f"Cleaned dataset saved to {out_path}")

Cleaned dataset saved to ../data/processed/sample_data_cleaned.csv
