In [1]:
import functools
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Resolve project directories relative to the kernel's working directory.
# This assumes the notebook runs from a direct child folder of the project root.
proj_root = Path.cwd().parent  # go up to stage03_python-fundamentals
DATA_DIR = proj_root / "data"
PROC_DIR = DATA_DIR / "processed"

# Make src importable. Guard the append so that re-running this cell does not
# keep adding duplicate entries to sys.path.
src_dir = str(proj_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)
from utils import ensure_dir, get_summary_stats, groupby_mean, save_table, save_histogram

# ensure_dir comes from src/utils; presumably it creates the directory when
# missing -- confirm against the helper's implementation.
ensure_dir(PROC_DIR)
print("Project root:", proj_root)
print("Data dir:", DATA_DIR)
print("Processed dir:", PROC_DIR)

Project root: c:\Users\Tracy\bootcamp_yuning_wang\homework\stage03_python-fundamentals
Data dir: c:\Users\Tracy\bootcamp_yuning_wang\homework\stage03_python-fundamentals\data
Processed dir: c:\Users\Tracy\bootcamp_yuning_wang\homework\stage03_python-fundamentals\data\processed


In [3]:
# Create array and elementwise operations
arr = np.arange(10)        
arr_double = arr * 2      
arr_squared = arr ** 2
arr, arr_double, arr_squared

# Compare loop vs vectorized on a big array
big = np.arange(1_000_000)

# Python list comprehension (loop)
%timeit [x * 2 for x in big]

# NumPy vectorized
%timeit big * 2

126 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.39 ms ± 59.8 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
# Load the starter dataset that lives directly under the data directory.
csv_path = DATA_DIR.joinpath("starter_data.csv")
df = pd.read_csv(csv_path)

# Preview the first five rows, then print column dtypes and non-null counts.
display(df.head(n=5))
df.info()

Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes


In [7]:
def get_summary_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Return descriptive statistics for the numeric columns of ``df``.

    Note: this definition shadows the ``get_summary_stats`` that the setup
    cell imports from ``utils``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; non-numeric columns are dropped before summarising.

    Returns
    -------
    pd.DataFrame
        The result of ``DataFrame.describe()`` (count, mean, std, min,
        quartiles, max) with one column per numeric input column.

    Raises
    ------
    ValueError
        Raised by pandas when ``df`` contains no numeric columns.
    """
    # DataFrame.describe() does not accept a ``numeric_only`` keyword, so the
    # previous try-branch always raised TypeError and only the fallback ever
    # ran. Select the numeric columns directly instead of relying on that
    # exception for control flow.
    return df.select_dtypes(include="number").describe()

# Summarise the numeric columns of the loaded frame; the bare name on the last
# line makes the resulting table the cell's displayed output.
summary_stats = get_summary_stats(df)
summary_stats

Unnamed: 0,value
count,10.0
mean,17.6
std,7.381659
min,10.0
25%,12.25
50%,14.5
75%,23.25
max,30.0


In [8]:
# Aggregate by group: `by_col` names the grouping column handed to the
# groupby_mean helper from utils (presumably a per-group mean of the numeric
# columns -- confirm against the helper). The last line displays the result.
by_col = "category" 
summary_by_cat = groupby_mean(df, by=by_col)
summary_by_cat

Unnamed: 0,category,value
0,A,11.5
1,B,15.666667
2,C,27.666667


In [9]:
# Persist the per-category summary; save_table receives one CSV target path
# and one JSON target path inside the processed directory.
save_table(summary_by_cat, PROC_DIR / "summary.csv", PROC_DIR / "summary.json")

# Bonus: histogram of a numeric column
num_col = "value"  # <-- change if needed
if num_col not in df.columns:
    # Nothing to plot: report it and carry on rather than raising.
    print(f"Column '{num_col}' not found; skipping histogram.")
else:
    save_histogram(df[num_col], PROC_DIR / "histogram.png", title=f"{num_col} distribution")
    print("Saved plot:", PROC_DIR / "histogram.png")

Saved plot: c:\Users\Tracy\bootcamp_yuning_wang\homework\stage03_python-fundamentals\data\processed\histogram.png


In [10]:
from datetime import datetime

def log_call(func):
    """Decorator that prints the function name and a timestamp on every call.

    Parameters
    ----------
    func : callable
        The function to wrap.

    Returns
    -------
    callable
        A wrapper that prints ``"<name> called at <datetime.now()>"`` and then
        returns whatever ``func`` returns for the same arguments.
    """
    # functools.wraps copies __name__, __doc__ and related metadata onto the
    # wrapper, so the decorated function still identifies as itself.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print(f"{func.__name__} called at {datetime.now()}")
        return func(*args, **kwargs)
    return wrapper

@log_call
def calc_mean_std(values):
    """Return ``(mean, std)`` of ``values`` computed with NumPy.

    ``values`` is converted with ``np.array``; the standard deviation uses
    NumPy's default ``ddof=0`` (population standard deviation).
    """
    data = np.array(values)
    return np.mean(data), np.std(data)

# Demo call: the decorator prints the log line, then the tuple is displayed.
calc_mean_std([1, 2, 3, 4, 5])

calc_mean_std called at 2025-08-20 09:12:08.413934


(np.float64(3.0), np.float64(1.4142135623730951))