In [1]:
# hw03_python_fundamentals.ipynb
# Homework 3: Python Fundamentals
# Author: Tianze Xia
# Date: 2025-08-20

# ============================
# Step 0: Imports & Setup
# ============================
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Optional: add src/ to path for utils
sys.path.append(os.path.abspath("../src"))

# ============================
# Step 1: NumPy Operations
# ============================

# Create a large NumPy array
arr = np.arange(1, 100001)

# Elementwise operation: square each element
import time

# Loop version
start = time.time()
squared_loop = [x**2 for x in arr]
end = time.time()
print("Loop execution time:", end - start, "seconds")

# Vectorized version
start = time.time()
squared_vec = arr**2
end = time.time()
print("Vectorized execution time:", end - start, "seconds")

# ============================
# Step 2: Dataset Loading
# ============================

# Load provided CSV
data_path = "../data/starter_data.csv"  # adjust path if needed
df = pd.read_csv(data_path)

# Inspect data
print("Data Info:")
print(df.info())
print("\nFirst 5 rows:")
print(df.head())

# ============================
# Step 3: Summary Statistics
# ============================

# Numeric descriptive statistics
summary_stats = df.describe()
print("\nSummary Statistics:")
print(summary_stats)

# Groupby aggregation (example: group by 'Category' column)
if 'Category' in df.columns:
    grouped_stats = df.groupby('Category').agg({
        'Value1': 'mean',
        'Value2': 'sum'
    })
    print("\nGrouped Statistics by Category:")
    print(grouped_stats)
else:
    print("\nNo 'Category' column found for groupby example.")

# ============================
# Step 4: Save Outputs
# ============================

# Create processed folder if it doesn't exist
os.makedirs("../data/processed", exist_ok=True)

# Save summary stats to CSV
summary_stats.to_csv("../data/processed/summary.csv", index=True)
summary_stats.to_json("../data/processed/summary.json")

# Bonus: simple plot
if 'Value1' in df.columns:
    plt.figure(figsize=(6,4))
    df['Value1'].hist()
    plt.title("Value1 Distribution")
    plt.xlabel("Value1")
    plt.ylabel("Frequency")
    plt.savefig("../data/processed/value1_hist.png")
    plt.show()

# ============================
# Step 5: Reusable Function
# ============================

def get_summary_stats(df):
    """
    Returns descriptive statistics and groupby aggregation by Category (if exists)
    """
    summary = df.describe()
    if 'Category' in df.columns:
        grouped = df.groupby('Category').agg({
            'Value1': 'mean',
            'Value2': 'sum'
        })
    else:
        grouped = None
    return summary, grouped

# Test function
summary, grouped = get_summary_stats(df)
print("\nReusable Function Output:")
print(summary)
if grouped is not None:
    print(grouped)

# ============================
# Optional: Save grouped stats
# ============================
if grouped is not None:
    grouped.to_csv("../data/processed/grouped_stats.csv")

Loop execution time: 0.0037996768951416016 seconds
Vectorized execution time: 0.00029087066650390625 seconds


FileNotFoundError: [Errno 2] No such file or directory: '../data/starter_data.csv'