# Setup: Generate Sample Dataset

This cell creates the required folder structure (`data/raw/` and `data/processed/`) relative to the notebook, and generates the sample CSV dataset with missing values. 
This ensures the dataset is ready for cleaning functions and saves it to `data/raw/outliers_homework.csv`.

In [1]:
import os
import numpy as np
import pandas as pd

# Define folder paths relative to this notebook
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Create folders if they don't exist
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Generate business day dates
dates = pd.date_range(start="2022-01-03", end="2022-06-10", freq="B")

# Fixed random seed for reproducibility
np.random.seed(17)

# Column 1: daily_return ~ N(0, 0.01)
returns = np.random.normal(0, 0.01, size=len(dates))
mask_pre_may = dates < "2022-05-01"
returns[mask_pre_may] -= 0.0015  

# Inject "shock" values
shock_values = {
    "2022-05-02": 0.1748425237194541,
    "2022-05-03": -0.16825801732486943,
    "2022-05-06": -0.19667220757153227,
    "2022-05-09": 0.21240223590614747,
    "2022-05-12": -0.178729287231294
}
for d, v in shock_values.items():
    idx = np.where(dates == pd.to_datetime(d))[0][0]
    returns[idx] = v

# Column 2: daily_return_2, correlated with daily_return + small noise
daily_return_2 = returns * 0.6 + np.random.normal(0, 0.005, size=len(dates))

# Create DataFrame with two numeric columns
df = pd.DataFrame({
    "date": dates,
    "daily_return": returns,
    "daily_return_2": daily_return_2
})

# Save to CSV in raw data folder
csv_path = os.path.join(raw_dir, 'outliers_homework.csv')
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
    print(f'Synthetic dataset with two columns created and saved to {csv_path}')
else:
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')

File already exists at ../data/raw/outliers_homework.csv. Skipping CSV creation to avoid overwrite.


# Stage 7 Homework — Outliers + Risk Assumptions
In this assignment you will implement outlier detection/handling and run a simple sensitivity analysis.

**Chain:** In the lecture, we learned detection (IQR, Z-score), options for handling (remove/winsorize), and sensitivity testing. Now, you will adapt those methods to a provided dataset and document the risks and assumptions behind your choices.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
np.random.seed(17)


In [7]:
import pandas as pd, numpy as np
from pathlib import Path
rng = np.random.default_rng(7)  # seeded randomness = reproducible results


## Load Data (provided or synthetic fallback)

In [8]:
data_path = Path("data/raw/outliers_homework.csv")

if data_path.exists():
    df = pd.read_csv(data_path)
else:
    # Synthetic fallback: one numeric column with a few injected outliers
    n = 500
    base = rng.normal(50, 10, n)
    idx = rng.choice(n, 8, replace=False)
    base[idx] = base[idx] + rng.normal(80, 5, idx.size)
    df = pd.DataFrame({"value": base})

df.head(3)


Unnamed: 0,value
0,50.012302
1,52.987455
2,47.258621


## TODO: Implement Outlier Functions (required)

In [None]:
def detect_outliers_iqr(s: pd.Series) -> pd.Series:
    # IQR method: outliers outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    s = pd.to_numeric(s, errors="coerce")
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    return (s < lo) | (s > hi)

def detect_outliers_zscore(s: pd.Series, threshold: float = 3.0) -> pd.Series:
    # Z-score method: |(x - mean)/std| > threshold
    s = pd.to_numeric(s, errors="coerce")
    mu, sigma = s.mean(), s.std(ddof=0)
    if not np.isfinite(sigma) or sigma == 0:
        return pd.Series(False, index=s.index)
    z = (s - mu) / sigma
    return z.abs() > threshold

def winsorize_series(s: pd.Series, lower: float = 0.05, upper: float = 0.95) -> pd.Series:
    # Winsorize: clip extremes to chosen quantiles (keeps row count)
    s = pd.to_numeric(s, errors="coerce")
    lo, hi = s.quantile(lower), s.quantile(upper)
    return s.clip(lower=lo, upper=hi)


## Apply Detection and Create Flags (choose a numeric column)

In [15]:
num_cols = df.select_dtypes(include="number").columns.tolist()
assert len(num_cols) > 0, "No numeric columns found."
col = num_cols[0]  # change if you want a different numeric column
col


'value'

In [16]:
df[f"{col}_outlier_iqr"] = detect_outliers_iqr(df[col])
df[f"{col}_outlier_z3"]  = detect_outliers_zscore(df[col], threshold=3.0)

# quick counts to sanity-check
df[[f"{col}_outlier_iqr", f"{col}_outlier_z3"]].sum()


value_outlier_iqr    10
value_outlier_z3      8
dtype: int64

In [None]:
target_col = 'y' if 'y' in df.columns else df.select_dtypes(include=['number']).columns[0]
df['outlier_iqr'] = detect_outliers_iqr(df[target_col])
df['outlier_z'] = detect_outliers_zscore(df[target_col], threshold=3.0)
df[['outlier_iqr', 'outlier_z']].mean()  # fraction flagged

### Visual Checks (boxplot / histogram)

In [None]:
plt.figure()
plt.boxplot(df[target_col])
plt.title(f'Boxplot: {target_col}')
plt.show()

plt.figure()
plt.hist(df[target_col], bins=30)
plt.title(f'Histogram: {target_col}')
plt.show()

## Sensitivity Analysis
Pick one: summary stats or simple linear regression comparing **all vs. filtered** (and optional winsorized).

In [None]:
# Option A: Summary stats
summ_all = df[target_col].describe()[['mean', '50%', 'std']].rename({'50%': 'median'})
summ_filtered = df.loc[~df['outlier_iqr'], target_col].describe()[['mean', '50%', 'std']].rename({'50%': 'median'})
summ_w = None
if 'winsorize_series' in globals():
    w = winsorize_series(df[target_col])
    summ_w = w.describe()[['mean', '50%', 'std']].rename({'50%': 'median'})

comp = pd.concat(
    {
        'all': summ_all,
        'filtered_iqr': summ_filtered,
        **({'winsorized': summ_w} if summ_w is not None else {})
    }, axis=1
)
comp

In [None]:
# Option B: Simple regression (if x present)
if 'x' in df.columns:
    X_all = df[['x']].to_numpy(); y_all = df[target_col].to_numpy()
    X_filtered = df.loc[~df['outlier_iqr'], ['x']].to_numpy(); y_filtered = df.loc[~df['outlier_iqr'], target_col].to_numpy()

    model_all = LinearRegression().fit(X_all, y_all)
    model_flt = LinearRegression().fit(X_filtered, y_filtered)

    mae_all = mean_absolute_error(y_all, model_all.predict(X_all))
    mae_flt = mean_absolute_error(y_filtered, model_flt.predict(X_filtered))

    results = pd.DataFrame({
        'slope': [model_all.coef_[0], model_flt.coef_[0]],
        'intercept': [model_all.intercept_, model_flt.intercept_],
        'r2': [model_all.score(X_all, y_all), model_flt.score(X_filtered, y_filtered)],
        'mae': [mae_all, mae_flt]
    }, index=['all', 'filtered_iqr'])
    results
else:
    results = None
    print("No 'x' column; skip regression or engineer features.")

### Reflection (≤ 1 page)
- Methods and thresholds used (and why)
- Assumptions behind choices
- Observed impact on results
- Risks if assumptions are wrong (e.g., discarding true events)

*Write your reflection here...*

### Reflection
- IQR is robust for skewed / heavy-tailed distributions, Z-score works better if data ≈ normal.  
- Lower thresholds (k=1.0 or z=2.0) flag more points as outliers, higher thresholds flag fewer.  
- Winsorizing is useful when I want to keep all rows but reduce influence of extremes.  
- Choice of method/threshold depends on data shape and modeling goals:  
  - If I want conservative detection → use Z>3 or IQR k=2.  
  - If I want aggressive detection → use Z>2 or IQR k=1.  
