# Project 2 — Concrete Strength: EDA & Train/Test Split
This notebook performs initial EDA and creates a fixed 70/30 train/test split (seed=598).

**Deliverables produced here:**
- Summary stats, missingness checks, distributions, correlation matrix
- Optional transformation suggestions based on skewness/nonlinearity
- Frozen 70/30 split CSVs saved to `data/`


In [None]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.utils_data import load_concrete, train_test_split_fixed

# Raise the pandas column display limit so wide frames are not truncated.
pd.set_option('display.max_columns', 100)

# Resolve the data/ folder relative to the current working directory:
# when the working-directory path ends in "notebooks", data/ is a sibling of
# that folder; otherwise data/ sits directly under the working directory.
BASE = os.path.abspath('.')
if BASE.endswith('notebooks'):
    DATA_DIR = os.path.join(os.path.dirname(BASE), 'data')
else:
    DATA_DIR = os.path.join(BASE, 'data')

# Create the folder up front so later CSV writes cannot fail on a missing dir.
os.makedirs(DATA_DIR, exist_ok=True)


In [None]:
# Load the dataset, asking the loader to try the local copy first
# (original note: prefers data/concrete.csv, then the UCI URL if available).
try:
    df = load_concrete(local_first=True)
except RuntimeError as load_error:
    # Report the failure and fall back to a None placeholder: every later cell
    # is guarded by `df is not None`, so the notebook still runs end to end.
    print(load_error)
    df = None

# Quick sanity check of what was loaded: dimensions, first rows, column labels.
if df is not None:
    first_rows = df.head()
    print("Shape:", df.shape)
    print(first_rows)
    print("\nColumns:", list(df.columns))


In [None]:
# Normalize column labels: trim surrounding whitespace, lowercase, turn spaces
# into underscores and drop parentheses. Intended to be safe to re-run.
if df is not None:
    # A single translation pass replaces the three chained .replace() calls.
    label_cleanup = str.maketrans({' ': '_', '(': None, ')': None})
    df.columns = [
        label.strip().lower().translate(label_cleanup) for label in df.columns
    ]
    # Expected response name (per the original note on the UCI file):
    # 'concrete_compressive_strength_mpa'. Nothing in this cell checks for it
    # or renames anything; the column order is left untouched.
    print("\nStandardized columns:", list(df.columns))


In [None]:
# Summary statistics for every column, followed by per-column missing counts.
if df is not None:
    summary_table = df.describe(include='all')
    display(summary_table)  # rich (HTML) rendering when run inside Jupyter

    print("\nMissing values per column:")
    missing_counts = df.isna().sum()
    print(missing_counts)


In [None]:
# Sample skewness of each numeric column, listed from most positive to most
# negative, to help spot candidates for a transformation.
if df is not None:
    numeric_frame = df.select_dtypes(include=[np.number])
    numeric_cols = numeric_frame.columns.tolist()
    skew = df[numeric_cols].skew(numeric_only=True)

    print("Skewness by numeric column:")
    ranked_skew = skew.sort_values(ascending=False)
    print(ranked_skew)


In [None]:
# One histogram per numeric column. Project rules: matplotlib only (no
# seaborn) and exactly one plot per figure, hence a fresh figure each time.
if df is not None:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        fig, ax = plt.subplots()
        # Drop missing values before binning; 30 equal-width bins per column.
        ax.hist(df[col].dropna(), bins=30)
        ax.set_title(f"Histogram: {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Count")
        fig.tight_layout()
        plt.show()


In [None]:
# Correlation matrix heatmap (matplotlib only)
# Shows the pairwise correlations (DataFrame.corr defaults) of all numeric
# columns as a single image with a colorbar.
if df is not None:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    corr = df[numeric_cols].corr()
    plt.figure()
    # Pin the color scale to the full correlation domain [-1, 1]. Without
    # vmin/vmax, imshow stretches the colormap to the observed min/max, so the
    # same color would mean different correlation values on different data.
    im = plt.imshow(
        corr.values,
        interpolation='nearest',
        aspect='auto',
        vmin=-1,
        vmax=1,
    )
    plt.colorbar(im, fraction=0.046, pad=0.04)
    # Label both axes with the column names; x labels are rotated to avoid overlap.
    plt.xticks(range(len(numeric_cols)), numeric_cols, rotation=90)
    plt.yticks(range(len(numeric_cols)), numeric_cols)
    plt.title("Correlation matrix (numeric features)")
    plt.tight_layout()
    plt.show()


In [None]:
# Scatter plots of the response against every other column, one per figure.
if df is not None:
    # Known spellings of the response column, in order of preference.
    response_candidates = (
        "concrete_compressive_strength_mpa",
        "concrete_compressive_strength",
        "csmpa",
        "strength",
        "concrete_compressive_strength_(mpa)",
    )
    y_col = next(
        (name for name in response_candidates if name in df.columns),
        None,
    )
    if y_col is None:
        # No known name matched: fall back to the last column as the response.
        y_col = df.columns[-1]

    predictors = [c for c in df.columns if c != y_col]
    for col in predictors:
        fig, ax = plt.subplots()
        # Small, semi-transparent markers keep dense regions readable.
        ax.scatter(df[col], df[y_col], s=10, alpha=0.7)
        ax.set_xlabel(col)
        ax.set_ylabel(y_col)
        ax.set_title(f"{y_col} vs {col}")
        fig.tight_layout()
        plt.show()


### Transformation suggestions (automated heuristics)
- Consider a **log or log1p** transform for strictly positive variables if:
  1) skewness > 1 **and**
  2) residuals later show heteroscedasticity or strong curvature that a polynomial or spline with modest degrees of freedom fails to capture.
- **Age**: do **not** transform by default; consider `log1p(age)` only if skewness > 1 *and* GAM/spline shapes indicate clear diminishing returns that a log captures more parsimoniously.


In [None]:
# Freeze the train/test partition: split once with test_size=0.3 and seed=598
# (i.e. 70/30), then write both parts to DATA_DIR as CSV files without the
# index column so later notebooks can read the same rows.
if df is not None:
    train_path = os.path.join(DATA_DIR, "concrete_train.csv")
    test_path = os.path.join(DATA_DIR, "concrete_test.csv")

    train_df, test_df = train_test_split_fixed(df, test_size=0.3, seed=598)

    # Write both files first; the confirmations are printed only afterwards.
    for frame, destination in ((train_df, train_path), (test_df, test_path)):
        frame.to_csv(destination, index=False)

    print(f"Saved train to: {train_path}")
    print(f"Saved test to : {test_path}")


### Next steps
- Use `concrete_train.csv` for model selection via 5-fold CV.
- Keep `concrete_test.csv` untouched for the one-time final evaluation.
- Proceed to `02_poly_reg.ipynb`, then to the `03`/`04` spline notebooks.
