################################################################################
# Exercise: College Applications                                               #
################################################################################


In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Models / tools
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


RNG = 921101

def load_college(path: str | Path | None = None) -> pd.DataFrame:
    """
    Load the ISLR 'College' dataset robustly.
    Order of attempts:
      1) pydataset (if installed)
      2) statsmodels rdatasets (if online/available)
      3) local CSV at 'path' (if provided)
    """
    if path is not None and Path(path).exists():
        df = pd.read_csv(path)
        return df

    try:
        df = sm.datasets.get_rdataset("College", package="ISLR").data
        return df
    except Exception:
        print('no statsmodels')
        pass

    raise RuntimeError(
        "Could not load 'College'. Provide a local CSV via path= or install pydataset."
    )

def prepare_college(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of the College data; the input frame is not modified.

    Steps:
      * 'Private' (if present) is encoded as int 0/1. Numeric/bool columns are
        coerced with NaN -> 0; any other dtype (object, string, categorical)
        is treated as text, and values starting with 'y'/'Y' become 1.
      * Remaining text columns are converted to numeric when every value
        parses; otherwise they are left untouched.
      * The index is replaced by a fresh 0..n-1 RangeIndex (the old index is
        dropped, not kept as a column).
    """
    df = df.copy()
    types = pd.api.types

    # Encode Private as 0/1, handling Yes/No or already-numeric
    if 'Private' in df.columns:
        private = df['Private']
        if types.is_numeric_dtype(private):
            df['Private'] = pd.to_numeric(private, errors='coerce').fillna(0).astype(int)
        else:
            # Dispatch on "is it numeric?" rather than "is it object?" so that
            # categorical / dedicated string dtypes are not coerced to all-zero.
            df['Private'] = private.astype(str).str.lower().str.startswith('y').astype(int)

    # Coerce any remaining text columns to numeric if possible
    for c in df.columns:
        col = df[c]
        if types.is_object_dtype(col) or types.is_string_dtype(col):
            try:
                df[c] = pd.to_numeric(col)
            except (ValueError, TypeError):
                # Not fully numeric: keep the column unchanged. This replaces
                # the deprecated pd.to_numeric(..., errors='ignore').
                pass

    # Some rdataset versions include row names as index; ensure plain columns only
    df = df.reset_index(drop=True)
    return df

# ------------------------------
# Load (the train/test split happens in the next cell)
# ------------------------------
# load_college() is called without a path, so it goes straight to the
# statsmodels rdatasets source; pass a CSV path to load a local copy instead.
college = prepare_college(load_college())  # or load_college("College.csv")
# 'Accept' is the regression target used by the modelling cell below.
assert 'Accept' in college.columns, "Expect 'Accept' column in College dataset."

# Show the prepared frame as a sanity check.
print(college)


     Private   Apps  Accept  Enroll  Top10perc  Top25perc  F.Undergrad  \
0          1   1660    1232     721         23         52         2885   
1          1   2186    1924     512         16         29         2683   
2          1   1428    1097     336         22         50         1036   
3          1    417     349     137         60         89          510   
4          1    193     146      55         16         44          249   
..       ...    ...     ...     ...        ...        ...          ...   
772        0   2197    1515     543          4         26         3089   
773        1   1959    1805     695         24         47         2849   
774        1   2097    1915     695         34         61         2793   
775        1  10705    2453    1317         95         99         5217   
776        1   2989    1855     691         28         63         2988   

     P.Undergrad  Outstate  Room.Board  Books  Personal  PhD  Terminal  \
0            537      7440        330

In [None]:

# Hold out 20% of the rows for testing; RNG fixes the shuffle so the split
# is reproducible.
train2, test2 = train_test_split(college, test_size=0.2, random_state=RNG)
# Target is 'Accept'; every other column of `college` is used as a predictor.
X_train = train2.drop(columns=['Accept'])
y_train = train2['Accept']
X_test  = test2.drop(columns=['Accept'])
y_test  = test2['Accept']

# ------------------------------
# Linear Regression
# ------------------------------
# Ordinary least squares on the raw (unscaled) predictors; serves as baseline.
lm = LinearRegression()
lm.fit(X_train, y_train)
lm_pred = lm.predict(X_test)
print("College — Linear Regression MSE:", mean_squared_error(y_test, lm_pred))

# ------------------------------
# Ridge Regression (CV)
# ------------------------------
# Candidate penalties: 100 values log-spaced between 1e-3 and 1e5.
lambdas3 = np.logspace(-3, 5, 100)
# No cv= is passed, so RidgeCV picks alpha with its default CV scheme.
# NOTE(review): Ridge and Lasso below are fit on the raw X_train, before the
# StandardScaler further down is created. Penalised fits depend on predictor
# scale, and sklearn presumably does not standardise here on its own (unlike
# R's glmnet default) — confirm this is intended.
ridge_cv = RidgeCV(alphas=lambdas3)
ridge_cv.fit(X_train, y_train)
ridge_pred = ridge_cv.predict(X_test)
print("College — Ridge MSE:", mean_squared_error(y_test, ridge_pred))
print("College — Best Ridge alpha:", ridge_cv.alpha_)
print("College — Ridge Coeff count:", ridge_cv.coef_.size)

# ------------------------------
# Lasso (CV)
# ------------------------------
# Same alpha grid as Ridge, selected by 5-fold cross-validation.
lasso_cv = LassoCV(alphas=lambdas3, max_iter=10000, cv=5, random_state=RNG)
lasso_cv.fit(X_train, y_train)
lasso_pred = lasso_cv.predict(X_test)
print("College — Lasso MSE:", mean_squared_error(y_test, lasso_pred))
print("College — Best Lasso alpha:", lasso_cv.alpha_)
# Number of predictors the Lasso kept (coefficients not exactly zero).
print("College — Nonzero Lasso Coeffs:", np.sum(lasso_cv.coef_ != 0))

# ------------------------------
# PCR: PCA -> LinearRegression
# ------------------------------
# Scaling statistics are learned on the training rows only and then applied
# to the test rows, so no test information leaks into the transform.
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# Fit PCA with all components, then keep only the leading ncomp_pcr scores.
pca = PCA()
pca.fit(X_train_s)
# Match the R script choice (ncomp=16)
# The min() guards against asking for more components than there are columns.
ncomp_pcr = min(16, X_train_s.shape[1])
pca_train = pca.transform(X_train_s)[:, :ncomp_pcr]
pca_test  = pca.transform(X_test_s)[:, :ncomp_pcr]

# Regress the target on the retained principal-component scores.
lm_pcr = LinearRegression()
lm_pcr.fit(pca_train, y_train)
pcr_pred = lm_pcr.predict(pca_test)
print("College — PCR MSE:", mean_squared_error(y_test, pcr_pred))

# ------------------------------
# PLS
# ------------------------------
# PLS is fit on the same standardised predictors, with at most 7 components.
ncomp_pls = min(7, X_train_s.shape[1])
pls = PLSRegression(n_components=ncomp_pls)
pls.fit(X_train_s, y_train)
# ravel() flattens the prediction array to 1-D before computing the MSE.
pls_pred = pls.predict(X_test_s).ravel()
print("College — PLS MSE:", mean_squared_error(y_test, pls_pred))



College — Linear Regression MSE: 464958.2158424531
College — Ridge MSE: 464946.4308068782
College — Best Ridge alpha: 2.4770763559917115
College — Ridge Coeff count: 17
College — Lasso MSE: 468519.80809350254
College — Best Lasso alpha: 102.35310218990269
College — Nonzero Lasso Coeffs: 15
College — PCR MSE: 488627.3260415615
College — PLS MSE: 479724.9598733734


################################################################################
# Exercise: Predicting Crime Rates                                             #
################################################################################


In [4]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

RNG = 921101

def load_boston(path: str | Path | None = None) -> pd.DataFrame:
    """
    Load MASS::Boston robustly.
    Order of attempts:
      1) statsmodels rdatasets (if online/available)
      2) sklearn.openml 'boston' (may be deprecated in some versions)
      3) local CSV at 'path'
    Returns a DataFrame with column 'crim' present.
    """
    if path is not None and Path(path).exists():
        df = pd.read_csv(path)
        return df

    # Try statsmodels rdatasets
    try:
        import statsmodels.api as sm
        df = sm.datasets.get_rdataset("Boston", package="MASS").data
        return df
    except Exception:
        pass


    raise RuntimeError(
        "Could not load 'Boston'. Provide a local CSV via path=."
    )

def prepare_boston(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of the Boston data; the input frame is not modified.

    Text (object/string) columns are converted to numeric when every value
    parses and are otherwise left untouched. The index is replaced by a fresh
    0..n-1 RangeIndex (the old index is dropped, not kept as a column).
    """
    df = df.copy()
    types = pd.api.types

    # Coerce text columns if any (usually none here)
    for c in df.columns:
        col = df[c]
        if types.is_object_dtype(col) or types.is_string_dtype(col):
            try:
                df[c] = pd.to_numeric(col)
            except (ValueError, TypeError):
                # Not fully numeric: keep the column unchanged. This replaces
                # the deprecated pd.to_numeric(..., errors='ignore').
                pass

    df = df.reset_index(drop=True)
    return df

# ------------------------------
# Load (the train/test split happens in the next cell)
# ------------------------------
# load_boston() is called without a path, so it goes straight to the
# statsmodels rdatasets source; pass a CSV path to load a local copy instead.
boston = prepare_boston(load_boston())  # or load_boston("Boston.csv")
# 'crim' is the regression target used by the modelling cell below.
assert 'crim' in boston.columns, "Expect 'crim' column in Boston dataset."

# Bare expression: shown as the cell's output when run in a notebook.
boston

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [6]:

# Hold out 20% of the rows for testing; RNG fixes the shuffle so the split
# is reproducible.
train3, test3 = train_test_split(boston, test_size=0.2, random_state=RNG)
# Target is 'crim'; every other column of `boston` is used as a predictor.
X_train = train3.drop(columns=['crim'])
y_train = train3['crim']
X_test  = test3.drop(columns=['crim'])
y_test  = test3['crim']

# ------------------------------
# Linear Regression
# ------------------------------
# Ordinary least squares on the raw (unscaled) predictors; serves as baseline.
lm = LinearRegression()
lm.fit(X_train, y_train)
lm_pred = lm.predict(X_test)
print("Boston — Linear Regression MSE:", mean_squared_error(y_test, lm_pred))

# ------------------------------
# Ridge (CV)
# ------------------------------
# Candidate penalties: 200 values log-spaced between 1e-3 and 1e5.
lambdas4 = np.logspace(-3, 5, 200)

# No cv= is passed, so RidgeCV picks alpha with its default CV scheme.
# NOTE(review): Ridge and Lasso below are fit on the raw X_train, before the
# StandardScaler further down is created. Penalised fits depend on predictor
# scale, and sklearn presumably does not standardise here on its own (unlike
# R's glmnet default) — confirm this is intended.
ridge_cv = RidgeCV(alphas=lambdas4)  # no store_cv_values argument; only alpha_ is read below
ridge_cv.fit(X_train, y_train)
ridge_pred = ridge_cv.predict(X_test)
print("Boston — Ridge MSE:", mean_squared_error(y_test, ridge_pred))
print("Boston — Best Ridge alpha:", ridge_cv.alpha_)

# ------------------------------
# Lasso (CV)
# ------------------------------
# Same alpha grid as Ridge, selected by 5-fold cross-validation.
lasso_cv = LassoCV(alphas=lambdas4, max_iter=10000, cv=5, random_state=RNG)
lasso_cv.fit(X_train, y_train)
lasso_pred = lasso_cv.predict(X_test)
print("Boston — Lasso MSE:", mean_squared_error(y_test, lasso_pred))
print("Boston — Best Lasso alpha:", lasso_cv.alpha_)
# Number of predictors the Lasso kept (coefficients not exactly zero).
print("Boston — Nonzero Lasso Coeffs:", np.sum(lasso_cv.coef_ != 0))

# ------------------------------
# PCR
# ------------------------------
# Scaling statistics are learned on the training rows only and then applied
# to the test rows, so no test information leaks into the transform.
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# Fit PCA with all components, then keep only the leading ncomp_pcr scores.
pca = PCA()
pca.fit(X_train_s)
# The min() guards against asking for more components than there are columns.
ncomp_pcr = min(8, X_train_s.shape[1])  # match R choice (8)
pca_train = pca.transform(X_train_s)[:, :ncomp_pcr]
pca_test  = pca.transform(X_test_s)[:, :ncomp_pcr]

# Regress the target on the retained principal-component scores.
lm_pcr = LinearRegression()
lm_pcr.fit(pca_train, y_train)
pcr_pred = lm_pcr.predict(pca_test)
print("Boston — PCR MSE:", mean_squared_error(y_test, pcr_pred))

# ------------------------------
# PLS
# ------------------------------
# PLS is fit on the same standardised predictors, with at most 8 components.
ncomp_pls = min(8, X_train_s.shape[1])
pls = PLSRegression(n_components=ncomp_pls)
pls.fit(X_train_s, y_train)
# ravel() flattens the prediction array to 1-D before computing the MSE.
pls_pred = pls.predict(X_test_s).ravel()
print("Boston — PLS MSE:", mean_squared_error(y_test, pls_pred))


Boston — Linear Regression MSE: 52.215972882816146
Boston — Ridge MSE: 51.756619304224394
Boston — Best Ridge alpha: 202.55019392306664
Boston — Lasso MSE: 51.78536727714496
Boston — Best Lasso alpha: 0.3107866187782014
Boston — Nonzero Lasso Coeffs: 9
Boston — PCR MSE: 52.25404607217897
Boston — PLS MSE: 52.15603973202132
