In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
# Parse 'month' as datetimes at load time: the tuned model further down uses the
# .dt accessor on this column, which fails on plain strings, and the time-series
# plot gets a proper date axis instead of categorical string ticks.
df = pd.read_csv('data/analysis_data/merged_data_US.csv', parse_dates=['month'])
def ridge_for_state(state):
    """Fit a baseline ridge regression of unemployment rate for one state.

    Rows of the module-level ``df`` whose ``state`` column equals ``state``
    are used both to fit the model and to score it, so the printed RMSE and
    R² are in-sample figures. The four predictors are handed to ``Ridge``
    as-is (no scaling step), with a fixed ``alpha=1.0``.

    Side effects: prints the intercept, coefficients, RMSE and R², and shows
    two figures (actual-vs-predicted scatter, and both series against
    ``month``).

    Parameters
    ----------
    state
        Value compared against ``df.state`` to select the rows to model.

    Returns
    -------
    dict
        ``{'state', 'alpha', 'rmse', 'r2'}`` -- one record per call, so that
        callers collecting the return values get usable rows rather than
        ``None``.
    """
    sub = df[df.state == state].copy()
    X = sub[['population', 'median_income', 'initial_claims', 'lfp_rate']]
    y = sub['unemployment_rate']

    model = Ridge(alpha=1.0)
    model.fit(X, y)
    print(f'For {state}:')
    print("Intercept:", model.intercept_)
    print("Coefficients:", dict(zip(X.columns, model.coef_)))

    # Scored on the same rows the model was fitted on (no hold-out split).
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)

    print("RMSE:", rmse)
    print("R²:", r2)

    # Figure 1: predictions against actuals, with the y = x reference line.
    fig, ax = plt.subplots(figsize=(5, 2))
    ax.scatter(y, y_pred, alpha=0.7)
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)  # 45° line
    ax.set_xlabel('Actual Unemployment Rate')
    ax.set_ylabel('Predicted Unemployment Rate')
    ax.set_title(f'{state}: Actual vs Predicted')
    fig.tight_layout()
    plt.show()

    # Figure 2: both series over the 'month' column, in the frame's row order.
    fig, ax = plt.subplots(figsize=(4, 2))
    ax.plot(sub['month'], y, label='Actual', marker='o')
    ax.plot(sub['month'], y_pred, label='Predicted', marker='x')
    ax.set_xlabel('Month')
    ax.set_ylabel('Unemployment Rate')
    ax.set_title(f'{state}: Unemployment: Actual vs. Predicted Over Time')
    ax.legend(loc='best')
    fig.tight_layout()
    plt.show()

    # Same record layout as the tuned model's result so the two can be compared.
    return {'state': state, 'alpha': model.alpha, 'rmse': rmse, 'r2': r2}

FileNotFoundError: [Errno 2] No such file or directory: 'data/analysis_data/merged_data_US.csv'

In [3]:
# One baseline fit per distinct state; keep whatever each call hands back.
results = [ridge_for_state(st) for st in df['state'].unique()]

NameError: name 'df' is not defined

# After Tuning:

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model    import RidgeCV
from sklearn.pipeline        import Pipeline
from sklearn.metrics         import mean_squared_error, r2_score

def tuned_ridge_for_state(state):
    """Fit a standardized, cross-validated ridge model for one state.

    Selects the rows of the module-level ``df`` whose ``state`` column equals
    ``state`` and builds the design matrix from:

    * ``population``, ``median_income``, ``initial_claims``, ``lfp_rate``;
    * the calendar year of ``month``;
    * ``u_lag1``, the previous row's ``unemployment_rate`` after sorting by
      ``month``;
    * month-of-year dummies (first level dropped).

    A ``StandardScaler`` -> ``RidgeCV`` pipeline picks ``alpha`` from 50
    log-spaced values in [1e-3, 1e3] using 5-fold CV. The RMSE and R² that
    are printed and returned are computed on the same rows used for fitting,
    i.e. they are in-sample.

    Parameters
    ----------
    state
        Value compared against ``df.state`` to select the rows to model.

    Returns
    -------
    dict
        ``{'state', 'alpha', 'rmse', 'r2'}``.
    """
    sub = df[df.state == state].copy()

    # 'month' may still be plain strings (e.g. a CSV loaded without date
    # parsing) and the .dt accessor only works on datetimes. This is a no-op
    # when the column is already datetime64.
    sub['month'] = pd.to_datetime(sub['month'])
    # shift(1) is only a "previous month" lag if rows are in chronological order.
    sub = sub.sort_values('month')

    sub['month_num'] = sub['month'].dt.month
    sub['year_num'] = sub['month'].dt.year
    # The earliest row has no predecessor; back-filling gives it its own
    # unemployment_rate as the lag. NOTE(review): that is a one-row target
    # leak -- consider dropping the first row instead.
    # .bfill() replaces the deprecated fillna(method='bfill').
    sub['u_lag1'] = sub['unemployment_rate'].shift(1).bfill()

    base_feats = ['population', 'median_income', 'initial_claims', 'lfp_rate', 'year_num', 'u_lag1']
    month_dummies = pd.get_dummies(sub['month_num'], prefix='m', drop_first=True)
    X = pd.concat([sub[base_feats], month_dummies], axis=1)
    y = sub['unemployment_rate']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('ridgecv', RidgeCV(alphas=np.logspace(-3, 3, 50), cv=5)),
    ])

    pipe.fit(X, y)
    # Scored on the training rows themselves (no hold-out split).
    y_pred = pipe.predict(X)
    alpha = pipe.named_steps['ridgecv'].alpha_
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)

    print(f"{state}: best α={alpha:.4f} → RMSE={rmse:.3f}, R²={r2:.3f}")
    return {'state': state, 'alpha': alpha, 'rmse': rmse, 'r2': r2}

# Fit the tuned model once per distinct state, then tabulate the per-state
# records ordered by ascending R² (weakest fits first).
results = list(map(tuned_ridge_for_state, df['state'].unique()))
pd.DataFrame(results).sort_values(by='r2')