In [27]:
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [28]:
# Load the student dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("StudentCleaned.csv")

In [29]:
# List the column names to confirm which feature columns and target are present.
data.columns

Index(['gender', 'parental level of education', 'lunch',
       'test preparation course', 'race/ethnicity_group B',
       'race/ethnicity_group C', 'race/ethnicity_group D',
       'race/ethnicity_group E', 'avg_score'],
      dtype='object')

In [30]:
# Feature matrix: every column except the regression target 'avg_score'.
X = data.drop(columns=['avg_score'])

In [31]:
# Regression target: the 'avg_score' column as a Series.
y = data.loc[:, 'avg_score']

In [32]:
# Preview the first rows of the feature matrix to sanity-check the encoded values.
X.head()

Unnamed: 0,gender,parental level of education,lunch,test preparation course,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E
0,1,4,0,0,True,False,False,False
1,1,3,0,1,False,True,False,False
2,1,5,0,0,True,False,False,False
3,0,2,1,0,False,False,False,False
4,0,3,0,0,False,True,False,False


In [33]:
# Hold out 25% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [34]:
# Hyper-parameter search spaces, keyed by model name. Every parameter carries
# the 'model__' prefix so that it is addressed to a pipeline step called 'model'.
param_grids = dict(
    Ridge=dict(
        model__alpha=np.logspace(-3, 2, num=10),   # 10 values, 0.001 -> 100
    ),
    Lasso=dict(
        model__alpha=np.logspace(-3, 1, num=10),   # 10 values, 0.001 -> 10
    ),
    ElasticNet=dict(
        model__alpha=np.logspace(-3, 1, num=10),   # 10 values, 0.001 -> 10
        model__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    ),
)

In [None]:
# Tune each regularised linear model with a grid search and keep the best
# pipeline per model in `best_models` (model name -> fitted Pipeline).

# Factories for the estimators that can be tuned. Each call builds a fresh,
# unfitted estimator. Keys must match the keys of `param_grids`.
estimator_factories = {
    'Ridge': lambda: Ridge(),
    'Lasso': lambda: Lasso(max_iter=10000),
    'ElasticNet': lambda: ElasticNet(max_iter=10000),
}

# Fail fast, before any fitting, if a grid has no matching estimator. Without
# this check an unrecognised name would silently be tuned as an ElasticNet.
unknown_names = [grid_name for grid_name in param_grids
                 if grid_name not in estimator_factories]
if unknown_names:
    raise ValueError(f"No estimator registered for param grid(s): {unknown_names}")

best_models = {}

for name, param_grid in param_grids.items():
    estimator = estimator_factories[name]()

    # Build pipeline: scaling + model. Keeping the scaler inside the pipeline
    # means it is re-fit on the training part of every CV fold.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator)
    ])

    # Grid search with 5-fold CV, scored by R²
    grid = GridSearchCV(pipe, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ is the pipeline refit on
    # all of X_train using the best parameter combination.
    best_models[name] = grid.best_estimator_
    print(f"🔹 {name}:")
    print(f"   Best Params: {grid.best_params_}")
    print(f"   Best CV R²: {grid.best_score_:.4f}")
    print("-" * 50)

# Evaluate all best models on the held-out test set (Pipeline.score is R² for
# regressors). These numbers are for reporting; choosing a model from them
# would leak test information into model selection.
for name, tuned_pipe in best_models.items():
    test_score = tuned_pipe.score(X_test, y_test)
    print(f"✅ {name} Test R²: {test_score:.4f}")


🔹 Ridge:
   Best Params: {'model__alpha': np.float64(7.742636826811277)}
   Best CV R²: 0.1861
--------------------------------------------------
🔹 Lasso:
   Best Params: {'model__alpha': np.float64(0.001)}
   Best CV R²: 0.1859
--------------------------------------------------
🔹 ElasticNet:
   Best Params: {'model__alpha': np.float64(0.021544346900318832), 'model__l1_ratio': 0.1}
   Best CV R²: 0.1861
--------------------------------------------------
✅ Ridge Test R²: 0.2675
✅ Lasso Test R²: 0.2679
✅ ElasticNet Test R²: 0.2669


In [36]:
# Display the tuned pipelines, keyed by model name.
best_models

{'Ridge': Pipeline(steps=[('scaler', StandardScaler()),
                 ('model', Ridge(alpha=np.float64(7.742636826811277)))]),
 'Lasso': Pipeline(steps=[('scaler', StandardScaler()),
                 ('model', Lasso(alpha=np.float64(0.001), max_iter=10000))]),
 'ElasticNet': Pipeline(steps=[('scaler', StandardScaler()),
                 ('model',
                  ElasticNet(alpha=np.float64(0.021544346900318832),
                             l1_ratio=0.1, max_iter=10000))])}

In [37]:
# Persist one tuned pipeline (scaler + regressor) so it can be reloaded for
# prediction without re-running the grid search.
# NOTE(review): the ElasticNet pipeline is a hard-coded pick; nothing in this
# cell compares the candidates. Confirm this is the intended model.
MODEL_PATH = 'best_regression_model.pkl'

best_model = best_models['ElasticNet']
joblib.dump(best_model, MODEL_PATH)

print(f"✅ Model saved as '{MODEL_PATH}'")

✅ Model saved as 'best_regression_model.pkl'
