In [18]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [19]:
# Load the California housing data as pandas objects (a DataFrame of features, a Series target).
data = fetch_california_housing(as_frame=True)

# Predictors used by the model; any other column in the dataset is dropped.
feature_columns = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

# With as_frame=True, data.data is already a DataFrame, so select the columns directly
# instead of rebuilding the frame from data.feature_names first.
X = data.data[feature_columns]
y = data.target

# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [20]:
# Peek at the first three training rows to sanity-check the selected columns.
X_train.head(3)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup
17853,5.3994,23.0,5.019157,1.022989,910.0,3.48659
15963,3.9567,52.0,5.173664,1.127863,1848.0,3.526718
20106,3.05,17.0,5.383764,1.095941,753.0,2.778598


In [24]:
# Create pipeline with scaler and SGDRegressor.
# SGD shuffles the training data between epochs, so the regressor is seeded: without a
# fixed random_state the selected hyper-parameters and the reported metrics change from
# one run of this cell to the next.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("sgdr", SGDRegressor(random_state=10)),
])

# Define parameter grid (keys are "<step name>__<parameter>" so they reach the sgdr step).
param_grid = {
    "sgdr__loss": ["huber", "squared_error"],
    "sgdr__penalty": ["l1", "l2"],
    "sgdr__alpha": [0.0001, 0.001, 0.01, 0.1],
    "sgdr__learning_rate": ["constant", "invscaling", "adaptive"],
}

# Create GridSearchCV with pipeline: 5-fold CV, candidates ranked by (negated) MAE.
# Scaling lives inside the pipeline, so the scaler is re-fit on each training fold only.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="neg_mean_absolute_error")

# Fit the model on the training split only; the test split stays untouched until evaluation.
grid_search.fit(X_train, y_train)

# Make predictions on test data (uses the best pipeline, refit on the full training split).
y_pred = grid_search.predict(X_test)

# Calculate MAE and MAPE
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) * 100  # Convert fraction to percentage

# Print results
print("Best SGDRegressor parameters:", grid_search.best_params_)
print("MAE:", mae)
print("MAPE:", mape)

Best SGDRegressor parameters: {'sgdr__alpha': 0.0001, 'sgdr__learning_rate': 'constant', 'sgdr__loss': 'huber', 'sgdr__penalty': 'l1'}
MAE: 0.560597598459277
MAPE: 31.146125850540063
