In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, BayesianRidge, LassoLars, RANSACRegressor, TheilSenRegressor, HuberRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from xgboost import XGBRegressor
from xgboost import plot_tree
from sklearn import datasets
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pylab import rcParams
import seaborn as sns
%matplotlib inline

In [None]:
# Global plotting defaults: seaborn's dark grid theme first, then small fonts
# and compact figures on a fully transparent background.
sns.set_style("darkgrid")
plt.rcParams.update({
    "font.size": 8,
    "figure.figsize": (4, 3),
    "figure.facecolor": "#00000000",
})

In [None]:
# Load the California housing dataset; with return_X_y=True and as_frame=True
# the features (x) and the target (y) come back as separate pandas objects.
x, y = datasets.fetch_california_housing(return_X_y=True, as_frame=True)
# Preview the first rows of the feature table.
x.head()

In [None]:
# Preview the first values of the regression target.
y.head()

In [None]:
# Summary statistics for every feature column.
x.describe()

In [None]:
# Check the dtype (and non-null count) of each variable.
x.info()

In [None]:
# Count missing values per column (this cell only inspects; nothing is filled or removed here).
x.isna().sum()

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the number of rows in each partition.
tuple(len(part) for part in (x_train, x_test, y_train, y_test))

## Baseline Model

In [None]:
# Baseline: ordinary least-squares linear regression.
linreg_model = LinearRegression()
linreg_model.fit(x_train, y_train)
train_preds, test_preds = (linreg_model.predict(part) for part in (x_train, x_test))
# Training-split fit quality first, then the held-out error.
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Train MSE: ", mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Train R2: ", r2_score(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# Ridge regression (L2-penalized linear model) with default settings.
ridge_model = Ridge(random_state=42)
ridge_model.fit(x_train, y_train)
train_preds, test_preds = (ridge_model.predict(part) for part in (x_train, x_test))
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# RidgeCV: ridge regression that selects alpha from the candidates below
# using its built-in cross-validation.
ridgecv_model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1])
ridgecv_model.fit(x_train, y_train)
train_preds, test_preds = (ridgecv_model.predict(part) for part in (x_train, x_test))
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# Bayesian ridge regression, fitted and evaluated on plain NumPy arrays.
# The pandas inputs are converted once up front and the arrays reused.
x_train_np, x_test_np = np.array(x_train), np.array(x_test)
bayesian_ridge_model = BayesianRidge()
bayesian_ridge_model.fit(x_train_np, np.array(y_train))
train_preds = bayesian_ridge_model.predict(x_train_np)
test_preds = bayesian_ridge_model.predict(x_test_np)
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# Lasso regression (L1-penalized linear model) with default settings.
lasso_model = Lasso(random_state=42)
lasso_model.fit(x_train, y_train)
train_preds, test_preds = (lasso_model.predict(part) for part in (x_train, x_test))
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# LARS lasso model: Lasso fitted with the least-angle regression algorithm.
lars_lasso_model = LassoLars(alpha=0.5, random_state=42).fit(x_train, y_train)
train_preds = lars_lasso_model.predict(x_train)
# Both splits must be scored with lars_lasso_model; predicting the test split
# with the plain `lasso_model` would report another model's test error.
test_preds = lars_lasso_model.predict(x_test)
print("Train RMSE: ", root_mean_squared_error(y_train, train_preds))
print("Test RMSE: ", root_mean_squared_error(y_test, test_preds))

In [None]:
# Gradient-boosted regression trees: 100 boosting stages with a very small learning rate.
gbr_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.001,
    random_state=42,
)
gbr_model.fit(x_train, y_train)
train_preds, test_preds = (gbr_model.predict(part) for part in (x_train, x_test))
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# Bagging ensemble of 100 base estimators (the library's default base estimator is used).
br_model = BaggingRegressor(n_estimators=100, random_state=42)
br_model.fit(x_train, y_train)
train_preds, test_preds = (br_model.predict(part) for part in (x_train, x_test))
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# Robust regression (RANSACRegressor, TheilSenRegressor, HuberRegressor):
# estimators intended to cope with outliers and modeling errors.
ransac_model = RANSACRegressor(random_state=42).fit(x_train, y_train)
train_preds = ransac_model.predict(x_train)
test_preds = ransac_model.predict(x_test)
print("Train RANSAC RMSE: ", root_mean_squared_error(y_train, train_preds))
print("Test RANSAC RMSE: ", root_mean_squared_error(y_test, test_preds))

theil_model = TheilSenRegressor(random_state=42).fit(x_train, y_train)
train_preds = theil_model.predict(x_train)
test_preds = theil_model.predict(x_test)
print("Train Theil RMSE: ", root_mean_squared_error(y_train, train_preds))
print("Test Theil RMSE: ", root_mean_squared_error(y_test, test_preds))

# max_iter is raised to 2000 to give the Huber solver more iterations.
huber_model = HuberRegressor(max_iter=2000).fit(x_train, y_train)
train_preds = huber_model.predict(x_train)
test_preds = huber_model.predict(x_test)
print("Train Huber RMSE: ", root_mean_squared_error(y_train, train_preds))
# The next two values are computed on the training split, so they are
# labelled "Train" (they were previously mislabelled as "Test").
print("Train Huber score: ", huber_model.score(x_train, y_train))
print("Train Huber R2: ", r2_score(y_train, train_preds))
print("Test Huber RMSE: ", root_mean_squared_error(y_test, test_preds))

In [None]:
# ElasticNet model (combined L1/L2 penalty) with default settings.
elastic_model = ElasticNet(random_state=42).fit(x_train, y_train)
# Store this model's predictions in train_preds / test_preds so the metrics
# below describe ElasticNet rather than whatever model ran in the previous cell.
train_preds = elastic_model.predict(x_train)
test_preds = elastic_model.predict(x_test)
# Kept for backward compatibility: this name previously ended up holding the
# test-split predictions.
elastic_preds = test_preds
print("Train RMSE: ", root_mean_squared_error(y_train, train_preds))
print("Test RMSE: ", root_mean_squared_error(y_test, test_preds))

In [None]:
# Random forest: 800 trees of depth <= 20, each grown on a bootstrap sample of
# 800 rows, trained in parallel on all available cores.
random_forest_model = RandomForestRegressor(
    n_estimators=800,
    max_depth=20,
    max_samples=800,
    n_jobs=-1,
    random_state=42,
).fit(x_train, y_train)
train_preds, test_preds = (random_forest_model.predict(part) for part in (x_train, x_test))
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# Linear support vector regression.
# dual="auto" lets scikit-learn decide whether to solve the dual or the primal
# optimization problem; it does not tune any other hyperparameter.
svc_model = LinearSVR(
    dual="auto",
    max_iter=2000,
    random_state=42,
).fit(x_train, y_train)
train_preds, test_preds = (svc_model.predict(part) for part in (x_train, x_test))
print("Train RMSE: ", root_mean_squared_error(y_true=y_train, y_pred=train_preds))
print("Test RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=test_preds))

In [None]:
# XGBRegressor Model
xgb_model = XGBRegressor(random_state=42, n_jobs=-1, objective='reg:squarederror', max_depth=10,
                         n_estimators=200, learning_rate=1e-3)
xgb_model.fit(x_train, y_train)
train_preds = xgb_model.predict(x_train)
test_preds = xgb_model.predict(x_test)
print("Train RMSE: ", root_mean_squared_error(y_train, train_preds))
print("Test RMSE: ", root_mean_squared_error(y_test, test_preds))

In [None]:
import matplotlib.pyplot as plt

def test_params(ModelClass, **params):
    """Train ``ModelClass(**params)`` and return its training and validation RMSE.

    The model is fitted on the notebook-level ``x_train`` / ``y_train``. The
    "validation" error is measured on the held-out ``x_test`` / ``y_test``
    split; no separate validation set is used.

    Args:
        ModelClass: Estimator class whose instances expose ``fit`` (returning
            the fitted estimator) and ``predict``.
        **params: Keyword arguments forwarded to the ``ModelClass`` constructor.

    Returns:
        tuple: ``(train_rmse, val_rmse)``.
    """
    model = ModelClass(**params).fit(x_train, y_train)
    # Pass arguments as (y_true, y_pred), matching the metric's signature and
    # every other call in this notebook.
    train_rmse = root_mean_squared_error(y_train, model.predict(x_train))
    val_rmse = root_mean_squared_error(y_test, model.predict(x_test))
    return train_rmse, val_rmse

def test_param_and_plot(ModelClass, param_name, param_values, **other_params):
    """Sweep one hyperparameter and plot training vs. validation RMSE.

    For each entry of ``param_values`` a model is trained through
    ``test_params`` with ``param_name`` set to that entry (overriding any value
    of the same name in ``other_params``). Both error curves are then drawn on
    a new 10x6 figure.

    Args:
        ModelClass: Estimator class forwarded to ``test_params``.
        param_name: Name of the constructor argument being varied.
        param_values: Values to try, also used as the x-axis of the plot.
        **other_params: Constructor arguments held fixed across the sweep.
    """
    scores = [
        test_params(ModelClass, **{**other_params, param_name: value})
        for value in param_values
    ]
    train_errors = [train_rmse for train_rmse, _ in scores]
    val_errors = [val_rmse for _, val_rmse in scores]

    plt.figure(figsize=(10,6))
    plt.title('Overfitting curve: ' + param_name)
    # Blue = training error, red = validation error; legend order matches plot order.
    for errors, line_style in ((train_errors, 'b-o'), (val_errors, 'r-o')):
        plt.plot(param_values, errors, line_style)
    plt.xlabel(param_name)
    plt.ylabel('RMSE')
    plt.legend(['Training', 'Validation'])

# Fixed XGBRegressor constructor arguments that are unpacked into
# test_param_and_plot while a single hyperparameter is swept.
best_params = dict(
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
    max_depth=10,
    learning_rate=0.001,
)

In [None]:
# Sweep n_estimators over 1, 5 and 20 for XGBRegressor, holding best_params
# fixed, and plot training vs. validation RMSE.
test_param_and_plot(XGBRegressor, 'n_estimators', [1, 5, 20], **best_params)

In [None]:
# Display the installed scikit-learn version used to run this notebook.
import sklearn
sklearn.__version__