In [48]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR, LinearSVR
from sklearn.metrics import root_mean_squared_error
from scipy.stats import loguniform, uniform

 Train and fine-tune an SVM regressor on the California housing dataset. The targets represent hundreds of thousands of dollars.
Since there are over 20,000 instances, SVMs can be slow, so for hyperparameter tuning you should use far
fewer instances (e.g., 2,000) to test many more hyperparameter combinations. What is your best model’s
RMSE?

In [7]:
housing = datasets.fetch_california_housing()

In [8]:
list(housing)

['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR']

In [11]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [14]:
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.2, random_state=42)

In [16]:
help(LinearSVR)

Help on class LinearSVR in module sklearn.svm._classes:

class LinearSVR(sklearn.base.RegressorMixin, sklearn.linear_model._base.LinearModel)
 |  LinearSVR(*, epsilon=0.0, tol=0.0001, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1.0, dual='auto', verbose=0, random_state=None, max_iter=1000)
 |
 |  Linear Support Vector Regression.
 |
 |  Similar to SVR with parameter kernel='linear', but implemented in terms of
 |  liblinear rather than libsvm, so it has more flexibility in the choice of
 |  penalties and loss functions and should scale better to large numbers of
 |  samples.
 |
 |  The main differences between :class:`~sklearn.svm.LinearSVR` and
 |  :class:`~sklearn.svm.SVR` lie in the loss function used by default, and in
 |  the handling of intercept regularization between those two implementations.
 |
 |  This class supports both dense and sparse input.
 |
 |  Read more in the :ref:`User Guide <svm_regression>`.
 |
 |  .. versionadded:: 0.16
 |
 |  Param

In [22]:
lin_svr = make_pipeline(
    StandardScaler(),
    LinearSVR(dual=True, random_state=42)
)

lin_svr.fit(X_train, y_train)



In [24]:
lin_svr = make_pipeline(
    StandardScaler(),
    LinearSVR(dual=True, random_state=42, max_iter=5000)
)

lin_svr.fit(X_train, y_train)

In [32]:
y_pred = lin_svr.predict(X_train)
rmse = root_mean_squared_error(y_train, y_pred)
rmse

0.979565447829459

In [34]:
y_train[:10]

array([1.03   , 3.821  , 1.726  , 0.934  , 0.965  , 2.648  , 1.573  ,
       5.00001, 1.398  , 3.156  ])

In [38]:
lin_svr.predict(X_train[:10])

array([1.70262004, 2.35797199, 2.45312143, 1.27994239, 1.51361142,
       3.21107334, 1.36468991, 4.09510522, 0.75023238, 2.7104876 ])

In [54]:
svm_reg = make_pipeline(StandardScaler(), SVR())

param_distribs = {
    "svr__gamma": loguniform(0.001, 0.1),
    "svr__C": uniform(1, 10)
}

rnd_search_cv = RandomizedSearchCV(svm_reg, param_distribs, n_iter=5, cv=3, scoring="neg_root_mean_squared_error", random_state=42)
rnd_search_cv.fit(X_train[:2000], y_train[:2000])

In [60]:
rnd_search_cv.best_estimator_

In [64]:
rnd_search_cv.best_score_

-0.5643699696723772

In [66]:
-cross_val_score(rnd_search_cv.best_estimator_, X_train, y_train, scoring="neg_root_mean_squared_error")

array([0.59078454, 0.57839092, 0.58429074, 0.57390926, 0.60218967])

In [68]:
y_pred = rnd_search_cv.best_estimator_.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
rmse

0.5874570680435549