<a href="https://colab.research.google.com/github/virf96/Basico/blob/main/KernelRidgeRegression_CrossValidation_RandomizedSearchCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 100)

## Carga de datos

In [26]:
boston = load_boston()
df = pd.DataFrame(data = boston["data"], columns=boston["feature_names"])
df["target"] = boston["target"]
tgt = "target"
ls_features = [x for x in df.columns if x not in [tgt]]

## Preparación de datos

In [27]:
X = df[ls_features]
y = df[[tgt]]

In [28]:
data=pd.concat([X,y],axis=1)
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [29]:
cols_to_use=[]
for i in range(0,len(data.columns)):
             cols_to_use.append(data.columns[i])

cols_to_use.remove('target')

In [30]:
X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],
                                                   data['target'],
                                                   test_size=.3,
                                                   random_state=0)


X_train.shape, X_test.shape, y_train.shape, y_test.shape

((354, 13), (152, 13), (354,), (152,))

In [31]:
# Escalamos set de entrenamiento
Sc = MinMaxScaler()
Sc.fit(X_train)
Xs = Sc.transform(X_train)

Sc_y = MinMaxScaler()
Sc_y.fit(y_train.values.reshape(-1,1))
ys=Sc_y.transform(y_train.values.reshape(-1,1))

## Modelado

### Ridge Regression

In [32]:
model = Ridge()


In [33]:
ls_medias = cross_val_score(estimator=model, X=Xs, y = ys, cv = 4, n_jobs=-1, scoring="r2")
np.mean(ls_medias), np.std(ls_medias)

(0.7202102856981909, 0.051691682523327995)

In [34]:
Ridge?

In [35]:
#Definimos gradilla
param_grid = {
    "alpha": [x/1000 for x in range(5)] + [x for x in range(5)],
    "tol": [0.00001, 0.0000001, 0.01],
    "solver": ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

In [36]:
param_grid

{'alpha': [0.0, 0.001, 0.002, 0.003, 0.004, 0, 1, 2, 3, 4],
 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
 'tol': [1e-05, 1e-07, 0.01]}

In [37]:
clf = GridSearchCV(model, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5)
clf.fit(Xs, ys)
print("Best score: " + str(clf.best_score_))
print("Best estimator: " + str(clf.best_estimator_))

Fitting 4 folds for each of 210 candidates, totalling 840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.1s


Best score: 0.7205705228454813
Best estimator: Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='sag', tol=0.01)


[Parallel(n_jobs=-1)]: Done 837 out of 840 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 840 out of 840 | elapsed:    1.7s finished


In [38]:
clf.best_score_

0.7205705228454813

### Kernel Ridge Regression

In [39]:
param_grid = {"alpha": [x/100 for x in range(5)],
              "kernel": ['linear', 'poly', 'rbf', 'sigmoid', "chi2", "laplacian", "exponential"], 
              "degree": [1, 2, 3],
              "gamma": [x/10 for x in range(5)]}

In [40]:
param_grid

{'alpha': [0.0, 0.01, 0.02, 0.03, 0.04],
 'degree': [1, 2, 3],
 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
 'kernel': ['linear',
  'poly',
  'rbf',
  'sigmoid',
  'chi2',
  'laplacian',
  'exponential']}

In [41]:
n_hyper = np.product([x for x in map(len, param_grid.values())])

In [42]:
n_hyper

525

In [43]:
model = KernelRidge()
model.fit(Xs, ys)
clf_2 = RandomizedSearchCV(model, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5, n_iter=n_hyper*.25)
clf_2.fit(Xs, ys)
print("Best score: " + str(clf_2.best_score_))

Fitting 4 folds for each of 131 candidates, totalling 524 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.4s


Best score: 0.9044580724979976


[Parallel(n_jobs=-1)]: Done 524 out of 524 | elapsed:    3.1s finished


In [44]:
clf_2.best_score_

0.9044580724979976

In [45]:
clf_2.best_estimator_

KernelRidge(alpha=0.01, coef0=1, degree=1, gamma=0.4, kernel='laplacian',
            kernel_params=None)

### Pruebas del modelo ganador

In [46]:
#Escalamos variables
Xt = Sc.transform(X_test)

In [47]:
predicciones=Sc_y.inverse_transform(clf_2.best_estimator_.predict(Xt))

In [48]:
r2_score(y_pred=predicciones, y_true=y_test)

0.8209896088681984

In [49]:
clf_2.best_estimator_.score(Xt, Sc_y.transform(y_test.values.reshape(-1,1)))

0.8209896088681984