<a href="https://colab.research.google.com/github/virf96/Basico/blob/main/ModelosRegularizaci%C3%B3n_CrossValidation_Hiperparametrizaci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
pd.set_option("max_columns", 500)
from sklearn.preprocessing import MinMaxScaler

In [77]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, BayesianRidge, Lars

In [78]:
boston = load_boston() ## Load and return the boston house-prices dataset (regression).

In [79]:
boston

{'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate p

In [80]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [81]:
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [82]:
df = pd.DataFrame(data = boston["data"], columns=boston["feature_names"])

In [83]:
df["target"] = boston["target"]
tgt = "target"

In [84]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [85]:
ls_pred = [x for x in df.columns if x not in [tgt]]

In [86]:
X = df[ls_pred]
y = df[tgt]

In [87]:
from sklearn.model_selection import train_test_split

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [89]:
#Escalamos variables
Sc = MinMaxScaler()
Sc.fit(X_train)
Xs = Sc.transform(X_train)

Sc_y = MinMaxScaler()
Sc_y.fit(y_train.values.reshape(-1,1))
ys = Sc_y.transform(y_train.values.reshape(-1,1))

## **LASSO**

# Cross Validation

### No escalado

In [90]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

In [91]:
model_lasso = Lasso()
ls_medias = cross_val_score(estimator=model_lasso, X=X_train, y = y_train, cv = 4, n_jobs=-1, scoring="r2")
ls_medias

array([0.63671538, 0.61341359, 0.66245383, 0.64717443])

In [92]:
np.mean(ls_medias), np.std(ls_medias)

(0.6399393051313349, 0.017841369424064828)

### Escalado

In [93]:
ls_medias = cross_val_score(estimator=model_lasso, X=Xs, y = ys, cv = 4, n_jobs=-1, scoring="r2")
ls_medias

array([-0.02130652, -0.00762276, -0.00156278, -0.0103245 ])

In [94]:
np.mean(ls_medias), np.std(ls_medias)

(-0.010204142252167592, 0.007152166762687557)

# GridSearch

In [95]:
model_lasso.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [96]:
param_grid = {
    "alpha": [x for x in range(1, 100)] + [y/10 for y in range(10)],
    "tol": [0.00001, 0.0000001, 0.01],
    "selection": ['cyclic', 'random']
}

In [97]:
param_grid

{'alpha': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  0.0,
  0.1,
  0.2,
  0.3,
  0.4,
  0.5,
  0.6,
  0.7,
  0.8,
  0.9],
 'selection': ['cyclic', 'random'],
 'tol': [1e-05, 1e-07, 0.01]}

In [98]:
np.prod(list(map(len, param_grid.values())))

654

### Sin Escalamiento

In [99]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [100]:
clf = GridSearchCV(model_lasso, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5)
clf.fit(X_train, y_train)
print("Best score: " + str(clf.best_score_))

Fitting 4 folds for each of 654 candidates, totalling 2616 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1660 tasks      | elapsed:    5.1s


Best score: 0.6955762061964565


[Parallel(n_jobs=-1)]: Done 2613 out of 2616 | elapsed:    8.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 2616 out of 2616 | elapsed:    8.0s finished
  self.best_estimator_.fit(X, y, **fit_params)
  positive)
  positive)


In [101]:
# Guardamos los resultados
summary = pd.DataFrame(clf.cv_results_)

In [102]:
summary.head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_selection,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004155,0.001373,0.002046,0.000416,1,cyclic,1e-05,"{'alpha': 1, 'selection': 'cyclic', 'tol': 1e-05}",0.636717,0.61343,0.662428,0.647193,0.639942,0.017829,63
1,0.003126,8.8e-05,0.001773,0.000212,1,cyclic,1e-07,"{'alpha': 1, 'selection': 'cyclic', 'tol': 1e-07}",0.636717,0.613432,0.662425,0.647195,0.639942,0.017828,62
2,0.002988,0.000219,0.001651,4.4e-05,1,cyclic,0.01,"{'alpha': 1, 'selection': 'cyclic', 'tol': 0.01}",0.636483,0.612086,0.66388,0.645892,0.639585,0.018681,66


In [103]:
summary.sort_values(by = "rank_test_score").tail(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_selection,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
589,0.002701,3.6e-05,0.001591,5e-05,99,cyclic,1e-07,"{'alpha': 99, 'selection': 'cyclic', 'tol': 1e...",0.274975,0.057515,0.198304,0.228545,0.189835,0.081129,652
592,0.002688,2.1e-05,0.001595,2.3e-05,99,random,1e-07,"{'alpha': 99, 'selection': 'random', 'tol': 1e...",0.274975,0.057515,0.198304,0.228545,0.189835,0.081129,653
593,0.003655,0.001198,0.001823,0.000403,99,random,0.01,"{'alpha': 99, 'selection': 'random', 'tol': 0.01}",0.275684,0.057439,0.198312,0.227318,0.189688,0.081202,654


In [104]:
dc_scores={}
dc_scores[str(model_lasso).split("(")[0]] = {"model": clf.best_estimator_, "score": clf.best_score_}
dc_scores

{'Lasso': {'model': Lasso(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='cyclic', tol=1e-05, warm_start=False),
  'score': 0.6955762061964565}}

### Escalado

In [105]:
clf = GridSearchCV(model_lasso, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5)
clf.fit(Xs, ys)
print("Best score: " + str(clf.best_score_))

Fitting 4 folds for each of 654 candidates, totalling 2616 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:    0.4s


Best score: 0.6955762061964569


[Parallel(n_jobs=-1)]: Done 2616 out of 2616 | elapsed:    3.7s finished
  self.best_estimator_.fit(X, y, **fit_params)
  positive)
  positive)


In [106]:
dc_scores[str(model_lasso).split("(")[0]] = {"model": clf.best_estimator_, "score": clf.best_score_}
dc_scores

{'Lasso': {'model': Lasso(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='random', tol=0.01, warm_start=False),
  'score': 0.6955762061964569}}

## **RIDGE**

In [107]:
model_ridge = Ridge()

### Cross Validation Sin escalar

In [108]:
model_ridge.fit(X_train, y_train)
ls_medias_ridge = cross_val_score(estimator=model_ridge, X=X_test, y = y_test, cv = 4, n_jobs=-1, scoring="r2")
np.mean(ls_medias_ridge), np.std(ls_medias_ridge)

(0.6247230080506039, 0.12406489874070754)

### Cross Validation Escalando

In [109]:
model_ridge.fit(Xs, ys)
ls_medias_ridge = cross_val_score(estimator=model_ridge, X=Xs, y = ys, cv = 4, n_jobs=-1, scoring="r2")
np.mean(ls_medias_ridge), np.std(ls_medias_ridge)

(0.6947906117322985, 0.05409209648907116)

#Grid Search Sin Escalado

In [110]:
model_ridge.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [111]:
param_grid = {
    "alpha": [x for x in range(1, 100)] + [y/10 for y in range(10)],
    "tol": [0.00001, 0.0000001, 0.01],
    "solver": ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

In [112]:
np.prod(list(map(len, param_grid.values())))

2289

In [113]:
clf_ridge = GridSearchCV(model_ridge, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2")
clf_ridge.fit(X_train, y_train)
print("Best score: " + str(clf_ridge.best_score_))

Best score: 0.6958329991743583


In [114]:
summary_ridge = pd.DataFrame(clf_ridge.cv_results_)
summary_ridge.head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_solver,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003824,0.000357,0.001968,0.000432,1,auto,1e-05,"{'alpha': 1, 'solver': 'auto', 'tol': 1e-05}",0.571147,0.696962,0.776855,0.730042,0.693752,0.076265,134
1,0.002792,0.00025,0.002252,0.00063,1,auto,1e-07,"{'alpha': 1, 'solver': 'auto', 'tol': 1e-07}",0.571147,0.696962,0.776855,0.730042,0.693752,0.076265,134
2,0.003359,0.000489,0.0018,0.000118,1,auto,0.01,"{'alpha': 1, 'solver': 'auto', 'tol': 0.01}",0.571147,0.696962,0.776855,0.730042,0.693752,0.076265,134


In [115]:
summary_ridge.sort_values(by = "rank_test_score").tail(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_solver,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
1847,0.006227,0.000327,0.001418,8e-05,88,saga,0.01,"{'alpha': 88, 'solver': 'saga', 'tol': 0.01}",0.594932,0.494995,0.550715,0.560429,0.550268,0.035893,2287
1889,0.007374,0.000394,0.001727,5e-05,90,saga,0.01,"{'alpha': 90, 'solver': 'saga', 'tol': 0.01}",0.594149,0.495144,0.550482,0.561085,0.550215,0.035641,2288
2078,0.012637,0.006637,0.001929,4e-05,99,saga,0.01,"{'alpha': 99, 'solver': 'saga', 'tol': 0.01}",0.595411,0.494793,0.55002,0.560476,0.550175,0.036123,2289


In [116]:
dc_scores[str(model_ridge).split("(")[0]] = {"model": clf_ridge.best_estimator_, "score": clf_ridge.best_score_}
dc_scores

{'Lasso': {'model': Lasso(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='random', tol=0.01, warm_start=False),
  'score': 0.6955762061964569},
 'Ridge': {'model': Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='sparse_cg', tol=1e-05),
  'score': 0.6958329991743583}}

## Grid Search Escalado

In [117]:
clf_ridge = GridSearchCV(model_ridge, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2")
clf_ridge.fit(Xs, ys)
print("Best score: " + str(clf_ridge.best_score_))

Best score: 0.6989809156847101


In [118]:
dc_scores[str(model_ridge).split("(")[0]] = {"model": clf_ridge.best_estimator_, "score": clf_ridge.best_score_}
dc_scores

{'Lasso': {'model': Lasso(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='random', tol=0.01, warm_start=False),
  'score': 0.6955762061964569},
 'Ridge': {'model': Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='sag', tol=0.01),
  'score': 0.6989809156847101}}

## **Elastic Net**

In [119]:
model_elastic = ElasticNet()

### Cross Validation Sin escalar

In [120]:
model_elastic.fit(X_train, y_train)
ls_medias_elastic = cross_val_score(estimator=model_elastic, X=X_test, y = y_test, cv = 4, n_jobs=-1, scoring="r2")
np.mean(ls_medias_elastic), np.std(ls_medias_elastic)

(0.636130127759391, 0.10218558607224816)

### Cross Validation Escalado

In [123]:
model_elastic.fit(Xs, ys)
ls_medias_elastic = cross_val_score(estimator=model_elastic, X=Xs, y = ys, cv = 4, n_jobs=-1, scoring="r2")
np.mean(ls_medias_elastic), np.std(ls_medias_elastic)

(-0.010204142252167592, 0.007152166762687557)

### Grid Search Sin escalar

In [124]:
model_elastic.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'l1_ratio': 0.5,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [125]:
param_grid = {
    "alpha": [x for x in range(1, 100)] + [y/10 for y in range(10)],
    "l1_ratio": [x/1000 for x in range(1, 100)] + [y/10 for y in range(10)],
    "selection": ["cyclic", "random"]
}

In [126]:
np.prod(list(map(len, param_grid.values())))

23762

In [127]:
clf_elastic = GridSearchCV(model_elastic, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5,)
clf_elastic.fit(X_train, y_train)
print("Best score: " + str(clf_elastic.best_score_))

Fitting 4 folds for each of 23762 candidates, totalling 95048 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1660 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 4540 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 8572 tasks      | elapsed:   26.9s
[Parallel(n_jobs=-1)]: Done 13756 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done 20092 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 27580 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 36220 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 46012 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 56956 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 69052 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 82300 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 95048 out of 95048 | elapsed:  4.8min finished


Best score: 0.6955762061964568


  self.best_estimator_.fit(X, y, **fit_params)
  positive)
  positive)


In [128]:
summary_elastic = pd.DataFrame(clf_elastic.cv_results_)

In [129]:
summary_elastic.sort_values(by = "rank_test_score").head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,param_selection,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
21699,0.009169,3.7e-05,0.00172,1.7e-05,0,0.059,random,"{'alpha': 0.0, 'l1_ratio': 0.059, 'selection':...",0.582539,0.704626,0.763352,0.731787,0.695576,0.068491,1
21613,0.00933,8.2e-05,0.001754,5.2e-05,0,0.016,random,"{'alpha': 0.0, 'l1_ratio': 0.016, 'selection':...",0.582539,0.704626,0.763352,0.731787,0.695576,0.068491,1
21627,0.008846,0.000107,0.001686,7.1e-05,0,0.023,random,"{'alpha': 0.0, 'l1_ratio': 0.023, 'selection':...",0.582539,0.704626,0.763352,0.731787,0.695576,0.068491,1


In [130]:
dc_scores[str(model_elastic).split("(")[0]] = {"model": clf_elastic.best_estimator_, "score": clf_elastic.best_score_}

In [131]:
dc_scores

{'ElasticNet': {'model': ElasticNet(alpha=0.0, copy_X=True, fit_intercept=True, l1_ratio=0.002,
             max_iter=1000, normalize=False, positive=False, precompute=False,
             random_state=None, selection='random', tol=0.0001, warm_start=False),
  'score': 0.6955762061964568},
 'Lasso': {'model': Lasso(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='random', tol=0.01, warm_start=False),
  'score': 0.6955762061964569},
 'Ridge': {'model': Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='sag', tol=0.01),
  'score': 0.6989809156847101}}

### Grid Search Escalado

In [132]:
clf_elastic = GridSearchCV(model_elastic, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5,)
clf_elastic.fit(Xs, ys)
print("Best score: " + str(clf_elastic.best_score_))

Fitting 4 folds for each of 23762 candidates, totalling 95048 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 3068 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 8828 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 16892 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 27260 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done 39932 tasks      | elapsed:   56.5s
[Parallel(n_jobs=-1)]: Done 54908 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 72188 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 91772 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 95048 out of 95048 | elapsed:  2.3min finished


Best score: 0.695576206196457


  self.best_estimator_.fit(X, y, **fit_params)
  positive)
  positive)


In [133]:
dc_scores[str(model_elastic).split("(")[0]] = {"model": clf_elastic.best_estimator_, "score": clf_elastic.best_score_}
dc_scores

{'ElasticNet': {'model': ElasticNet(alpha=0.0, copy_X=True, fit_intercept=True, l1_ratio=0.058,
             max_iter=1000, normalize=False, positive=False, precompute=False,
             random_state=None, selection='random', tol=0.0001, warm_start=False),
  'score': 0.695576206196457},
 'Lasso': {'model': Lasso(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='random', tol=0.01, warm_start=False),
  'score': 0.6955762061964569},
 'Ridge': {'model': Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='sag', tol=0.01),
  'score': 0.6989809156847101}}

## **Realizamos una busqueda aleatoria de gradilla para ver si conseguimos mejores resultados**

## Sin esacalar

In [134]:
clf = RandomizedSearchCV(n_iter=2000, estimator=model_elastic, param_distributions=param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5)
clf.fit(X_train, y_train)
print("Best score: " + str(clf.best_score_))

Fitting 4 folds for each of 2000 candidates, totalling 8000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1660 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 4540 tasks      | elapsed:   13.9s


Best score: 0.6955762061964567


[Parallel(n_jobs=-1)]: Done 8000 out of 8000 | elapsed:   24.3s finished
  self.best_estimator_.fit(X, y, **fit_params)
  positive)
  positive)


## Escalado

In [135]:
clf = RandomizedSearchCV(n_iter=2000, estimator=model_elastic, param_distributions=param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5)
clf.fit(Xs, ys)
print("Best score: " + str(clf.best_score_))

Fitting 4 folds for each of 2000 candidates, totalling 8000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 3068 tasks      | elapsed:    4.7s


Best score: 0.6955762061964568


[Parallel(n_jobs=-1)]: Done 7974 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 8000 out of 8000 | elapsed:   11.5s finished
  self.best_estimator_.fit(X, y, **fit_params)
  positive)
  positive)


# **Guardar el Modelo**

In [136]:
dc_scores

{'ElasticNet': {'model': ElasticNet(alpha=0.0, copy_X=True, fit_intercept=True, l1_ratio=0.058,
             max_iter=1000, normalize=False, positive=False, precompute=False,
             random_state=None, selection='random', tol=0.0001, warm_start=False),
  'score': 0.695576206196457},
 'Lasso': {'model': Lasso(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='random', tol=0.01, warm_start=False),
  'score': 0.6955762061964569},
 'Ridge': {'model': Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='sag', tol=0.01),
  'score': 0.6989809156847101}}

In [137]:
#Mejor modelo
dc_scores['Ridge']['model']

Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='sag', tol=0.01)

In [138]:
#Lo guardamos en formato pickle
pd.to_pickle(dc_scores['Ridge']['model'],'model_ridge_housing.pkl')

In [139]:
#Leemos el modelo guardado
modelo_ganador=pd.read_pickle('model_ridge_housing.pkl')

In [140]:
modelo_ganador

Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='sag', tol=0.01)

In [141]:
#Predecimos en el test
#Regresamos a la escala original porque el target estaba escalado en el modelo ganador y escalamos el x_test porque también estaba escalado en el entrenamiento
predicciones=Sc_y.inverse_transform(modelo_ganador.predict(Sc.transform(X_test)))
predicciones

array([[13.35258618],
       [22.02750404],
       [24.93182569],
       [17.49652493],
       [37.97115343],
       [20.35751324],
       [25.65239635],
       [14.15498119],
       [23.62587806],
       [14.88056055],
       [21.13710803],
       [31.87000284],
       [28.78711116],
       [ 5.55431079],
       [20.65365966],
       [27.15903204],
       [30.67525065],
       [25.59185266],
       [ 4.91598776],
       [18.18480271],
       [23.21906599],
       [16.59956443],
       [35.10515738],
       [16.28352413],
       [23.7391352 ],
       [21.02497794],
       [19.30664542],
       [23.19654541],
       [21.0062187 ],
       [32.9749458 ],
       [33.49402159],
       [24.48851882],
       [12.82990108],
       [21.40575453],
       [15.7834206 ],
       [32.5461329 ],
       [25.26610182],
       [29.68867027],
       [30.69825721],
       [20.86687419],
       [31.51640916],
       [35.8878449 ],
       [18.57310627],
       [ 2.87665831],
       [15.67585391],
       [23

In [142]:
#Veamos el preformance
#Parece que funciona mejor con el set de prueba
from sklearn.metrics import r2_score
r2_score(y_pred=predicciones, y_true=y_test)

0.7640231392564862

In [144]:
#Predecimos en el target escalado
Xt=Sc.transform(X_test)
modelo_ganador.score(Xt, Sc_y.transform(y_test.values.reshape(-1,1)))

0.7640231392564861