# Ejemplo de modelamiento

In [1]:
!pip install wooldridge

Collecting wooldridge
  Downloading wooldridge-0.4.4-py3-none-any.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: wooldridge
Successfully installed wooldridge-0.4.4


In [2]:
import wooldridge as wd
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

In [3]:
wage = wd.data('wage2')

In [4]:
wage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 17 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   wage     935 non-null    int64  
 1   hours    935 non-null    int64  
 2   IQ       935 non-null    int64  
 3   KWW      935 non-null    int64  
 4   educ     935 non-null    int64  
 5   exper    935 non-null    int64  
 6   tenure   935 non-null    int64  
 7   age      935 non-null    int64  
 8   married  935 non-null    int64  
 9   black    935 non-null    int64  
 10  south    935 non-null    int64  
 11  urban    935 non-null    int64  
 12  sibs     935 non-null    int64  
 13  brthord  852 non-null    float64
 14  meduc    857 non-null    float64
 15  feduc    741 non-null    float64
 16  lwage    935 non-null    float64
dtypes: float64(4), int64(13)
memory usage: 124.3 KB


In [6]:
wage = wage[['wage', 'educ', 'exper']]

In [7]:
y = wage['wage']
X = wage[['educ', 'exper']]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)
linreg_pred = linreg.predict(X_test)
mean_squared_error(linreg_pred, y_test)

131359.4322838416

$$\min_{\hat{\beta}} \sum_{i=1}^n (y_i-\hat{y}_i)^2$$

## Ridge

$$\min_{\hat{\beta}} \sum_{i=1}^n (y_i-\hat{y}_i)^2 + \lambda_{ridge}\sum_{j=1}^k\hat{\beta}_j^2$$

In [11]:
ridge_reg = Ridge()
param_grid = {'alpha' : [0, 5, 10]}
gs_ridge = GridSearchCV(
    ridge_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3, verbose=2
)

In [13]:
gs_ridge.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ............................................alpha=0; total time=   0.0s
[CV] END ............................................alpha=0; total time=   0.0s
[CV] END ............................................alpha=0; total time=   0.0s
[CV] END ............................................alpha=5; total time=   0.0s
[CV] END ............................................alpha=5; total time=   0.0s
[CV] END ............................................alpha=5; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s
[CV] END ...........................................alpha=10; total time=   0.0s


In [15]:
gs_ridge.best_params_

{'alpha': 10}

In [16]:
ridge_pred = gs_ridge.predict(X_test)
mean_squared_error(ridge_pred, y_test)

131317.5545066945

## Lasso

$$\min_{\hat{\beta}} \sum_{i=1}^n (y_i-\hat{y}_i)^2 + \lambda_{lasso}\sum_{j=1}^k|\hat{\beta}_j|$$

In [17]:
lasso = Lasso()
param_grid = {'alpha': [0, 5, 10, 15, 20, 25, 50, 100]}
gs_lasso = GridSearchCV(lasso,
                        param_grid=param_grid,
                        scoring='neg_mean_squared_error',
                        cv=3)

In [None]:
gs_lasso.fit(X_train, y_train)

In [21]:
lasso_pred = gs_lasso.predict(X_test)

In [22]:
mean_squared_error(lasso_pred, y_test)

131359.4322838416

In [23]:
gs_lasso.best_params_

{'alpha': 0}

## Elastic Net

$$\min_{\hat{\beta}} \sum_{i=1}^n (y_i-\hat{y}_i)^2 + \lambda_{enet}\left(\frac{1-\alpha}{2}\sum_{j=1}^k\hat{\beta}_j^2+\alpha\sum_{j=1}^p|\hat{\beta}_j|\right)$$

In [31]:
en = ElasticNet()
param_grid = {'alpha':[0,1,5, 10],
              'l1_ratio':[0.2, 0.5, 0.9]}
gs_en = GridSearchCV(en, param_grid=param_grid,
                     scoring='neg_mean_squared_error',
                     cv=3,
                     verbose=3)

In [None]:
gs_en.fit(X_train, y_train)
en_pred = gs_en.predict(X_test)
mean_squared_error(en_pred, y_test)

In [33]:
mean_squared_error(en_pred, y_test)

131359.4322838416

## Métricas relevantes para regresión

$$\text{MAE}=\frac{1}{n}\sum_{i=1}^n|y_i-\hat{y}_i|$$
$$\text{MSE}=\frac{1}{n}\sum_{i=1}^n(y_i-\hat{y}_i)^2$$
$$\text{RMSE}=\sqrt{\frac{1}{n}\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$
$$\text{MAPE}=\frac{1}{n}\sum_{i=1}^n\left|\frac{y_i-\hat{y}_i}{y_i}\right|$$