In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import statsmodels as sm
import sklearn as sk
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from feature_engine.encoding import OneHotEncoder, OrdinalEncoder
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn import set_config
# Show estimators/pipelines as diagrams when they are displayed in the notebook.
set_config(display='diagram')


In [101]:
# Toy 8x4 frame of standard-normal draws used to illustrate the splitters.
# A seeded, local Generator keeps the values identical on every
# Restart & Run All without touching NumPy's global random state.
dummy_data = pd.DataFrame(np.random.default_rng(42).standard_normal((8, 4)))
dummy_data

Unnamed: 0,0,1,2,3
0,-0.938669,-0.817312,0.110687,-0.652692
1,0.669932,-0.255465,-1.861435,0.04801
2,0.314635,-0.581389,0.856725,-0.967931
3,1.515974,0.480953,-1.705045,0.013319
4,-0.996548,0.350242,-0.607316,1.239806
5,1.540909,1.262496,-0.411315,0.889306
6,-0.051422,-0.166656,-0.188734,0.06996
7,0.728771,-0.206037,0.703645,-1.290878


In [102]:
# Hold-out: a single random split. The estimate is unstable because it
# depends on which rows land in each partition, so different splits can
# give different results.
X_train, X_val = train_test_split(
    dummy_data,
    random_state=42,
    test_size=0.2,
)
(X_train.shape, X_val.shape)

((6, 4), (2, 4))

In [103]:
# Rows assigned to the training partition by the hold-out split.
X_train

Unnamed: 0,0,1,2,3
0,-0.938669,-0.817312,0.110687,-0.652692
7,0.728771,-0.206037,0.703645,-1.290878
2,0.314635,-0.581389,0.856725,-0.967931
4,-0.996548,0.350242,-0.607316,1.239806
3,1.515974,0.480953,-1.705045,0.013319
6,-0.051422,-0.166656,-0.188734,0.06996


In [104]:
# Rows assigned to the validation partition by the hold-out split.
X_val

Unnamed: 0,0,1,2,3
1,0.669932,-0.255465,-1.861435,0.04801
5,1.540909,1.262496,-0.411315,0.889306


> ¿Quién me asegura que este split funciona?

> El holdout es muy útil cuando tengo muchísima data...

## KFold

In [105]:
# KFold only hands back the positional train/validation indices of each fold;
# this is the standard procedure.
kf = KFold(n_splits=4)
for fold, (train_idx, val_idx) in enumerate(kf.split(dummy_data), start=1):
    # end='\n\n' reproduces the blank line that followed each message.
    print(f'Validation fold: {fold}', end='\n\n')
    print(f'Data Entrenamiento: {dummy_data.iloc[train_idx]}', end='\n\n')
    print(f'Data Validación: {dummy_data.iloc[val_idx]}')
    print('\n')

# cross_val_score runs this same kind of split loop for us.


Validation fold: 1

Data Entrenamiento:           0         1         2         3
2  0.314635 -0.581389  0.856725 -0.967931
3  1.515974  0.480953 -1.705045  0.013319
4 -0.996548  0.350242 -0.607316  1.239806
5  1.540909  1.262496 -0.411315  0.889306
6 -0.051422 -0.166656 -0.188734  0.069960
7  0.728771 -0.206037  0.703645 -1.290878

Data Validación:           0         1         2         3
0 -0.938669 -0.817312  0.110687 -0.652692
1  0.669932 -0.255465 -1.861435  0.048010


Validation fold: 2

Data Entrenamiento:           0         1         2         3
0 -0.938669 -0.817312  0.110687 -0.652692
1  0.669932 -0.255465 -1.861435  0.048010
4 -0.996548  0.350242 -0.607316  1.239806
5  1.540909  1.262496 -0.411315  0.889306
6 -0.051422 -0.166656 -0.188734  0.069960
7  0.728771 -0.206037  0.703645 -1.290878

Data Validación:           0         1         2         3
2  0.314635 -0.581389  0.856725 -0.967931
3  1.515974  0.480953 -1.705045  0.013319


Validation fold: 3

Data Entrenamiento: 

In [106]:
# Leave-one-out: every row is the validation set exactly once.
# Rule of thumb from the class: use it for datasets of at most ~300 rows.
loo = LeaveOneOut()
for fold, (train_idx, val_idx) in enumerate(loo.split(dummy_data), start=1):
    # end='\n\n' reproduces the blank line that followed each message.
    print(f'Validation fold: {fold}', end='\n\n')
    print(f'Data Entrenamiento: {dummy_data.iloc[train_idx]}', end='\n\n')
    print(f'Data Validación: {dummy_data.iloc[val_idx]}')
    print('\n')

# cross_val_score runs this same kind of split loop for us.

Validation fold: 1

Data Entrenamiento:           0         1         2         3
1  0.669932 -0.255465 -1.861435  0.048010
2  0.314635 -0.581389  0.856725 -0.967931
3  1.515974  0.480953 -1.705045  0.013319
4 -0.996548  0.350242 -0.607316  1.239806
5  1.540909  1.262496 -0.411315  0.889306
6 -0.051422 -0.166656 -0.188734  0.069960
7  0.728771 -0.206037  0.703645 -1.290878

Data Validación:           0         1         2         3
0 -0.938669 -0.817312  0.110687 -0.652692


Validation fold: 2

Data Entrenamiento:           0         1         2         3
0 -0.938669 -0.817312  0.110687 -0.652692
2  0.314635 -0.581389  0.856725 -0.967931
3  1.515974  0.480953 -1.705045  0.013319
4 -0.996548  0.350242 -0.607316  1.239806
5  1.540909  1.262496 -0.411315  0.889306
6 -0.051422 -0.166656 -0.188734  0.069960
7  0.728771 -0.206037  0.703645 -1.290878

Data Validación:           0         1         2        3
1  0.669932 -0.255465 -1.861435  0.04801


Validation fold: 3

Data Entrenamiento:   

In [107]:
def houldout_train(random_state=42, csv_path='../Clase_2022_07_07/mtcars.csv', test_size=0.2):
    """Fit a linear regression on one hold-out split and print train/validation RMSE.

    Parameters
    ----------
    random_state : int, default 42
        Seed passed to ``train_test_split``; changing it changes which rows
        end up in the validation partition.
    csv_path : str, default '../Clase_2022_07_07/mtcars.csv'
        CSV to load. Its first column is used as the index and it must
        contain an ``mpg`` column, which is the regression target.
    test_size : float, default 0.2
        Fraction of rows held out for validation.

    Returns
    -------
    None
        The two RMSE values are printed, not returned.
    """
    df = pd.read_csv(csv_path, index_col=0)

    y = df['mpg']
    X = df.drop(columns='mpg')

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # RMSE is computed as sqrt(MSE) instead of `squared=False`: that keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse_train = np.sqrt(mean_squared_error(y_train, lr.predict(X_train)))
    rmse_val = np.sqrt(mean_squared_error(y_val, lr.predict(X_val)))

    print(f'RMSE Entrenamiento: {rmse_train}')
    print(f'RMSE Validación: {rmse_val}')

houldout_train()

RMSE Entrenamiento: 2.0556595451475492
RMSE Validación: 3.182790390174519


In [108]:
# Same model and data, different split seed: compare the validation RMSE.
houldout_train(24)

RMSE Entrenamiento: 1.9652043190055988
RMSE Validación: 3.728928472065335


In [109]:
# A third split seed, to show how much the hold-out estimate can move.
houldout_train(78)

RMSE Entrenamiento: 1.9150043704276754
RMSE Validación: 5.220103196276678


## KFold:

In [110]:
# 5-fold cross-validation of a plain linear regression on mtcars (target: mpg).
df = pd.read_csv('../Clase_2022_07_07/mtcars.csv', index_col=0)
y = df['mpg']
X = df.drop(columns='mpg')

lr = LinearRegression()
# The scorer returns negated MSE (higher is better), so flip the sign back.
cvs = -cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=5)
# NOTE: this is a mean MSE, not an RMSE like the values printed by houldout_train.
cvs.mean()

16.17083679057401

![sc1](sc1.png)
![sc2](sc2.png)

## LOO (Leave-One-Out)

In [111]:
# Leave-one-out cross-validation of the same linear regression on mtcars.
df = pd.read_csv('../Clase_2022_07_07/mtcars.csv', index_col=0)
y = df['mpg']
X = df.drop(columns='mpg')

lr = LinearRegression()
# The scorer returns negated MSE (higher is better), so flip the sign back.
cvs = -cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=LeaveOneOut())
# Mean MSE over all single-row validation folds.
cvs.mean()


12.181558006901954

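¡Los modelos son iguales! Lo único que cambia es el método de validación (ojo: estos valores son MSE, mientras que el holdout reportó RMSE). El método de validación se elige una vez y se deja así hasta el final.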

In [112]:
# Let's improve the model.
# Degree-2 PolynomialFeatures restricted to interactions (no bias column) is
# a quick way to combine variables pairwise.
pl = SklearnTransformerWrapper(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
)
pl.fit_transform(df[['hp', 'am']])

Unnamed: 0,hp,am,hp.1,am.1,hp am
Mazda RX4,110,1,110.0,1.0,110.0
Mazda RX4 Wag,110,1,110.0,1.0,110.0
Datsun 710,93,1,93.0,1.0,93.0
Hornet 4 Drive,110,0,110.0,0.0,0.0
Hornet Sportabout,175,0,175.0,0.0,0.0
Valiant,105,0,105.0,0.0,0.0
Duster 360,245,0,245.0,0.0,0.0
Merc 240D,62,0,62.0,0.0,0.0
Merc 230,95,0,95.0,0.0,0.0
Merc 280,123,0,123.0,0.0,0.0


In [113]:

# Interaction-only degree-2 expansion -> standardisation -> linear regression,
# scored with leave-one-out MSE on the mtcars X / y defined above.
pipe = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
        ('sc', StandardScaler()),
        ('lr', LinearRegression()),
    ]
)

# The scorer returns negated MSE; negate before averaging to get a positive MSE.
(-cross_val_score(pipe, X, y, scoring='neg_mean_squared_error', cv=LeaveOneOut())).mean()

531.1480210884931

In [114]:
# Same pipeline as the interaction-only one, but with interaction_only=False,
# so the squared terms are generated too. Scored with leave-one-out MSE.
pipe = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)),
        ('sc', StandardScaler()),
        ('lr', LinearRegression()),
    ]
)

# The scorer returns negated MSE; negate before averaging to get a positive MSE.
(-cross_val_score(pipe, X, y, scoring='neg_mean_squared_error', cv=LeaveOneOut())).mean()

635.459420766536

# Titanic 

In [115]:
# Load the Titanic data. NOTE: `df`, `X` and `y` are rebound here and no
# longer refer to mtcars from this point on.
df = pd.read_csv('../Clase_2022_06_07/titanic.csv')

# Five predictors (numeric and categorical) and the binary target.
y = df['Survived']
X = df.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]

X.dtypes

Pclass        int64
Sex          object
Age         float64
Fare        float64
Embarked     object
dtype: object

In [116]:
# Model 1: frequent-category + mean imputation, default ordinal encoding.
pipe = Pipeline(
    steps=[
        # Impute categorical variables with the most frequent category (mode).
        ('ci', CategoricalImputer(imputation_method='frequent')),
        # Impute numerical variables with the mean.
        ('mmi', MeanMedianImputer(imputation_method='mean')),
        # The step is named 'ohe' but it holds an OrdinalEncoder (default method).
        ('ohe', OrdinalEncoder()),
        # Add the Pclass x Sex interaction term.
        ('pl', SklearnTransformerWrapper(
            PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
            variables=['Pclass', 'Sex'],
        )),
        # Standardise only Age and Fare.
        ('sc', SklearnTransformerWrapper(StandardScaler(), variables=['Age', 'Fare'])),
        ('lr', LogisticRegression(random_state=42)),
    ]
)
display(pipe)
# Mean 5-fold accuracy. The hold-out run gave 0.7855. Logistic regression is a
# high-bias model (but also a very stable one).
model1 = cross_val_score(pipe, X, y, scoring='accuracy', cv=5).mean()


In [117]:
# Model 2: frequent-category + median imputation, arbitrary ordinal encoding.
pipe = Pipeline(
    steps=[
        # Impute categorical variables with the most frequent category (mode).
        ('ci', CategoricalImputer(imputation_method='frequent')),
        # Impute numerical variables with the median.
        ('mmi', MeanMedianImputer(imputation_method='median')),
        # The step is named 'ohe' but it holds an OrdinalEncoder ('arbitrary' method).
        ('ohe', OrdinalEncoder(encoding_method='arbitrary')),
        # Add the Pclass x Sex interaction term.
        ('pl', SklearnTransformerWrapper(
            PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
            variables=['Pclass', 'Sex'],
        )),
        # Standardise only Age and Fare.
        ('sc', SklearnTransformerWrapper(StandardScaler(), variables=['Age', 'Fare'])),
        ('lr', LogisticRegression(random_state=42)),
    ]
)
display(pipe)
# Mean 5-fold accuracy. The hold-out run gave 0.7855. Logistic regression is a
# high-bias model (but also a very stable one).
model2 = cross_val_score(pipe, X, y, scoring='accuracy', cv=5).mean()

In [118]:
# Model 3: 'missing' categorical imputation + median imputation,
# arbitrary ordinal encoding.
pipe = Pipeline(
    steps=[
        # Impute categorical variables using the 'missing' method
        # (not the mode, unlike the earlier models).
        ('ci', CategoricalImputer(imputation_method='missing')),
        # Impute numerical variables with the median.
        ('mmi', MeanMedianImputer(imputation_method='median')),
        # The step is named 'ohe' but it holds an OrdinalEncoder ('arbitrary' method).
        ('ohe', OrdinalEncoder(encoding_method='arbitrary')),
        # Add the Pclass x Sex interaction term.
        ('pl', SklearnTransformerWrapper(
            PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
            variables=['Pclass', 'Sex'],
        )),
        # Standardise only Age and Fare.
        ('sc', SklearnTransformerWrapper(StandardScaler(), variables=['Age', 'Fare'])),
        ('lr', LogisticRegression(random_state=42)),
    ]
)
display(pipe)
# Mean 5-fold accuracy. The hold-out run gave 0.7855. Logistic regression is a
# high-bias model (but also a very stable one).
model3 = cross_val_score(pipe, X, y, scoring='accuracy', cv=5).mean()

In [119]:
# Model 4: like model 3, but interactions span Pclass, Sex, Age and Fare,
# and every column is standardised afterwards.
pipe = Pipeline(
    steps=[
        # Impute categorical variables using the 'missing' method
        # (not the mode, unlike the first two models).
        ('ci', CategoricalImputer(imputation_method='missing')),
        # Impute numerical variables with the median.
        ('mmi', MeanMedianImputer(imputation_method='median')),
        # The step is named 'ohe' but it holds an OrdinalEncoder ('arbitrary' method).
        ('ohe', OrdinalEncoder(encoding_method='arbitrary')),
        # Add pairwise interaction terms among the four listed variables.
        ('pl', SklearnTransformerWrapper(
            PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
            variables=['Pclass', 'Sex', 'Age', 'Fare'],
        )),
        # Plain StandardScaler: no variable subset, so all columns are scaled.
        ('sc', StandardScaler()),
        ('lr', LogisticRegression(random_state=42)),
    ]
)
display(pipe)
# Mean 5-fold accuracy. The hold-out run gave 0.7855. Logistic regression is a
# high-bias model (but also a very stable one).
model4 = cross_val_score(pipe, X, y, scoring='accuracy', cv=5).mean()

In [120]:
# Mean 5-fold accuracy of the four Titanic pipelines, in the order they were built.
[model1, model2,  model3,  model4]

[0.7890716213671458,
 0.7958006402611261,
 0.7958006402611261,
 0.8092272926997677]

In [122]:
## Hyperparameters: any value the model does not learn and that I supply myself...
# Hyperparameter Tuning