In [1]:
import numpy as np
import pandas as pd
import warnings
import math
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression

In [22]:
# Synthetic dataset: two identical features on an evenly spaced grid over
# [1, 100], and a target that is x1 plus four times the cube root of x2.
x1 = np.linspace(1, 100, 10)
x2 = x1.copy()
y = x1 + 4 * np.cbrt(x2)

# Column order is x1, x2, y (the target lives in the same frame).
data = pd.DataFrame(dict(x1=x1, x2=x2, y=y))


In [23]:
# Working frame used by the remaining cells; `data` is already a DataFrame,
# so this simply re-wraps it under the name `df`.
df = pd.DataFrame(data)

In [24]:
# Preview the first rows of the synthetic dataset.
df.head()

Unnamed: 0,x1,x2,y
0,1.0,1.0,5.0
1,12.0,12.0,21.157714
2,23.0,23.0,34.375468
3,34.0,34.0,46.958447
4,45.0,45.0,59.227573


In [28]:
# Split into train and test: the first 7 rows train, the remaining rows test.
train, test = df.iloc[:7], df.iloc[7:]

# Features are every column except the target `y`.
train_X, train_y = train.drop(columns='y'), train['y']
test_X, test_y = test.drop(columns='y'), test['y']

In [29]:
# Inspect the feature rows reserved for testing.
test_X

Unnamed: 0,x1,x2
7,78.0,78.0
8,89.0,89.0
9,100.0,100.0


In [30]:
# Baseline: check whether a plain linear regression on the raw features
# predicts the test rows correctly.

# fit() returns the estimator itself, so m1 and fit1 name the same fitted model.
fit1 = m1 = LinearRegression().fit(train_X, train_y)
preds = m1.predict(test_X)

print(f"\n{preds}")
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds))}\n")


[ 97.30728538 110.16021944 123.0131535 ]
RMSE: 3.4441125292015107



In [31]:
# The baseline predictions are not bad, but let's engineer the input features
# to improve them. What happens if we replace x2 with 4 * sqrt(x2)?
#
# The transformed features go into NEW frames instead of overwriting
# train_X / test_X: assigning back into the originals would make this cell
# non-idempotent (re-running it would apply the square root twice) and would
# silently change the inputs seen by every later cell.

train_X_sqrt = train_X.assign(x2=4 * np.sqrt(train_X.x2))
test_X_sqrt = test_X.assign(x2=4 * np.sqrt(test_X.x2))

print(test_X_sqrt)

m2 = LinearRegression()

# Fit and evaluate on the engineered features only.
fit2 = m2.fit(train_X_sqrt, train_y)

preds = fit2.predict(test_X_sqrt)

print(f"\nPred:{preds}")

print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds))}\n")

      x1         x2
7   78.0  35.327043
8   89.0  37.735925
9  100.0  40.000000

Pred:[ 94.9049311  106.52812806 118.06620819]
RMSE: 0.36244368054867443



In [32]:
# Restore: rebuild the train/test split from df so the pipeline cells below
# start from the untransformed features.
train, test = df.iloc[:7], df.iloc[7:]

# Features are every column except the target `y`.
train_X, train_y = train.drop(columns='y'), train['y']
test_X, test_y = test.drop(columns='y'), test['y']

In [33]:
# Custom transformer class

class ExperimentalTransformer(BaseEstimator, TransformerMixin):
  """Demo transformer that replaces column ``x2`` with ``4 * sqrt(x2)``.

  Every method prints a trace line when it is entered, so the order of the
  calls made on the transformer can be read from the cell output.
  """

  def __init__(self):
    # No hyper-parameters; only announce that the instance was created.
    print('\n>>>>>>>init() called.\n')

  def fit(self, X, y = None):
    """Announce the call and return ``self``; nothing is learned from X or y."""
    print('\n>>>>>>>fit() called.\n')
    return self

  def transform(self, X, y = None):
    """Return a copy of X whose ``x2`` column holds ``4 * sqrt(x2)``.

    The input X is left untouched because the work is done on a copy.
    """
    print('\n>>>>>>>transform() called.\n')
    transformed = X.copy()
    scaled_root = 4 * np.sqrt(transformed.x2)
    transformed.x2 = scaled_root
    return transformed

In [34]:
# Pipeline test without any transformation: the pipeline has a single
# LinearRegression step and is fitted on the raw features.


print("create pipeline 1")
pipe1 = Pipeline(steps=[
                       ('linear_model', LinearRegression())
])

print("fit pipeline 1")
pipe1.fit(train_X, train_y)

print("predict via pipeline 1")
preds1 = pipe1.predict(test_X)

print(f"\n{preds1}")  # matches the plain LinearRegression baseline predictions shown earlier
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds1))}\n")

create pipeline 1
fit pipeline 1
predict via pipeline 1

[ 97.30728538 110.16021944 123.0131535 ]
RMSE: 3.4441125292015107



In [35]:
# Pipeline test with the transformation: ExperimentalTransformer runs before
# the linear model, and its trace prints show when each method is called.
print("create pipeline 2")
pipe2 = Pipeline(steps=[
                       ('experimental_trans', ExperimentalTransformer()),    # instantiating the transformer here triggers the call to __init__
                       ('linear_model', LinearRegression())
])


print("fit pipeline 2")
pipe2.fit(train_X, train_y)

print("predict via pipeline 2")
preds2 = pipe2.predict(test_X)

print(f"\n{preds2}")  # matches the predictions of the manual 4 * sqrt(x2) model shown earlier
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds2))}\n")



create pipeline 2

>>>>>>>init() called.

fit pipeline 2

>>>>>>>fit() called.


>>>>>>>transform() called.

predict via pipeline 2

>>>>>>>transform() called.


[ 94.9049311  106.52812806 118.06620819]
RMSE: 0.36244368054867443

