PIPELINE

In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer

In [2]:
# Training split of the red-wine ratings data; preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV -- confirm, and consider index_col=0.
df_wine = pd.read_csv(filepath_or_buffer='data/Red.csv')
df_wine.head(n=5)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016


In [3]:
# Evaluation split, same column layout as the training file.
# NOTE(review): the rows previewed here also appear in the training preview
# (e.g. "Lirac 2017") -- verify that this file is a genuine hold-out set,
# otherwise the test RMSE reported below is optimistic.
df_wine_test = pd.read_csv(filepath_or_buffer='data/Red_test.csv')
df_wine_test.head(n=5)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
1,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
2,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
3,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016
4,Capatosta 2015,Italy,Toscana,Poggio Argentiera,3.8,101,19.9,2015


In [5]:
# Per-column preprocessing, declared as (transformer, columns) pairs:
#   Region  -> OrdinalEncoder
#   Price   -> StandardScaler
#   Country -> OneHotEncoder
# NOTE(review): both encoders are left at their default handling of categories
# not seen during fit -- confirm that predict-time data cannot contain a new
# Region/Country, or configure `handle_unknown` explicitly.
column_steps = [
    (OrdinalEncoder(), ['Region']),
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country']),
]
ct = make_column_transformer(*column_steps)
print(ct)

ColumnTransformer(transformers=[('ordinalencoder', OrdinalEncoder(),
                                 ['Region']),
                                ('standardscaler', StandardScaler(), ['Price']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Country'])])


In [6]:
# Baseline model: column preprocessing followed by a seeded random forest.
pipeline = Pipeline(
    steps=[
        ('ct', ct),
        ('rf', RandomForestRegressor(random_state=42)),
    ]
)

In [8]:
# Training inputs (the three columns consumed by `ct`) and the target rating.
X_train = df_wine.loc[:, ['Region', 'Country', 'Price']]
y_train = df_wine.loc[:, 'Rating']

In [9]:
# Test inputs and target, selected with the same columns and order as training.
X_test = df_wine_test.loc[:, ['Region', 'Country', 'Price']]
y_test = df_wine_test.loc[:, 'Rating']

In [10]:
# Fit the preprocessing + random-forest pipeline on the training split and
# score its predictions on the test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form yields the same value on every version.
print('RMSE test: {:.4f}'.format(np.sqrt(mean_squared_error(y_test, y_pred))))

RMSE test: 0.0765


In [12]:
# Set the forest size to 200 trees. The "rf__" prefix routes the parameter to
# the pipeline step named 'rf'; the pipeline must be refit for it to take effect.
pipeline.set_params(**{'rf__n_estimators': 200})

In [15]:
# Inspect every parameter of the pipeline, including the nested
# "<step>__<param>" entries of its steps.
pipeline.get_params(deep=True)

{'memory': None,
 'steps': [('ct',
   ColumnTransformer(transformers=[('ordinalencoder', OrdinalEncoder(),
                                    ['Region']),
                                   ('standardscaler', StandardScaler(), ['Price']),
                                   ('onehotencoder', OneHotEncoder(),
                                    ['Country'])])),
  ('rf', RandomForestRegressor(n_estimators=200, random_state=42))],
 'verbose': False,
 'ct': ColumnTransformer(transformers=[('ordinalencoder', OrdinalEncoder(),
                                  ['Region']),
                                 ('standardscaler', StandardScaler(), ['Price']),
                                 ('onehotencoder', OneHotEncoder(),
                                  ['Country'])]),
 'rf': RandomForestRegressor(n_estimators=200, random_state=42),
 'ct__n_jobs': None,
 'ct__remainder': 'drop',
 'ct__sparse_threshold': 0.3,
 'ct__transformer_weights': None,
 'ct__transformers': [('ordinalencoder', OrdinalEn

In [13]:
# Refit the pipeline after changing the forest's n_estimators and re-score it
# on the test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form yields the same value on every version.
print('RMSE test: {:.4f}'.format(np.sqrt(mean_squared_error(y_test, y_pred))))

RMSE test: 0.0761


In [17]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor

In [19]:
# Base learners for the stack, as a list of (model name, model) tuples.
estimators = [('rr', RidgeCV()), ('dt', DecisionTreeRegressor(random_state=42))]

In [24]:
# Build the stacking regressor:
#   estimators      -- the base models of the stack
#   final_estimator -- the meta-model, a 10-tree random forest with a fixed
#                      random-number-generator seed
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10, random_state=42),
)

In [25]:
# Second model: the same column preprocessing feeding the stacking regressor.
# NOTE(review): `ct` is the very object already used inside `pipeline`, so
# fitting one pipeline presumably refits the transformer seen by the other --
# harmless while both are fit on the same X_train, but confirm if that changes.
pipeline_2 = Pipeline(
    steps=[
        ('ct', ct),
        ('reg', reg),
    ]
)

In [26]:
# Fit the stacking pipeline on the training split and score its predictions
# on the test split.
pipeline_2.fit(X_train, y_train)
y_pred_2 = pipeline_2.predict(X_test)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form yields the same value on every version.
print('RMSE test: {:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_2))))

RMSE test: 0.18
