<a href="https://colab.research.google.com/github/taufiqbashori/for_references/blob/main/Scikit_Learn_Pipeline_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit Learn Pipeline Tutorial

This tutorial is taken from this video https://www.youtube.com/watch?v=xIqX1dqcNbY

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv('sample_data/california_housing_train.csv')
test_df = pd.read_csv('sample_data/california_housing_test.csv')

train_df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [4]:
X_train, y_train = train_df.to_numpy()[:,:-1], train_df.to_numpy()[:,-1]
X_test, y_test = test_df.to_numpy()[:,:-1], test_df.to_numpy()[:,-1]

X_train.shape, y_train.shape, X_test.shape, y_train.shape

((17000, 8), (17000,), (3000, 8), (17000,))

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from copy import deepcopy

std_scaler = StandardScaler().fit(X_train[:, :2])
min_max_scaler = MinMaxScaler().fit(X_train[:, 2:])

def preprocessor(X):
  A = np.copy(X)
  A[:, :2] = std_scaler.transform(X[:,:2])
  A[:, 2:] = min_max_scaler.transform(X[:,2:])
  return A

In [12]:
preprocessor(X_test)
X_test

array([[-122.05  ,   37.37  ,   27.    , ..., 1537.    ,  606.    ,
           6.6085],
       [-118.3   ,   34.26  ,   43.    , ...,  809.    ,  277.    ,
           3.599 ],
       [-117.81  ,   33.78  ,   27.    , ..., 1484.    ,  495.    ,
           5.7934],
       ...,
       [-119.7   ,   36.3   ,   10.    , ...,  693.    ,  220.    ,
           2.2895],
       [-117.12  ,   34.1   ,   40.    , ...,   46.    ,   14.    ,
           3.2708],
       [-119.63  ,   34.42  ,   42.    , ...,  753.    ,  260.    ,
           8.5608]])

In [14]:
preprocess_transformer = FunctionTransformer(preprocessor)
preprocess_transformer

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

p1 = Pipeline([('Scaler', preprocess_transformer), 
               ('Linear Regression', LinearRegression())])
p1

In [18]:
from sklearn.metrics import mean_absolute_error

def fit_and_print(p, X_train = X_train, y_train=y_train, X_test=X_test, y_test = y_test):
  p.fit(X_train, y_train)
  train_preds = p.predict(X_train)
  test_preds = p.predict(X_test)
  print ('Training error:' + str(mean_absolute_error(train_preds, y_train)))
  print ('Test error:' + str(mean_absolute_error(test_preds, y_test)))

In [19]:
fit_and_print(p1)

Training error:50795.857117863714
Test error:50352.228257942894


In [22]:
from sklearn.neighbors import KNeighborsRegressor as KNR

p2 = Pipeline([('Scaler', preprocess_transformer), 
               ('KNN Regression', KNR(n_neighbors = 7))])
fit_and_print(p2)

Training error:30045.80900840336
Test error:35865.41276190476


In [25]:
from sklearn.ensemble import RandomForestRegressor as RF

p3 = Pipeline([('Scaler', preprocess_transformer), 
               ('KNN Regression', RF(n_estimators = 10, max_depth = 7))])
fit_and_print(p3)

Training error:42033.751122728194
Test error:44968.94590536247
