# Sklearn Pipeline Example
* [Resource](https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf)

In [18]:
## Improrting data
import pandas as pd
train = pd.read_csv(r'.\dataset\train_ctrUa4K.csv')
test = pd.read_csv(r'.\dataset\test_lAUu6dG.csv')
train = train.drop('Loan_ID', axis='columns')
train.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [19]:
# Divide the data into train and test
X = train.drop('Loan_Status', axis=1)
y = train['Loan_Status']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# sanity check
print(f"X_train.shape : {X_train.shape}")
print(f"X_test.shape : {X_test.shape}")
print(f"y_train.shape : {y_train.shape}")
print(f"y_test.shape : {y_test.shape}")


X_train.shape : (491, 11)
X_test.shape : (123, 11)
y_train.shape : (491,)
y_test.shape : (123,)


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')), # impute missing values by column median
        ('scaler', StandardScaler()) # scale values to 0-1 range
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # impute missing values with 'missing'
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [22]:
# get numeric and categorical features
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
print(f"numeric_features: {numeric_features}")
categorical_features = train.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns
print(f"categorical_features: {categorical_features}")

numeric_features: Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')
categorical_features: Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')


In [23]:
# use column transformer to preprocess different columns
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [24]:
preprocessor.fit_transform(X_train)[:1]

array([[0.08391466, 0.18066742, 1.35547238, 0.28761093, 0.40776315,
        0.        , 1.        , 0.        , 0.        , 1.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 1.        , 0.        ]])

In [25]:
# Creating a random forest classifier and fitting it to data
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier())
    ]
)

rf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [26]:
print(X_train.shape)
# print(len(preds))
print(len(y_test.values))


(491, 11)
123


In [27]:
preds = rf.predict(X_train)
print(preds[:10])
# print(preds, len(preds), len(y_test), y_test)
rf.score(X_test, y_test.values)

['N' 'Y' 'Y' 'Y' 'N' 'N' 'Y' 'N' 'Y' 'Y']


0.7723577235772358

In [28]:
# Using Gridsearch for hyperparameter tuning
param_grid = { 
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth' : [4,5,6,7,8],
    'classifier__criterion' :['gini', 'entropy']
}

from sklearn.model_selection import GridSearchCV
CV = GridSearchCV(rf, param_grid, n_jobs= -1)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)    
print(CV.best_score_)

{'classifier__criterion': 'entropy', 'classifier__max_depth': 6, 'classifier__max_features': 'auto', 'classifier__n_estimators': 200}
0.8145743145743145


In [29]:
# Create a new classifer using the best parameters
# Creating a random forest classifier and fitting it to data
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(criterion='entropy', 
                                              max_depth=7,
                                              max_features='log2',
                                              n_estimators=500))
    ]
)

rf.fit(X_train, y_train)
print(f"Training Score: {rf.score(X_train, y_train)}")
print(f"Testing Score: {rf.score(X_test, y_test)}")


Training Score: 0.8472505091649695
Testing Score: 0.7886178861788617


All the above process can also be done in a different way, using make_pipeline and make_column_transformer, key difference being, we won't need to pass name of the process. More about it can be find [here](https://jorisvandenbossche.github.io/blog/2018/05/28/scikit-learn-columntransformer/).

In [30]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

preprocessor2 = make_column_transformer(
    (
        # numeric_features,
        make_pipeline(
            SimpleImputer(), # left empty, will be used in grid search
            StandardScaler()
        ),
        numeric_features
    ),
    (
        # categorical_features,
        make_pipeline(
            SimpleImputer(strategy='constant', fill_value='missing'),
            OneHotEncoder(handle_unknown='ignore')
        ),
        categorical_features
    )
)

# check one of the preprocessed output
preprocessor2.fit_transform(X_train)[:1]

array([[8.39146643e-02, 1.80667417e-01, 1.34913392e+00, 2.95122167e-01,
        3.19909943e-16, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [31]:
# Random Forest Model
rf_model = make_pipeline(
    preprocessor2,
    RandomForestClassifier() # This will clearly overfit
)
rf_model.fit(X_train, y_train)
print(f"Training Score: {rf_model.score(X_train, y_train)}")
print(f"Testing Score: {rf_model.score(X_test, y_test)}")


Training Score: 1.0
Testing Score: 0.7560975609756098


## Using GridSearch in Column Transformer

In [32]:
# create an empty parameter grid 
param_grid = {}
# Get a list of parameters that we can tune
grid_clf = GridSearchCV(rf_model, param_grid, cv=10, iid=False)
for val in grid_clf.estimator.get_params().keys():
    print(val)

memory
steps
verbose
columntransformer
randomforestclassifier
columntransformer__n_jobs
columntransformer__remainder
columntransformer__sparse_threshold
columntransformer__transformer_weights
columntransformer__transformers
columntransformer__verbose
columntransformer__pipeline-1
columntransformer__pipeline-2
columntransformer__pipeline-1__memory
columntransformer__pipeline-1__steps
columntransformer__pipeline-1__verbose
columntransformer__pipeline-1__simpleimputer
columntransformer__pipeline-1__standardscaler
columntransformer__pipeline-1__simpleimputer__add_indicator
columntransformer__pipeline-1__simpleimputer__copy
columntransformer__pipeline-1__simpleimputer__fill_value
columntransformer__pipeline-1__simpleimputer__missing_values
columntransformer__pipeline-1__simpleimputer__strategy
columntransformer__pipeline-1__simpleimputer__verbose
columntransformer__pipeline-1__standardscaler__copy
columntransformer__pipeline-1__standardscaler__with_mean
columntransformer__pipeline-1__standa

In [33]:
# create a parameter grid based on the parameters 
param_grid = {
    'columntransformer__pipeline-1__simpleimputer__strategy': ['mean', 'median'],
    'randomforestclassifier__n_estimators': [200, 500],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2'],
    'randomforestclassifier__max_depth' : [4,5,6,7,8],
    'randomforestclassifier__criterion' :['gini', 'entropy']
    }

In [48]:
grid_clf = GridSearchCV(estimator=rf_model, 
                        param_grid=param_grid, 
                        cv=3, # kfolds = 3
                        n_jobs=-1, # use all processors
                        refit=True, # fit best classifier on train data at end, default=True
                        return_train_score=True)
grid_clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                  

In [49]:
# print best model hyperparameters
print(grid_clf.best_params_)

{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 7, 'randomforestclassifier__max_features': 'log2', 'randomforestclassifier__n_estimators': 200}


In [50]:
# print parameters of best model
best_model = grid_clf.best_estimator_
print(best_model)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                             

In [51]:
# print accuracy of best model on test data
print("best model from grid search: ", best_model.score(X_test, y_test))


best model from grid search:  0.7886178861788617


In [52]:
# save the best model
import joblib
joblib.dump(best_model, 'best_model.joblib')

['best_model.joblib']

In [53]:
# load saved model
loaded_model = joblib.load('best_model.joblib')

In [54]:
# checking accuracy score via loaded model
print(loaded_model.score(X_test, y_test))

0.7886178861788617


Same as that of best_model. So, SUCCESS!