# **Selecting Best Model in Pipeline**

To select the best model when using multiple models in a pipeline, you can use techniques ike cross_validation and evaluation metrics to compare their performance. Here's an example of how to accomplish this on the titanic dataset;

In [1]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# load the titan dataset
df = sns.load_dataset("titanic")

# split the data into features and target
x = df.drop("survived", axis=1)
y = df['survived']

# split data into train and test set
x_train, x_test, y_train, y_test = train_test_split(x , y, test_size=0.2, random_state=42)

# create the list of model to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
] 

best_model = None
best_accuracy = 0.0

# iterate over the models and evaluate their performance
for name, model in models:
    # create a pipeline fro each model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    # perform cross-validation
    scores = cross_val_score(pipeline, x_train, y_train, cv = 5)

    # calculate mean accuracy
    mean_accuracy = scores.mean() 

    # fit the pipeline on the training data
    pipeline.fit(x_train, y_train)

    # make predictions on the test data
    y_pred = pipeline.predict(x_test)

    # calclate the accuracy 
    accuracy = accuracy_score(y_test, y_pred)

    # print the performance metrics
    print("Model:", name)
    print('Cross_validation ', mean_accuracy )
    print('Test Accuracy', accuracy)
    print()

    #check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

#retrive the best model
print("best Model ", best_model)

Model: Random Forest
Cross_validation  1.0
Test Accuracy 1.0

Model: Gradient Boosting
Cross_validation  1.0
Test Accuracy 1.0

best Model  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
