# `Pipelining`:
In ml, a pipeline is a sequence of data processing steps that are chained together to automated and streamline the ml workflow. A pipeline allows you to combined multiple data preprocessing and model training steps into a single object, making it easier to organize and manage your ml code.

1. preprocessing
2. model training
3. model evaluation
4. prediction

# `Advantage`
1. Simplified workflow
2. Avoiding Data Leakage
3. Streamlined modal deployment
4. Hyperparameter Tuning

`model contain or information retain in pipeline`

# `Summary`:

Overall, pipeline are a powerful tool for managing and automating the machine learning workflow, promoting code reusability, consistency, efficiency. They help streamline the development and deployment of ml models, making it easier to iterate and expierment with different approaches.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# load the titanic datasetfrom seaborn 
titanaic_data= sns.load_dataset('titanic')

# Select features and target variables
X= titanaic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y= titanaic_data['survived']

# Select the data into train and test sets
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer for imputing missing values
numeric_features=['age', 'fare']
categorical_features= ['pclass', 'sex', 'embarked']

numeric_transformer= Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer= Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor= ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor and RandomForestClassifier
pipeline= Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make prediction on the test data
y_pred= pipeline.predict(X_test)

# Calculate the accuracy score
accuracy= accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7821229050279329


# `Hyperparameter Tuning in Pipeline:`
 in a pipeline involves optimizing the hyperparameters of the different steps in the pipeline to find the best combination that maximizes the model's performance. Here's an example of hyperparameter tuning in a pipeline and selecting the best modell ont he titanic datasets.

In [10]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# load the titanic datasetfrom seaborn 
titanaic_data= sns.load_dataset('titanic')

# Select features and target variables
X= titanaic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y= titanaic_data['survived']

# Select the data into train and test sets
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
pipeline= Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('model', RandomForestClassifier(random_state=42))
])

# Define the hyperparameters to tune (diff param in same model we tune and deal the things)
hyperparameters = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [2, 5, 10]
}

# perform GridSearchCV:
grid_search= GridSearchCV(pipeline, hyperparameters, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model
best_model= grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred= best_model.predict(X_test)

# Calculate the best hyperperameters
accuracy= accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# print the best hyperparameters
print('Best Hyperparameters:', grid_search.best_params_)

Accuracy: 0.8212290502793296
Best Hyperparameters: {'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 300}


# Select the best model in Pipeline:
To select the best model when using multiple models in a pipeline, you can use techniques like cross-validation and evaluation metrics to compare their performance. Here's an example of how to accomplish this on the titanic datasets:

In [17]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

df_titanic = sns.load_dataset('titanic')

# Select the feature and target
X = df_titanic[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = df_titanic['survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boost', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42))
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance 
for name, model in models:
    # Create a pipeline for each model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    # Perform cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)

    # Calculate mean accuracy
    mean_accuracy = scores.mean()

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = pipeline.predict(X_test)

    # Calculate the accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Print the performance metrics
    print('Model:', name)
    print('Cross Validation Accuracy:', mean_accuracy)
    print('Test Accuracy:', accuracy)
    print()

    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

# Retrieve the best model
print('Best Model:', best_model)

Model: Random Forest
Cross Validation Accuracy: 0.7991529597163399
Test Accuracy: 0.8379888268156425

Model: Gradient Boost
Cross Validation Accuracy: 0.8061952132374668
Test Accuracy: 0.7988826815642458

Model: XGBoost
Cross Validation Accuracy: 0.8034177090515119
Test Accuracy: 0.7932960893854749

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])


In [19]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

df_titanic = sns.load_dataset('titanic')

# Select the feature and target
X = df_titanic[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = df_titanic['survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boost', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('Decision tree', DecisionTreeClassifier(random_state=42)),
    ('SVC', SVC(random_state=42)),
    ('Linear Regression', LogisticRegression(random_state=42))
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance 
for name, model in models:
    # Create a pipeline for each model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    # Perform cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)

    # Calculate mean accuracy
    mean_accuracy = scores.mean()

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = pipeline.predict(X_test)

    # Calculate the accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Print the performance metrics
    print('Model:', name)
    print('Cross Validation Accuracy:', mean_accuracy)
    print('Test Accuracy:', accuracy)
    print()

    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

# Retrieve the best model
print('Best Model:', best_model)

Model: Random Forest
Cross Validation Accuracy: 0.7991529597163399
Test Accuracy: 0.8379888268156425

Model: Gradient Boost
Cross Validation Accuracy: 0.8061952132374668
Test Accuracy: 0.7988826815642458

Model: XGBoost
Cross Validation Accuracy: 0.8034177090515119
Test Accuracy: 0.7932960893854749

Model: Decision tree
Cross Validation Accuracy: 0.7865655471289273
Test Accuracy: 0.8268156424581006

Model: SVC
Cross Validation Accuracy: 0.8160248202501723
Test Accuracy: 0.8044692737430168

Model: Linear Regression
Cross Validation Accuracy: 0.7977839062346105
Test Accuracy: 0.8100558659217877

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
