A **pipeline** is a sequence of interconnected processing steps, where each step passes its output to the next, used to automate and streamline complex workflows. Pipelines are commonly used in data processing, machine learning, and software development.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv("/content/train (1).csv")

# Drop columns that are not used as features.
# Reassigning instead of inplace=True keeps the statement free of in-place mutation.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Define features and target variable
X = df.drop(columns='Survived')
y = df['Survived']

# Identify categorical and numerical features from the column dtypes.
# Deriving both lists from X (instead of hard-coding ['Age', 'Fare']) ensures
# that numeric columns such as Pclass, SibSp and Parch reach the model rather
# than being silently discarded by remainder='drop' below.
categorical_features = X.select_dtypes(include="object").columns.tolist()  # expected: ['Sex', 'Embarked']
numerical_features = X.select_dtypes(include="number").columns.tolist()  # expected: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Define individual transformers for preprocessing
trf_numerical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Replace missing numeric values with the column mean
    ('scaler', MinMaxScaler())  # Rescale each numeric feature to the [0, 1] range
])

trf_categorical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace missing categories with the most frequent one
    ('one_hot_encoding', OneHotEncoder(drop='first', handle_unknown='ignore'))  # One-hot encoding
])

# Create the main ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', trf_numerical, numerical_features),
        ('cat', trf_categorical, categorical_features),
    ],
    # Only columns that are neither numeric nor object dtype would be dropped here.
    remainder='drop'
)

# Create the full pipeline with a classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))  # Fixed seed for reproducible trees
])

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data only, so that imputation and scaling
# statistics are learned without looking at the test set
pipeline.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Display accuracy on the test set
print("Accuracy on the test set:", accuracy)


Accuracy on the test set: 0.7541899441340782


In [None]:
# A bare expression on the last line of a cell is displayed as the cell's output.
pipeline

# Try the pipeline on some unseen data

In [None]:
# Describe each unseen passenger as one record; the DataFrame columns follow
# the key order of the records.
unseen_data = pd.DataFrame([
    {'Pclass': 1, 'Sex': 'female', 'Age': 28.0, 'SibSp': 0, 'Parch': 0, 'Fare': 80.0, 'Embarked': 'C'},
    {'Pclass': 3, 'Sex': 'male', 'Age': 30.0, 'SibSp': 1, 'Parch': 0, 'Fare': 15.0, 'Embarked': 'S'},
])

# Run the raw frame through the pipeline: preprocessing and classification
# happen in a single predict call.
unseen_predictions = pipeline.predict(unseen_data)

# Show the predicted labels, one per passenger
print("Predictions for unseen data:", unseen_predictions)


Predictions for unseen data: [1 0]


# Save the model

In [None]:
!pip install joblib




In [None]:
import joblib

# Serialize the whole pipeline object (preprocessor + classifier) to
# 'titanic_pipeline.joblib', a path relative to the current working directory.
# joblib.dump is the last expression in the cell, so its return value is
# shown as the cell output.
joblib.dump(pipeline, 'titanic_pipeline.joblib')


['titanic_pipeline.joblib']

In [None]:
# Load the pipeline back from the file written by joblib.dump.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code — only load files from a trusted source.
loaded_pipeline = joblib.load('titanic_pipeline.joblib')

# Predict with the reloaded pipeline on the same unseen_data frame; this
# rebinds unseen_predictions, replacing any value it held before.
unseen_predictions = loaded_pipeline.predict(unseen_data)
print("Predictions for unseen data using loaded pipeline:", unseen_predictions)


Predictions for unseen data using loaded pipeline: [1 0]
