In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler # for SelectKBest with the chi2 (Chi-Square) score function, for feature selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

## Plan:

1. Load the data.
2. Split it into train and test sets.
3. Define the numeric and categorical features.
4. Build a ColumnTransformer from per-type Pipelines.
5. Train a model.



In [2]:
# Load the Titanic passenger data and preview five randomly drawn rows.
df = pd.read_csv("titanic_dataset.csv")
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
766,767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C
408,409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21.0,0,0,312992,7.775,,S
128,129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C
472,473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33.0,1,2,C.A. 34651,27.75,,S
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S


In [3]:
# Data cleaning: remove the columns that are not used as model features.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore'
# makes the cell idempotent: re-running it once the columns are already gone
# is a no-op rather than a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [4]:
# Preview the cleaned frame to confirm the dropped columns are gone.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
# Train/test split: 'Survived' is the target and every other column is a feature.
# 20% of the rows are held out for evaluation; the fixed seed makes the split
# reproducible.
features = df.drop(columns=['Survived'])
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Peek at the training features ('Survived' is no longer among the columns).
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [7]:
# (rows, columns) of the training features.
X_train.shape

(712, 7)

In [8]:
# Peek at the training labels.
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: Survived, dtype: int64

In [9]:
# Count missing values per column; these gaps are what the imputers in the
# preprocessing step have to fill.
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [10]:
# Display the training features (pandas truncates the middle rows in the output).
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [11]:
# Column groups routed to the two preprocessing branches.
# 'Pclass' holds integers in the raw data but is handled as a category here.

numeric_features = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
categorical_features = [
    "Pclass",
    "Sex",
    "Embarked",
]

In [12]:
# Preprocessing: one ColumnTransformer routes each column group through its own
# Pipeline and concatenates the results into a single transformed dataset.
#   numeric     -> impute with the median              -> scale with MinMaxScaler
#   categorical -> impute with the most frequent value -> one-hot encode
# A ColumnTransformer entry can be a single transformer (simple cases) or a whole
# Pipeline (several chained steps); Pipelines are used here because each group
# needs two steps.
# handle_unknown='ignore' lets the encoder accept categories at predict time that
# were not present during fit instead of raising an error.

numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

In [13]:
# Final Decision Tree pipeline: raw features pass through the `preprocessor`
# ColumnTransformer, and the transformed matrix is fed to the classifier.
# The classifier is seeded so repeated runs build the same tree.

tree_classifier = DecisionTreeClassifier(random_state=42)
DT_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tree_classifier),
])

In [14]:
# Train: fit the preprocessing steps and the classifier on the training split only.

DT_pipe.fit(X_train,y_train)

In [15]:
# Display Pipeline
# Select scikit-learn's diagram representation, then end the cell with the
# pipeline itself: changing the setting alone renders nothing, the estimator has
# to be the cell's last expression for the diagram to appear.

from sklearn import set_config
set_config(display='diagram')
DT_pipe

In [16]:
# Predict the 0/1 'Survived' label for the held-out test rows with the fitted
# Decision Tree pipeline, then display the predictions.

y_pred_DT = DT_pipe.predict(X_test)
y_pred_DT

array([0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1])

In [17]:
# Accuracy of the Decision Tree pipeline: the fraction of test rows whose
# predicted label matches the true label.
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_test, y_pred=y_pred_DT)

0.7932960893854749

In [18]:
# Repeat the workflow with a RandomForestClassifier for comparison with the
# Decision Tree.
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Full pipeline.
# Pipeline does not copy the steps it is given, so passing `preprocessor` directly
# would make RF_pipe and DT_pipe share one transformer object, and fitting RF_pipe
# would refit the transformer inside the already-fitted DT_pipe. clone() gives
# RF_pipe its own unfitted copy with identical settings.
RF_pipe = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train
RF_pipe.fit(X_train, y_train)

# Predict on the held-out test rows
y_pred_RF = RF_pipe.predict(X_test)

# Accuracy score (the cell's last expression, so it is what gets displayed)
accuracy_score(y_test, y_pred_RF)

0.8156424581005587

## Exporting the Pipeline

In [19]:
# Export the fitted Random Forest pipeline (preprocessing + classifier) to disk.
# The `with` block closes the file handle as soon as the dump finishes or fails,
# instead of leaving an open handle behind.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
import pickle
with open('pickle.pkl', 'wb') as pkl_file:
    pickle.dump(RF_pipe, pkl_file)