In [2]:
import numpy as np
import pandas as pd

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report

In [4]:
# Read the raw passenger table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='tested.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Remove identifier / free-text columns that are not used as model features.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError for the
# columns that were already removed.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [6]:
# Number of missing values in each column of the full table.
df.isna().sum(axis=0)

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [7]:
# Feature matrix: every column except the target.
X = df.drop(columns=['Survived'])
X.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [8]:
# Target vector.
y = df.loc[:, 'Survived']
y.head(n=5)

0    0
1    1
2    0
3    0
4    1
Name: Survived, dtype: int64

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
336,2,male,32.0,0,0,13.0000,S
31,2,male,24.0,2,0,31.5000,S
84,2,male,,0,0,10.7083,Q
287,1,male,24.0,1,0,82.2667,S
317,2,male,19.0,0,0,10.5000,S
...,...,...,...,...,...,...,...
71,3,male,21.0,0,0,7.8958,S
106,3,male,21.0,0,0,7.8208,Q
270,1,male,46.0,0,0,75.2417,C
348,2,male,24.0,0,0,13.5000,S


In [10]:
# Number of missing values in each column of the training split.
X_train.isna().sum(axis=0)

Pclass       0
Sex          0
Age         72
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [11]:
# Imputation step: fill missing values in positional columns 2 and 5 of X
# (Age and Fare) using SimpleImputer's default strategy. The remaining columns
# pass through unchanged and are appended after the imputed ones.
trf1 = ColumnTransformer(
    transformers=[('si_age_fare', SimpleImputer(), [2, 5])],
    remainder='passthrough',
)

In [12]:
# One-hot encoding step.
# trf1 emits its imputed columns first and appends the passthrough columns
# afterwards, so the array reaching this step is ordered
# [Age, Fare, Pclass, Sex, SibSp, Parch, Embarked]: Sex sits at position 3 and
# Embarked at position 6. (Positions [1, 6] would encode Fare instead of Sex
# and leave the raw Sex strings in the data.)
trf2 = ColumnTransformer(
    transformers=[
        (
            'ohe_sex_embarked',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            [3, 6],
        ),
    ],
    remainder='passthrough',
)

In [13]:
# Scaling step: min-max scale the columns at positions 0-9 of the incoming
# array; any column beyond that range passes through untouched.
trf3 = ColumnTransformer(
    transformers=[('scale', MinMaxScaler(), slice(0, 10))],
    remainder='passthrough',
)

In [14]:
# Feature-selection step: keep the 8 features with the highest chi-squared score.
trf4 = SelectKBest(chi2, k=8)

In [15]:
# Final estimator. A fixed random_state makes the fitted tree reproducible:
# scikit-learn permutes the candidate features randomly at every split, so an
# unseeded tree can differ from run to run.
trf5 = DecisionTreeClassifier(random_state=42)

#### Create pipeline

In [16]:
# Chain the five steps under explicit names.
pipe = Pipeline(
    steps=[
        ('trf1', trf1),
        ('trf2', trf2),
        ('trf3', trf3),
        ('trf4', trf4),
        ('trf5', trf5),
    ]
)

#### Pipeline Vs make_pipeline

`Pipeline` requires an explicit name for every step; `make_pipeline` does not — it generates the step names automatically from the estimator class names.

(Same applies to ColumnTransformer vs make_column_transformer)

In [17]:
# Alternate syntax: make_pipeline chains the same steps and names them automatically.
pipe = make_pipeline(
    trf1,
    trf2,
    trf3,
    trf4,
    trf5,
)

In [18]:
# Display the pipeline built with make_pipeline.
pipe

In [19]:
# Single-stage alternative: all three transformers live in one ColumnTransformer.
# Transformers inside one ColumnTransformer are NOT chained - each one selects
# its columns from the original X. The scaler therefore lists only the numeric
# columns not handled elsewhere: Pclass (0), SibSp (3) and Parch (4).
# slice(0, 10) would also hand it the string columns Sex and Embarked, which
# MinMaxScaler cannot process.
trf = ColumnTransformer(
    transformers=[
        ('si_age_fare', SimpleImputer(), [2, 5]),
        ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6]),
        ('scale', MinMaxScaler(), [0, 3, 4]),
    ],
    remainder='passthrough',
)

In [20]:
# Display the combined column transformer.
trf

In [23]:
# Re-create the train/test split so this cell only needs X and y to be defined.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column-wise preprocessing. Each transformer selects its columns from the
# original X by position; the three transformers are applied side by side,
# not one after another.
preprocessor = ColumnTransformer(
    transformers=[
        ('si_age_fare', SimpleImputer(), [2, 5]),  # Impute missing values in Age and Fare
        ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6]),  # One-hot encode Sex and Embarked
        ('scale', MinMaxScaler(), [0, 3, 4])  # Scale Pclass, SibSp, Parch
    ], remainder='passthrough'  # All seven columns are listed above; this only matters if X gains columns
)

# Preprocess -> keep the 8 best features by chi-squared score -> decision tree.
# NOTE: Age and Fare are imputed but not scaled here; chi2 only requires
# non-negative inputs, so this still fits.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=chi2, k=8)),
    # random_state pins the tree so repeated runs produce the same model
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Fit on the training rows, then predict the held-out rows.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Show the raw predicted labels.
print(y_pred)


[0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 1 0 1 0 1
 0 1 0 1 1 0 0 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1
 0 1 0 1 1 0 0 1 0 1]


In [24]:
# Display the fitted pipeline.
pipe

In [32]:
# Score the pipeline on the held-out test rows.
# NOTE(review): a perfect score on unseen rows deserves a check for a feature
# that encodes the label - in the previewed rows Survived coincides with
# Sex == 'female'; confirm against the full data.
y_pred = pipe.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [34]:
# Score the pipeline on the rows it was trained on, for comparison with the test report.
y_pred_train = pipe.predict(X_train)
print(classification_report(y_true=y_train, y_pred=y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       216
           1       1.00      1.00      1.00       118

    accuracy                           1.00       334
   macro avg       1.00      1.00      1.00       334
weighted avg       1.00      1.00      1.00       334



In [35]:
import pickle

In [38]:
# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed even if pickling raises, instead of leaving it open.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)