<br>

# Sk-learn Pipeline:

<br>


# Model Without Pipeline: 


In [51]:


import pickle

import numpy as np 
import pandas as pd 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [52]:

# Read the training CSV into a DataFrame and preview five random rows.
df = pd.read_csv("dataset/train.csv")
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
102,103,0,1,"White, Mr. Richard Frasar",male,21.0,0,1,35281,77.2875,D26,S
298,299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
278,279,0,3,"Rice, Master. Eric",male,7.0,4,1,382652,29.125,,Q
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
794,795,0,3,"Dantcheff, Mr. Ristiu",male,25.0,0,0,349203,7.8958,,S


In [53]:

# These four columns are not used as model inputs in this notebook.
# Reassign instead of using inplace=True: the frame displayed by the previous
# cell is then left untouched rather than being mutated behind the reader's
# back, and the step reads as an explicit "old frame -> new frame" transform.
df = df.drop(columns=["PassengerId","Name","Ticket","Cabin"])
df.sample(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
277,0,2,male,,0,0,0.0,S
268,1,1,female,58.0,0,1,153.4625,S


In [54]:
# Hold out 30% of the rows as a test set.
# - The feature frame is built by dropping the target by name, so it does not
#   depend on "Survived" happening to be the first column.
# - random_state is pinned so the split (and every metric computed from it)
#   is identical on each Restart & Run All.
X_train,X_test,y_train,y_test = train_test_split(
    df.drop(columns=["Survived"]),
    df["Survived"],
    test_size=0.3,
    random_state=42,
)

print(f"X_train : {X_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_train : {y_train.shape}")
print(f"y_test : {y_test.shape}")


X_train : (623, 7)
X_test : (268, 7)
y_train : (623,)
y_test : (268,)


In [55]:
# Column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 57 to 539
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    623 non-null    int64  
 1   Sex       623 non-null    object 
 2   Age       514 non-null    float64
 3   SibSp     623 non-null    int64  
 4   Parch     623 non-null    int64  
 5   Fare      623 non-null    float64
 6   Embarked  621 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 38.9+ KB


In [56]:
# Number of missing values per training column.
X_train.isna().sum()

Pclass        0
Sex           0
Age         109
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

## We can't use column transformer efficiently if we don't know about sk-learn pipeline: 

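`Here, the result is not what we want (no exception is raised, but the output is wrong): `

- `In tnf2 we impute the Embarked column, but ColumnTransformer gives every transformer the original columns, not the output of the previous transformer. So tnf3 (OneHotEncoder) still receives the raw Embarked column, and the imputed Embarked values come out as a separate array column of strings instead of being one-hot encoded:`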


In [57]:

# OneHotEncoder(handle_unknown="ignore"): a category that was not seen while
# fitting (for example an empty value in "Sex" or "Embarked" that only shows
# up once the model is deployed) does not raise; it is simply ignored and
# encoded as all zeros.

# Each entry is (name, transformer, columns the transformer is applied to).
impute_age = ("tnf1", SimpleImputer(strategy="mean"), ["Age"])
impute_embarked = ("tnf2", SimpleImputer(strategy="most_frequent"), ["Embarked"])
encode_categoricals = (
    "tnf3",
    OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore'),
    ["Sex", "Embarked"],
)

# remainder="passthrough" keeps the columns that no transformer claims.
transformer = ColumnTransformer(
    transformers=[impute_age, impute_embarked, encode_categoricals],
    remainder="passthrough",
)

In [58]:
# Fit the column transformer on the training features and transform them in
# one step, then display the resulting array.
X_train_transformed = transformer.fit_transform(X_train)

X_train_transformed

array([[28.5, 'C', 1.0, ..., 0.0, 0.0, 7.2292],
       [30.135389105058366, 'S', 1.0, ..., 1.0, 0.0, 19.9667],
       [16.0, 'S', 0.0, ..., 0.0, 0.0, 86.5],
       ...,
       [40.0, 'S', 0.0, ..., 0.0, 0.0, 153.4625],
       [34.0, 'S', 1.0, ..., 1.0, 0.0, 26.0],
       [22.0, 'C', 0.0, ..., 0.0, 2.0, 49.5]], dtype=object)

---

<br>
<br>
<br>


# Do the whole process without using ColumnTransformer:


<br>
<br>
<br>

--- 


In [59]:
# Preview two random rows of the untransformed training features.
X_train.sample(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
708,1,female,22.0,0,0,151.55,S
764,3,male,16.0,0,0,7.775,S


In [60]:
# Missing values per column that the manual preprocessing has to handle.
X_train.isna().sum()

Pclass        0
Sex           0
Age         109
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [61]:
# Dtypes and non-null counts again, as the starting point for the manual steps.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 57 to 539
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    623 non-null    int64  
 1   Sex       623 non-null    object 
 2   Age       514 non-null    float64
 3   SibSp     623 non-null    int64  
 4   Parch     623 non-null    int64  
 5   Fare      623 non-null    float64
 6   Embarked  621 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 38.9+ KB



<br>

#### Impute missing values:

<br>


In [62]:
# Fill missing values one column at a time. Each imputer is fitted on the
# training split and then only applied (transform) to the test split.

# Age: replace missing values with the mean (SimpleImputer's default strategy).
si_age = SimpleImputer(strategy="mean")
X_train_age = si_age.fit_transform(X_train[["Age"]])
X_test_age = si_age.transform(X_test[["Age"]])

# Embarked: replace missing values with the most frequent value.
si_embarked = SimpleImputer(strategy="most_frequent")
X_train_Embarked = si_embarked.fit_transform(X_train[["Embarked"]])
X_test_Embarked = si_embarked.transform(X_test[["Embarked"]])



<br>

#### Apply one-hot encoding:

<br>


In [63]:

# After deploying the model: if a row arrives with a value that was not seen
# during fit (e.g. a null in "Sex"), handle_unknown="ignore" makes the encoder
# ignore it and emit all zeros instead of raising. With drop="first" that
# all-zero row looks the same as the dropped first category.
encoder_options = dict(drop="first", sparse_output=False, handle_unknown="ignore")

# Sex is encoded straight from the raw column.
one_sex = OneHotEncoder(**encoder_options)
X_train_Sex = one_sex.fit_transform(X_train[["Sex"]])
X_test_sex = one_sex.transform(X_test[["Sex"]])

# Embarked is encoded from the already-imputed arrays.
one_embarked = OneHotEncoder(**encoder_options)
X_train_one_Embarked = one_embarked.fit_transform(X_train_Embarked)
X_test_one_Embarked = one_embarked.transform(X_test_Embarked)



In [67]:

# Columns that were imputed and/or encoded separately above; everything else
# is carried over unchanged.
handled_columns = ["Sex","Embarked","Age"]

X_train_rem = X_train.drop(columns=handled_columns)
X_test_rem = X_test.drop(columns=handled_columns)



In [65]:
# Print the shape of each test-side piece; they must all share the same
# number of rows before they can be joined column-wise.
for test_part in (X_test_age, X_test_sex, X_test_one_Embarked, X_test_rem):
    print(test_part.shape)

(268, 1)
(268, 1)
(268, 2)
(268, 4)


In [68]:

# Join the separately processed pieces column-wise (axis=1). The train and
# test lists use the same order so that the columns line up.
train_parts = [X_train_age, X_train_Sex, X_train_one_Embarked, X_train_rem]
test_parts = [X_test_age, X_test_sex, X_test_one_Embarked, X_test_rem]

X_train_transformed = np.concatenate(train_parts, axis=1)
X_test_taransformed = np.concatenate(test_parts, axis=1)


In [69]:
# Train a decision tree on the manually transformed training features and
# predict on the transformed test features.
# random_state is pinned so that the fitted tree, and therefore the accuracy
# reported afterwards, is the same on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transformed,y_train)
y_prid = clf.predict(X_test_taransformed)

y_prid

array([0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1])

In [71]:
# Sanity check: there must be exactly one prediction per test label.
print(len(y_test), len(y_prid), sep="\n")

# Accuracy on the test split, as a percentage.
100 * accuracy_score(y_test, y_prid)

268
268


75.74626865671642

<br>
<br>
<br>
<br>

# Export the model:

<br>
<br>
<br>
<br>

In [72]:

# Persist the two fitted one-hot encoders and the trained classifier.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails part-way through.
#
# NOTE(review): the Embarked encoder path has no ".pkl" extension, unlike the
# other two. It is kept as-is because code that loads it may rely on the name.
# NOTE(review): si_age and si_embarked are not exported here, so whoever loads
# these files has to handle missing Age/Embarked values separately - confirm
# whether the imputers should be saved too.
artifacts = {
    "model/without_pipeline_embarked": one_embarked,
    "model/without_pipeline_sex.pkl": one_sex,
    'model/without_pipeline.pkl': clf,
}

for artifact_path, fitted_object in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(fitted_object, artifact_file)
