<a href="https://colab.research.google.com/github/swopnimghimire-123123/Machine-Learning-Journey/blob/main/29.1_Titanic_without_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic survival prediction without using a scikit-learn Pipeline

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the Titanic passenger data.
# NOTE(review): the path presumably points at a file uploaded to the Colab runtime — confirm.
TITANIC_CSV_PATH = "/content/titanic.csv"
df = pd.read_csv(TITANIC_CSV_PATH)

In [None]:
# Peek at three random rows of the raw frame (no random_state, so rows differ per run).
df.sample(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
786,787,1,3,"Sjoblom, Miss. Anna Sofia",female,18.0,0,0,3101265,7.4958,,S
788,789,1,3,"Dean, Master. Bertram Vere",male,1.0,1,2,C.A. 2315,20.575,,S
435,436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S


In [None]:
# Drop the columns that are not used as model features.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

In [None]:
# Confirm the dropped columns are gone by viewing three random rows.
df.sample(n=3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
2,1,3,female,26.0,0,0,7.925,S
56,1,2,female,21.0,0,0,10.5,S
34,0,1,male,28.0,1,0,82.1708,C


In [None]:
# Step 1: train/test split.
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
features = df.drop(columns=["Survived"])
target = df["Survived"]
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Three random rows of the training features.
X_train.sample(n=3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
71,3,female,16.0,5,2,46.9,S
173,3,male,21.0,0,0,7.925,S
273,1,male,37.0,0,1,29.7,C


In [None]:
# Three random entries of the training target.
Y_train.sample(n=3)

Unnamed: 0,Survived
703,0
880,1
747,1


In [None]:
# Count missing values per column to decide which features need imputation.
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [None]:
# Age has 177 missing values and Embarked has 2 in the full frame, so both
# columns are imputed (filled in) rather than dropped.
si_age = SimpleImputer()  # default strategy fills with the column mean
si_embarked = SimpleImputer(strategy="most_frequent")

# Learn the fill values from the training split only, then reuse them on
# both splits so the test rows never influence the imputation.
si_age.fit(X_train[["Age"]])
si_embarked.fit(X_train[["Embarked"]])

X_train_age = si_age.transform(X_train[["Age"]])
X_test_age = si_age.transform(X_test[["Age"]])

X_train_embarked = si_embarked.transform(X_train[["Embarked"]])
X_test_embarked = si_embarked.transform(X_test[["Embarked"]])

In [None]:
# Preview only the first few imputed Embarked values instead of dumping the
# whole array into the notebook output.
X_test_embarked[:5]

array([['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
      

In [None]:
# Look at three random rows again before encoding the categorical columns.
df.sample(n=3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
275,1,1,female,63.0,1,0,77.9583,S
803,1,3,male,0.42,0,1,8.5167,C
309,1,1,female,30.0,0,0,56.9292,C


In [None]:
# One-hot encode the two categorical features: Sex and (imputed) Embarked.
# NOTE: X_train_embarked / X_test_embarked are overwritten here with their
# encoded sparse versions, so re-running this cell requires re-running the
# imputation cell first.

ohe_sex = OneHotEncoder(handle_unknown='ignore')
ohe_embarked = OneHotEncoder(handle_unknown='ignore')

# Learn the category vocabularies from the training split only.
ohe_sex.fit(X_train[['Sex']])
ohe_embarked.fit(X_train_embarked)

X_train_sex = ohe_sex.transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

X_train_embarked = ohe_embarked.transform(X_train_embarked)
X_test_embarked = ohe_embarked.transform(X_test_embarked)

In [None]:
# Inspect the encoded Embarked matrix; printing a sparse matrix lists the
# coordinates and values of its stored entries.
print(X_train_embarked)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 712 stored elements and shape (712, 3)>
  Coords	Values
  (0, 2)	1.0
  (1, 2)	1.0
  (2, 2)	1.0
  (3, 2)	1.0
  (4, 2)	1.0
  (5, 0)	1.0
  (6, 2)	1.0
  (7, 0)	1.0
  (8, 2)	1.0
  (9, 2)	1.0
  (10, 2)	1.0
  (11, 2)	1.0
  (12, 2)	1.0
  (13, 2)	1.0
  (14, 2)	1.0
  (15, 2)	1.0
  (16, 2)	1.0
  (17, 0)	1.0
  (18, 2)	1.0
  (19, 2)	1.0
  (20, 2)	1.0
  (21, 2)	1.0
  (22, 2)	1.0
  (23, 2)	1.0
  (24, 2)	1.0
  :	:
  (687, 2)	1.0
  (688, 2)	1.0
  (689, 2)	1.0
  (690, 0)	1.0
  (691, 0)	1.0
  (692, 0)	1.0
  (693, 2)	1.0
  (694, 2)	1.0
  (695, 2)	1.0
  (696, 2)	1.0
  (697, 2)	1.0
  (698, 2)	1.0
  (699, 1)	1.0
  (700, 1)	1.0
  (701, 2)	1.0
  (702, 2)	1.0
  (703, 2)	1.0
  (704, 2)	1.0
  (705, 0)	1.0
  (706, 2)	1.0
  (707, 2)	1.0
  (708, 2)	1.0
  (709, 2)	1.0
  (710, 2)	1.0
  (711, 2)	1.0


In [None]:
# First two training rows, to recall which original columns remain.
X_train.head(n=2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


In [None]:
# Sex, Age and Embarked were transformed separately above, so remove them
# from the columns that are passed through unchanged.
handled_cols = ['Sex', 'Age', 'Embarked']
X_train_rem = X_train.drop(columns=handled_cols)
X_test_rem = X_test.drop(columns=handled_cols)

In [None]:
# Assemble the final feature matrices column-wise, in the same order for both
# splits: pass-through columns, imputed Age, one-hot Sex, one-hot Embarked.
# The encoders return sparse matrices, so they are densified before stacking.
X_train_transformed = np.concatenate(
    (X_train_rem.values, X_train_age, X_train_sex.toarray(), X_train_embarked.toarray()),
    axis=1,
)
X_test_transformed = np.concatenate(
    (X_test_rem.values, X_test_age, X_test_sex.toarray(), X_test_embarked.toarray()),
    axis=1,
)

In [None]:
# Sanity-check the (rows, columns) of the assembled test matrix.
X_test_transformed.shape

(179, 10)

In [None]:
# Fit a decision tree on the preprocessed training matrix.
# random_state is pinned so the fitted tree, and therefore the accuracy
# computed from it, is reproducible across kernel restarts.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transformed, Y_train)

In [None]:
# Predict labels for the held-out test rows and display them.
Y_pred = clf.predict(X_test_transformed)
Y_pred

array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.7877094972067039

In [None]:
import pickle

In [None]:
# Persist the fitted encoders and the classifier to disk.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises.
# NOTE(review): si_age and si_embarked are not saved here; inference on rows
# with missing Age/Embarked would presumably need them — confirm.
for pkl_name, fitted_obj in (
    ('ohe_sex.pkl', ohe_sex),
    ('ohe_embarked.pkl', ohe_embarked),
    ('clf.pkl', clf),
):
    with open(pkl_name, 'wb') as pkl_file:
        pickle.dump(fitted_obj, pkl_file)