In [1]:
import pandas as pd
import numpy as np


In [8]:
data = pd.read_csv("test_passenger.csv")

In [9]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
data.shape

(418, 12)

In [12]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [21]:
# Missing Value treatment 

In [23]:
# Fill missing Age
data['Age'] = data.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))


In [25]:
# Fill missing Fare
data['Fare'].fillna(data['Fare'].median(), inplace=True)

In [26]:
# Create Deck from Cabin
data['Deck'] = data['Cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'U')
data.drop(columns=['Cabin'], inplace=True)

In [28]:
data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Deck           0
dtype: int64

In [None]:
# not there is not missing values 

In [29]:
# checking duplicates

In [30]:
np.sum(data.duplicated())

0

In [31]:
# Doing the outlier treatment 

In [32]:
def out_trt(ref_column):
    if (ref_column.dtype =="float64") | (ref_column.dtype == "int64"):
        out_trt_column = np.where(ref_column >= ref_column.quantile(0.99), ref_column.quantile(0.99), 
        np.where(ref_column <= ref_column.quantile(0.01),ref_column.quantile(0.01),ref_column))
    else:
        out_trt_column = ref_column
    return out_trt_column

In [33]:
data.apply(lambda x : out_trt(x), axis = 0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
0,896.17,0.0,3.0,"Kelly, Mr. James",male,34.5,0.0,0.0,330911,7.8292,Q,U
1,896.17,1.0,3.0,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1.0,0.0,363272,7.0000,S,U
2,896.17,0.0,2.0,"Myles, Mr. Thomas Francis",male,62.0,0.0,0.0,240276,9.6875,Q,U
3,896.17,0.0,3.0,"Wirz, Mr. Albert",male,27.0,0.0,0.0,315154,8.6625,S,U
4,896.17,1.0,3.0,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1.0,1.0,3101298,12.2875,S,U
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1304.83,0.0,3.0,"Spector, Mr. Woolf",male,24.0,0.0,0.0,A.5. 3236,8.0500,S,U
414,1304.83,1.0,1.0,"Oliva y Ocana, Dona. Fermina",female,39.0,0.0,0.0,PC 17758,108.9000,C,C
415,1304.83,0.0,3.0,"Saether, Mr. Simon Sivertsen",male,38.5,0.0,0.0,SOTON/O.Q. 3101262,7.2500,S,U
416,1304.83,0.0,3.0,"Ware, Mr. Frederick",male,24.0,0.0,0.0,359309,8.0500,S,U


In [34]:
# Feature Engineering 

In [40]:
''' why are we going to extract the title 

The Title (Mr., Mrs., Miss., Master, etc.) can be:

A proxy for age, gender, and social status.

A categorical feature that may impact survival. For example:

Master (young boys) had a relatively high survival rate.

Mrs and Miss also had higher chances compared to Mr.


'''

' why are we going to extract the title \n\nThe Title (Mr., Mrs., Miss., Master, etc.) can be:\n\nA proxy for age, gender, and social status.\n\nA categorical feature that may impact survival. For example:\n\nMaster (young boys) had a relatively high survival rate.\n\nMrs and Miss also had higher chances compared to Mr.\n\n\n'

In [35]:
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [None]:
# SibSp: Number of siblings or spouses aboard the Titanic with the passenger
# Parch: Number of parents or children aboard the Titanic with the passenger
''' 
SibSp = 1 (e.g., traveling with spouse)

Parch = 2 (e.g., traveling with two children)

FamilySize = 1 + 2 + 1 = 4 (3 relatives + self)
'''

In [36]:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

In [37]:
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

In [38]:
data.drop(columns=['Name', 'Ticket'], inplace=True)

In [39]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,FamilySize,IsAlone
0,892,0,3,male,34.5,0,0,7.8292,Q,U,Mr,1,1
1,893,1,3,female,47.0,1,0,7.0,S,U,Mrs,2,0
2,894,0,2,male,62.0,0,0,9.6875,Q,U,Mr,1,1
3,895,0,3,male,27.0,0,0,8.6625,S,U,Mr,1,1
4,896,1,3,female,22.0,1,1,12.2875,S,U,Mrs,3,0


In [41]:
#  Normalize / Scale

In [42]:
from sklearn.preprocessing import StandardScaler




In [44]:
scaler = StandardScaler()
data[['Age', 'Fare']] = scaler.fit_transform(data[['Age', 'Fare']])

In [45]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,FamilySize,IsAlone
0,892,0,3,male,0.399451,0,0,-0.497413,Q,U,Mr,1,1
1,893,1,3,female,1.359273,1,0,-0.512278,S,U,Mrs,2,0
2,894,0,2,male,2.511059,0,0,-0.464100,Q,U,Mr,1,1
3,895,0,3,male,-0.176442,0,0,-0.482475,S,U,Mr,1,1
4,896,1,3,female,-0.560371,1,1,-0.417492,S,U,Mrs,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,male,-0.406799,0,0,-0.493455,S,U,Mr,1,1
414,1306,1,1,female,0.744987,0,0,1.314435,C,C,Dona,1,1
415,1307,0,3,male,0.706594,0,0,-0.507796,S,U,Mr,1,1
416,1308,0,3,male,-0.406799,0,0,-0.493455,S,U,Mr,1,1


In [46]:
# Lable Encoding 

In [47]:
from sklearn.preprocessing import LabelEncoder


In [48]:
for col in ['Sex', 'Embarked', 'Title', 'Deck']:
    data[col] = LabelEncoder().fit_transform(data[col])

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [60]:
from sklearn.metrics import classification_report, accuracy_score

In [50]:
X = data.drop(['PassengerId', 'Survived'], axis=1)
y = data['Survived']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [52]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [58]:
y_pred = model.predict(X_test)

In [61]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [62]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

