In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Read the training CSV and use the PassengerId column as the row index.
df = pd.read_csv(
    '../input/train.csv',
    index_col='PassengerId',
)

# Preview the first five rows.
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Names of all numeric columns in the raw frame ('number' selects the same
# dtypes as np.number).
num_col = df.select_dtypes(include='number').columns
num_col

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [4]:
print("The percentage of the values missing in the columns:")

# Count the missing cells per column, then express them as a percentage of
# the total number of rows, rounded to two decimals.
missing_count = df.isna().sum()
(missing_count / len(df) * 100).round(2)

The percentage of the values missing in the columns:


Survived     0.00
Pclass       0.00
Name         0.00
Sex          0.00
Age         19.87
SibSp        0.00
Parch        0.00
Ticket       0.00
Fare         0.00
Cabin       77.10
Embarked     0.22
dtype: float64

Since 77.10% of the values in the 'Cabin' column are missing, they cannot be imputed reliably, so I will drop this column (the 'Name' and 'Ticket' columns are dropped in the same step, as they are not used as features in this notebook).
The 'Age' column has 19.87% missing values. I will fill them with the mean of the column.
The 'Embarked' column has 0.22% missing values. I will fill them with the mode of the column.

In [5]:
# Remove the mostly-empty 'Cabin' column together with 'Name' and 'Ticket';
# rebinding `df` keeps the step explicit instead of mutating in place.
df = df.drop(columns=['Cabin', 'Name', 'Ticket'])

In [6]:
# Replacement values for the two columns that still contain gaps:
# the column mean for 'Age' and the most frequent value for 'Embarked'.
# NOTE(review): both statistics are computed on the full frame, i.e. before
# the train/validation split performed in a later cell.
fill_values = {
    'Age': df['Age'].mean(),
    'Embarked': df['Embarked'].mode()[0],
}
df = df.fillna(value=fill_values)

df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


In [7]:
print("The percentage of the values missing in the columns:")

# Re-check the share of missing cells per column (in percent, two decimals).
n_rows = len(df)
(df.isna().sum() / n_rows * 100).round(2)

The percentage of the values missing in the columns:


Survived    0.0
Pclass      0.0
Sex         0.0
Age         0.0
SibSp       0.0
Parch       0.0
Fare        0.0
Embarked    0.0
dtype: float64

In [8]:
# Separate the target column from the feature columns.
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Show the train/validation shapes, first for the features, then for the target.
for train_part, valid_part in ((X_train, X_valid), (y_train, y_valid)):
    print(train_part.shape, valid_part.shape)

(712, 7) (179, 7)
(712,) (179,)


In [10]:
# Preview the first five training rows before any scaling or encoding.
X_train.head(n=5)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
332,1,male,45.5,0,0,28.5,S
734,2,male,23.0,0,0,13.0,S
383,3,male,32.0,0,0,7.925,S
705,3,male,26.0,1,0,7.8542,S
814,3,female,6.0,4,2,31.275,S


In [11]:
# Numeric feature columns of the training frame (the target is no longer
# part of it); these are the columns scaled in the next step.
num_col = X_train.select_dtypes(include='number').columns
num_col

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

### Feature Scaling (Standardization)

In [12]:
# Learn the scaling parameters from the training rows only, then apply the
# same fitted scaler to both the training and the validation features.
scaler = StandardScaler()
scaler.fit(X_train[num_col])
for frame in (X_train, X_valid):
    frame[num_col] = scaler.transform(frame[num_col])

In [13]:
# Scaling must not change the number of rows or columns.
train_shape, valid_shape = X_train.shape, X_valid.shape
print(train_shape, valid_shape)

(712, 7) (179, 7)


In [14]:
# Percentage of missing values per column of the training features.
# The denominator is the number of rows in X_train itself (not the full
# `df`), so the result is a true percentage of the training split.
(X_train.isna().sum() / len(X_train) * 100).round(2)

Pclass      0.0
Sex         0.0
Age         0.0
SibSp       0.0
Parch       0.0
Fare        0.0
Embarked    0.0
dtype: float64

In [15]:
# Preview the first five training rows after scaling the numeric columns.
X_train.head(5)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
332,-1.614136,male,1.229207,-0.470722,-0.479342,-0.078684,S
734,-0.400551,male,-0.503505,-0.470722,-0.479342,-0.377145,S
383,0.813034,male,0.18958,-0.470722,-0.479342,-0.474867,S
705,0.813034,male,-0.272477,0.379923,-0.479342,-0.47623,S
814,0.813034,female,-1.812666,2.93186,2.048742,-0.025249,S


In [16]:
# Shape of the two categorical columns that are one-hot encoded next.
print(X_train.loc[:, ['Sex', 'Embarked']].shape)

(712, 2)


### One Hot Encoding

In [17]:
# Replace the PassengerId index of both feature frames with a fresh
# 0..n-1 index (the old index is discarded, not kept as a column).
X_train, X_valid = (
    part.reset_index(drop=True) for part in (X_train, X_valid)
)

In [18]:
# Categorical columns that are expanded into indicator columns.
cat_col = ['Sex', 'Embarked']

# drop='first' leaves out the first category of each column;
# sparse_output=False makes the encoder return a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit on the training data only and encode it. The encoded frame reuses the
# index of X_train so that the axis=1 concat below lines rows up by label
# and does not depend on the index having been reset beforehand.
encoded_train = encoder.fit_transform(X_train[cat_col])
encoded_train_df = pd.DataFrame(
    encoded_train,
    columns=encoder.get_feature_names_out(cat_col),
    index=X_train.index,
)

# Encode the validation data with the already-fitted encoder.
encoded_test = encoder.transform(X_valid[cat_col])
encoded_test_df = pd.DataFrame(
    encoded_test,
    columns=encoder.get_feature_names_out(cat_col),
    index=X_valid.index,
)

# Swap the raw categorical columns for their encoded counterparts.
X_train = pd.concat([X_train.drop(columns=cat_col), encoded_train_df], axis=1)
X_valid = pd.concat([X_valid.drop(columns=cat_col), encoded_test_df], axis=1)

In [19]:
# Preview the first five rows of the fully numeric training matrix.
X_train.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,-1.614136,1.229207,-0.470722,-0.479342,-0.078684,1.0,0.0,1.0
1,-0.400551,-0.503505,-0.470722,-0.479342,-0.377145,1.0,0.0,1.0
2,0.813034,0.18958,-0.470722,-0.479342,-0.474867,1.0,0.0,1.0
3,0.813034,-0.272477,0.379923,-0.479342,-0.47623,1.0,0.0,1.0
4,0.813034,-1.812666,2.93186,2.048742,-0.025249,0.0,0.0,1.0


## Logistic Regression

In [20]:
# Create a logistic regression classifier with default settings and train
# it on the prepared training features (fit returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Predict survival for the held-out validation rows.
y_valid_pred = model.predict(X_valid)

# Fraction of validation rows whose prediction matches the true label.
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print("The accuracy of the model is:", valid_accuracy)

The accuracy of the model is: 0.8100558659217877
