In [12]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Read the passenger table from the CSV file referenced by relative path.
df = pd.read_csv(filepath_or_buffer="titanic.csv")

# Show the first five rows as a quick check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Unnamed: 12,Unnamed: 13,Unnamed: 14,30
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,unknown,S,,,,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,,,,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,unknown,S,,,,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,,,,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,unknown,S,,,,


In [13]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1309, 16)

In [14]:
# Tally the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin             0
Embarked          2
Unnamed: 12    1309
Unnamed: 13    1309
Unnamed: 14    1309
30             1309
dtype: int64

In [15]:
# Drop the columns that are not kept for the rest of the notebook.
columns_to_drop = ['Name', 'Ticket', 'Cabin', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', '30']

# `columns=` already targets the column axis, so no `axis` argument is needed.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell re-runnable: a second execution finds the columns already gone and
# leaves the frame unchanged instead of raising KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [16]:
# Handle missing values
# For 'Age' and 'Fare', we use the median

numerical_cols = ['Age', 'Fare']

# Create the imputer here so the cell does not depend on an `imputer` object
# left over in the kernel from a cell that is no longer in the notebook;
# without this line a fresh "Restart & Run All" stops with NameError.
imputer = SimpleImputer(strategy="median")
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

# For 'Embarked', fill the gaps with the most frequent value.
# mode() returns a Series (ties are possible); [0] takes its first entry.
most_frequent_embark = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_frequent_embark)

# Re-count missing values per column to confirm the fills took effect.
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [17]:
# Feature Engineering

# Party size: SibSp + Parch + 1 (the +1 presumably counts the passenger).
df["FamilySize"] = df["SibSp"].add(df["Parch"]).add(1)

# 1 when FamilySize is exactly one, otherwise 0.
df["IsAlone"] = df["FamilySize"].eq(1).astype(int)

# Split Fare into four quantile-based buckets, labelled lowest to highest.
fare_labels = ["Low", "Medium", "High", "Very High"]
df["FareBin"] = pd.qcut(df["Fare"], q=4, labels=fare_labels)

# Age bands over right-closed intervals:
# (0, 12], (12, 18], (18, 35], (35, 60], (60, 80].
age_edges = [0, 12, 18, 35, 60, 80]
age_labels = ["Child", "Teen", "Adult", "Middle-Aged", "Senior"]
df["AgeBin"] = pd.cut(df["Age"], bins=age_edges, labels=age_labels)



In [21]:
# Write the cleaned frame out as CSV; index=False leaves the row index out of the file.
cleaned_file_path = 'titanic_cleaned.csv'
df.to_csv(path_or_buf=cleaned_file_path, index=False)


In [22]:
# Print a confirmation message that includes the output path held in cleaned_file_path.
print(f"Cleaned dataset saved to '{cleaned_file_path}'.")

Cleaned dataset saved to 'titanic_cleaned.csv'.
