Perform data cleaning and data transformation using Python on any
dataset.

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Read the Titanic passenger records into a DataFrame.
# The relative path means 'Titanic.csv' must sit in the current working directory.
df = pd.read_csv('Titanic.csv')

# Show a banner followed by the first five rows as a quick check of the parse.
print("--- Initial Data ---", df.head(), sep="\n")

--- Initial Data ---
   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  


In [7]:
# 1. Handle Missing Values
# Age and Fare are numeric: fill the gaps with each column's own median.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Embarked is categorical: fill the gaps with the mode (most common value).
# mode() returns an empty Series when every value is missing, and indexing
# that with [0] would raise, so the fill is skipped in that case.
if 'Embarked' in df.columns:
    embarked_modes = df['Embarked'].mode()
    if not embarked_modes.empty:
        mode_embarked = embarked_modes.iloc[0]
        df['Embarked'] = df['Embarked'].fillna(mode_embarked)

# 2. Drop Irrelevant Columns
# We remove columns that are not useful for general analysis.
# errors='ignore' keeps this cell re-runnable once the columns are gone;
# reassigning instead of inplace=True keeps the step explicit.
cols_to_drop = ['Cabin', 'PassengerId', 'Name', 'Ticket']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Per-column count of remaining missing values (all zeros means fully cleaned).
print("--- Data Cleaned (Missing Values Handled) ---")
print(df.isnull().sum())

--- Data Cleaned (Missing Values Handled) ---
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [8]:
# 1. Numerical Transformation (Mapping)
# Convert 'Sex' to numbers (0=Male, 1=Female).
# The mapping is applied only while the column is still non-numeric: running
# .map() a second time on the already-encoded 0/1 values would find no
# matching keys and silently turn the whole column into NaN.
if 'Sex' in df.columns and not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# 2. Binning (Creating Categories)
# Create Age Groups: Child (0-12), Teen (12-18), Adult (18-60), Senior (60+)
# right=False makes every interval closed on the left and open on the right:
#   [0, 12) Child, [12, 18) Teenager, [18, 60) Adult, [60, inf) Senior
# so an age of exactly 60 is a Senior (as "60+" states), 18 is an Adult and
# 0 is a Child. The open-ended last edge keeps very high ages from becoming NaN.
bins = [0, 12, 18, 60, np.inf]
labels = ['Child', 'Teenager', 'Adult', 'Senior']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

# Note: We skipped One-Hot Encoding for 'Embarked', so it remains as text (S, C, Q).

print("\n--- Final Transformed Data ---")
print(df.head())
print("\nFinal Data Types:")
print(df.dtypes)


--- Final Transformed Data ---
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked Age_Group
0         0       3    0  34.5      0      0   7.8292        Q     Adult
1         1       3    1  47.0      1      0   7.0000        S     Adult
2         0       2    0  62.0      0      0   9.6875        Q    Senior
3         0       3    0  27.0      0      0   8.6625        S     Adult
4         1       3    1  22.0      1      1  12.2875        S     Adult

Final Data Types:
Survived        int64
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked       object
Age_Group    category
dtype: object
