# Titanic Dataset - Data Cleaning & Preprocessing
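This notebook walks through the preprocessing applied to the Titanic dataset: dropping the sparse `Cabin` column, imputing missing values, encoding categorical variables, scaling `Age` and `Fare`, and removing IQR outliers before saving the cleaned data.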

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read the raw passenger table from disk.
df = pd.read_csv('Titanic-Dataset.csv')
df.head()

# Cabin is discarded outright (too many missing values to impute sensibly).
df = df.drop('Cabin', axis=1)

# Fill gaps: numeric columns get their own median, Embarked gets its most
# frequent value.
for numeric_col in ('Age', 'Fare'):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)
most_common_port = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

# Turn categoricals into numbers: Sex becomes a 0/1 flag, Embarked is one-hot
# encoded with the first level dropped.
sex_codes = {'male':1,'female':0}
df['Sex'] = df['Sex'].map(sex_codes)
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# Standardize Age and Fare into new columns, keeping the raw values alongside.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[['Age','Fare']])
df['Age_scaled'] = scaled_values[:, 0]
df['Fare_scaled'] = scaled_values[:, 1]
df.head()

# Outlier removal using IQR
def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of *data* whose *column* value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``data[column]``
    and ``IQR = Q3 - Q1``.

    Args:
        data: DataFrame to filter. It is not modified.
        column: Name of the numeric column used to compute the fences.
        factor: Multiplier applied to the IQR when placing the fences.
            Defaults to 1.5; larger values keep more rows.

    Returns:
        A new DataFrame (an independent copy, original index preserved)
        containing only the in-range rows. Rows where *column* is NaN are
        dropped, because NaN never satisfies the range test.

    Raises:
        KeyError: If *column* is not present in *data*.
        ValueError: If *factor* is negative.
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = factor * (q3 - q1)

    in_range = values.between(q1 - spread, q3 + spread)  # inclusive on both ends
    # Copy so callers can add or modify columns on the result without
    # touching the input frame or triggering SettingWithCopyWarning.
    return data[in_range].copy()

# Drop rows whose Age or Fare falls outside the IQR fences. Fare is filtered
# second, so its fences are computed on the rows that survived the Age filter.
df = remove_outliers_iqr(df, 'Age')
df = remove_outliers_iqr(df, 'Fare').copy()  # own the data before assigning columns

# Re-fit the scaler on the retained rows. A fit made before this filtering
# includes the outliers that were just dropped, so the saved Age_scaled /
# Fare_scaled columns would not be zero-mean / unit-variance over the final
# dataset. StandardScaler cannot fit zero samples, hence the emptiness guard.
if not df.empty:
    scaler = StandardScaler()
    df[['Age_scaled', 'Fare_scaled']] = scaler.fit_transform(df[['Age', 'Fare']])
df.shape

# Save final cleaned dataset
df.to_csv('titanic_cleaned.csv', index=False)
print('Saved cleaned dataset with shape:', df.shape)