# Task 1: Titanic Dataset - Data Cleaning & Preprocessing
Internship: Elevate Labs

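This notebook cleans and preprocesses the Titanic dataset: it imputes missing values, one-hot encodes the categorical columns, standardizes the numerical features, and removes `Fare` outliers using the IQR rule.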

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

## Load the Dataset

In [None]:
# Read the raw Titanic CSV (expected in the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
# Preview the first rows to confirm the file parsed as expected.
df.head()

## Explore the Dataset

In [None]:
# Quick structural overview of the raw data: dimensions, dtypes, summary stats, null counts.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()
print(df.describe())
# Per-column count of missing values; this drives the imputation choices in the next section.
print(df.isnull().sum())

## Handle Missing Values

In [None]:
# Impute missing values by assigning the filled column back to the frame.
# Calling fillna(..., inplace=True) on `df['col']` is chained assignment: it operates on an
# intermediate Series, is deprecated in pandas 2.x, and does not update `df` under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())                  # numeric -> median
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])  # categorical -> most frequent value

# Drop the Cabin column entirely. errors='ignore' keeps the cell safe to re-run after the
# column has already been removed (the in-place drop raised KeyError on a second run).
df = df.drop(columns=['Cabin'], errors='ignore')

# Show the remaining null counts per column after imputation.
df.isnull().sum()

## Encode Categorical Variables

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first level of each
# column, so k categories become k-1 indicator columns.
categorical_cols = ['Sex', 'Embarked']
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
df.head()

## Standardize Numerical Features

In [None]:
# Standardize the continuous features with StandardScaler, fitted on the full frame.
# NOTE(review): the outlier-removal cell further down filters rows on the already-scaled
# Fare, so the final Age/Fare columns will not have exactly zero mean / unit variance --
# consider scaling after outlier removal.
numeric_cols = ['Age', 'Fare']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values
df[numeric_cols].head()

## Outlier Detection & Removal (using IQR)

In [None]:
# Box plot of Fare before filtering, drawn on an explicit Axes rather than pyplot's implicit one.
fig, ax = plt.subplots()
sns.boxplot(x=df['Fare'], ax=ax)
ax.set_title('Fare - Before Removing Outliers')
plt.show()

In [None]:
# Tukey's rule: keep rows whose Fare is not below Q1 - 1.5*IQR and not above Q3 + 1.5*IQR.
fare = df['Fare']
Q1, Q3 = fare.quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Written as "not below AND not above" (De Morgan of "not (below OR above)") so that rows
# with a missing Fare are retained exactly as before.
within_fences = ~(fare < lower_fence) & ~(fare > upper_fence)
df = df[within_fences]

In [None]:
# Box plot of Fare after the IQR filter, drawn on an explicit Axes for comparison with the earlier plot.
fig, ax = plt.subplots()
sns.boxplot(x=df['Fare'], ax=ax)
ax.set_title('Fare - After Removing Outliers')
plt.show()

## Save the Cleaned Dataset (Optional)

In [None]:
# Persist the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='titanic_cleaned.csv', index=False)