# 🧼 Data Preprocessing Tutorial (Step-by-Step)
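This notebook walks you step by step through a typical data preprocessing workflow on the Titanic dataset: inspecting the data, dropping unused columns, filling in missing values, encoding categorical features, scaling numeric features, and splitting the result into training and test sets.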

In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Step 2: Load Sample Dataset
# Load seaborn's "titanic" example dataset into `df`; every later step
# reads from, or reassigns, this variable.
df = sns.load_dataset(name="titanic")

# Preview the first five rows (the bare last expression is what the cell renders).
df.head(n=5)

In [None]:
# Step 3: Understand the Data
# A notebook cell only renders the value of its LAST expression, so any
# intermediate result must be shown explicitly. Without display(), the
# describe() table would be computed and then silently discarded.
# display() is provided by the IPython/Jupyter kernel; no import is needed.

# info() prints column dtypes and non-null counts straight to stdout.
df.info()

# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
display(df.describe())

# Number of missing values per column -- this motivates the imputation in Step 5.
df.isnull().sum()

In [None]:
# Step 4: Drop Irrelevant Columns
# None of these three columns is referenced by any later step.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError,
# because the labels no longer exist.
df = df.drop(
    labels=['deck', 'embark_town', 'alive'],
    axis='columns',
)

In [None]:
# Step 5: Handle Missing Values
# Assign the filled Series back to the column rather than calling
# fillna(..., inplace=True) on df[col]. The in-place form acts on an
# intermediate object (chained assignment): it emits a FutureWarning on
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is active
# (the default from pandas 3.0).

# Numeric column: impute with the median.
age_median = df['age'].median()
df['age'] = df['age'].fillna(age_median)

# Categorical column: impute with the most frequent value.
# mode() returns a Series (ties are possible); [0] takes the first entry.
embarked_mode = df['embarked'].mode()[0]
df['embarked'] = df['embarked'].fillna(embarked_mode)

# Sanity check: both columns must now be free of missing values.
assert df[['age', 'embarked']].notna().all().all(), "imputation left missing values"

In [None]:
# Step 6: Convert Categorical to Numeric
# Label Encoding for binary features
# LabelEncoder replaces each distinct label with an integer code; the fitted
# encoder is kept in `le` so the mapping can be inspected via le.classes_.
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])

# One-Hot Encoding for multi-class features
# 'who' is encoded together with 'class' and 'embarked': in the seaborn
# Titanic dataset it is a text column (man / woman / child), and leaving it
# out would keep raw strings in the feature matrix built in Step 8.
# drop_first=True drops one indicator per feature to avoid a redundant column.
df = pd.get_dummies(df, columns=['class', 'embarked', 'who'], drop_first=True)

In [None]:
# Step 7: Feature Scaling
# Standardise the two continuous features with StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split in Step 8, so test rows contribute to the fitted
# statistics. Consider fitting on the training rows only.
numeric_cols = ['age', 'fare']

scaler = StandardScaler()
scaler.fit(df[numeric_cols])  # learn the per-column scaling statistics
df[numeric_cols] = scaler.transform(df[numeric_cols])  # overwrite with scaled values

In [None]:
# Step 8: Train/Test Split
# Separate the target column from the feature columns.
y = df['survived']
X = df.drop(columns='survived')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Optional: Save Preprocessed Data
# Write the training features and target to CSV in the working directory.
# index=False leaves the DataFrame/Series index out of the files.
# NOTE(review): only the training split is written here; X_test / y_test
# are not saved by this cell.
for csv_path, split in (('X_train.csv', X_train), ('y_train.csv', y_train)):
    split.to_csv(csv_path, index=False)