# 🚢 Titanic Survival – Exploratory Data Analysis

This notebook covers:
1. Data loading & overview
2. Missing value analysis
3. Survival distribution (overall and by sex)
4. Survival by passenger class & age
5. Fare distribution
6. Correlation heatmap
7. Engineered feature preview

In [None]:
import sys
import warnings

# Put the parent directory first on the import path so the `src` imports in
# later cells resolve (assumes the kernel's working directory is this
# notebook's folder -- TODO confirm).
sys.path.insert(0, '..')

# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from the plotting libraries.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Shared look for every figure in the notebook.
sns.set_theme(style='darkgrid', palette='muted')
plt.rcParams.update({'figure.dpi': 120})

## 1. Load Data

In [None]:
from src.data_loader import load_raw_data

# load_raw_data() returns the train and test frames as a pair.
train_df, test_df = load_raw_data()

# Quick size check before looking at any rows.
shape_report = 'Train: {}  |  Test: {}'.format(train_df.shape, test_df.shape)
print(shape_report)

# Last expression of the cell: rendered as the cell's rich output.
train_df.head()

In [None]:
# Print the structural summary of the training frame (assumes train_df is a
# pandas DataFrame, in which case this lists column dtypes and non-null counts).
train_df.info()

In [None]:
# Show summary statistics for the training frame as the cell output (assumes
# train_df is a pandas DataFrame; by default describe() covers numeric columns).
train_df.describe()

## 2. Missing Values

In [None]:
# Count nulls per column, then keep only the columns that actually have gaps,
# ordered from most to fewest missing entries.
missing = train_df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)

if missing.empty:
    # Drawing a bar chart from an empty Series raises inside pandas, so report
    # the (good) news in text instead of crashing the run.
    print('No missing values found in train_df.')
else:
    fig, ax = plt.subplots(figsize=(8, 4))
    missing.plot(kind='bar', ax=ax, color='steelblue')
    ax.set_title('Missing Values per Column', fontsize=13)
    ax.set_ylabel('Count')
    plt.tight_layout()
    plt.show()

## 3. Survival Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Overall survival.
# value_counts() orders its rows by frequency, while the labels and colors
# below are matched to wedges purely by position. Pinning the index order to
# [0, 1] keeps 'Not Survived'/red on class 0 and 'Survived'/green on class 1
# even if survivors outnumber non-survivors in the data being plotted.
# (Assumes Survived is coded 0 = did not survive, 1 = survived.)
survival_counts = (
    train_df['Survived']
    .value_counts()
    .reindex([0, 1], fill_value=0)
)
survival_counts.plot(kind='pie', ax=axes[0],
    labels=['Not Survived', 'Survived'], autopct='%1.1f%%',
    colors=['#e74c3c', '#2ecc71'], startangle=90)
axes[0].set_title('Overall Survival Rate')
# The pie would otherwise show the Series name as a y-axis label.
axes[0].set_ylabel('')

# By sex: the mean of the Survived column per category, drawn without error bars.
# NOTE(review): seaborn >= 0.13 deprecates passing `palette` without `hue`;
# check the installed seaborn version before upgrading.
sns.barplot(data=train_df, x='Sex', y='Survived', ax=axes[1],
            estimator=np.mean, errorbar=None, palette='Set2')
axes[1].set_title('Survival Rate by Sex')
axes[1].set_ylabel('Survival Rate')
plt.tight_layout()
plt.show()

## 4. Survival by Pclass & Age

In [None]:
# Two side-by-side panels: mean survival per passenger class (left) and the
# age histogram split by outcome (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
class_ax, age_ax = axes

sns.barplot(
    data=train_df,
    x='Pclass',
    y='Survived',
    estimator=np.mean,
    errorbar=None,
    palette='Blues_d',
    ax=class_ax,
)
class_ax.set_title('Survival Rate by Passenger Class')

# Red for Survived == 0, green for Survived == 1.
outcome_colors = {0: '#e74c3c', 1: '#2ecc71'}
sns.histplot(
    data=train_df,
    x='Age',
    hue='Survived',
    bins=30,
    kde=True,
    palette=outcome_colors,
    ax=age_ax,
)
age_ax.set_title('Age Distribution by Survival')

plt.tight_layout()
plt.show()

## 5. Fare Distribution

In [None]:
# Fare spread per passenger class, with one box per survival outcome
# (red for Survived == 0, green for Survived == 1).
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(data=train_df, x='Pclass', y='Fare', hue='Survived', ax=ax,
            palette={0: '#e74c3c', 1: '#2ecc71'})
ax.set_title('Fare Distribution by Class and Survival')
# A plain log axis cannot place a fare of 0: non-positive values are clipped
# off the axis, which truncates any box or whisker that reaches zero. symlog
# is linear inside [-linthresh, linthresh] and logarithmic beyond it, so zero
# fares stay visible while large fares are still compressed.
# NOTE(review): presumes Fare can be 0 in this data -- confirm.
ax.set_yscale('symlog', linthresh=1)
plt.tight_layout()
plt.show()

## 6. Correlation Heatmap

In [None]:
from src.feature_engineering import engineer_features
from src.preprocessing import preprocess

# Hand engineer_features a copy so that any in-place changes it makes cannot
# reach train_df, then run the result through preprocess.
engineered = engineer_features(train_df.copy())
df_fe = preprocess(engineered)

# Pairwise correlations over the numeric columns only.
numeric_cols = df_fe.select_dtypes(include=[np.number]).columns
corr = df_fe[numeric_cols].corr()

# Boolean mask covering the upper triangle and the diagonal, so each pair of
# features is drawn once and the trivial self-correlations are hidden.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr,
    mask=mask,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap', fontsize=13)
plt.tight_layout()
plt.show()

## 7. Engineered Feature Preview

In [None]:
# Mean survival per level of two columns of df_fe: FamilySize and Title.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
family_ax, title_ax = axes

sns.barplot(
    data=df_fe,
    x='FamilySize',
    y='Survived',
    estimator=np.mean,
    errorbar=None,
    palette='viridis',
    ax=family_ax,
)
family_ax.set_title('Survival Rate by Family Size')

# NOTE(review): the plot title implies Title is already encoded in df_fe, so
# the x-axis presumably shows codes rather than names -- confirm against
# preprocess().
sns.barplot(
    data=df_fe,
    x='Title',
    y='Survived',
    estimator=np.mean,
    errorbar=None,
    palette='plasma',
    ax=title_ax,
)
title_ax.set_title('Survival Rate by Title (encoded)')

plt.tight_layout()
plt.show()