# 01 - Exploratory Data Analysis

## Purpose
Perform comprehensive exploratory analysis of Titanic passenger data.

## Objectives
- Analyze survival patterns across passenger demographics
- Examine feature distributions and relationships
- Identify correlations and feature interactions
- Visualize key patterns in passenger profiles
- Detect outliers and data anomalies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

# Plot defaults: seaborn grid theme first, then the default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Survival rate overview
# NOTE: despite its name, `survival_rate` holds absolute counts per outcome;
# the name is kept so that any later cell referring to it keeps working.
survival_rate = df['Survived'].value_counts()
print(f'Survival Rate: {df["Survived"].mean():.1%}')
print('\nSurvival Counts:\n', survival_rate)

# Visualize
fig, (ax_counts, ax_class) = plt.subplots(1, 2, figsize=(10, 4))

# value_counts() orders rows by frequency, not by label. Sorting by the label
# (0 = did not survive, 1 = survived) ties the positional colours to the
# outcome itself, so red is always 0 and green is always 1 regardless of
# which group happens to be larger.
survival_rate.sort_index().plot(kind='bar', color=['red', 'green'], ax=ax_counts)
ax_counts.set_title('Survival Distribution')
ax_counts.set_xlabel('Survived')
ax_counts.set_ylabel('Count')

# Mean of the 0/1 Survived flag per class is the survival rate of that class
df.groupby('Pclass')['Survived'].mean().plot(kind='bar', color='steelblue', ax=ax_class)
ax_class.set_title('Survival by Passenger Class')
ax_class.set_xlabel('Ticket Class')
ax_class.set_ylabel('Survival Rate')

# Keep the right-hand panel's y-label from overlapping the left-hand panel
plt.tight_layout()
plt.show()

In [None]:
# Gender analysis
# normalize='index' makes each row (one per Sex value) sum to 1, so the
# columns read as the share of each outcome within that gender.
gender_survival = pd.crosstab(df['Sex'], df['Survived'], normalize='index')
print('Survival Rate by Gender:\n', gender_survival)

# Age distribution
# Compute the 30 bin edges once from all known ages. Passing bins=30 to each
# subset separately would give the two panels different bin widths, which
# makes the histograms impossible to compare bar-for-bar.
age_bins = np.histogram_bin_edges(df['Age'].dropna(), bins=30)

# Shared axes put both panels on the same age and count scales
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True, sharey=True)
panels = [
    (0, 'Did not survive', 'Age Distribution - Non-Survivors'),
    (1, 'Survived', 'Age Distribution - Survivors'),
]
for ax, (outcome, label, title) in zip(axes, panels):
    # Series.hist drops missing ages before binning
    df.loc[df['Survived'] == outcome, 'Age'].hist(
        bins=age_bins, ax=ax, alpha=0.7, label=label
    )
    ax.set_title(title)
    ax.set_xlabel('Age')
    ax.legend()  # the label= passed above is only shown once a legend is drawn
axes[0].set_ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Restrict to numeric columns, then build the pairwise correlation matrix
numeric_cols = df.select_dtypes(include='number').columns
correlation = df.loc[:, numeric_cols].corr()

# Print each numeric feature's correlation with Survived, strongest positive first
print(
    'Correlation with Survival:\n',
    correlation.loc[:, 'Survived'].sort_values(ascending=False),
    sep='\n',
)