# Titanic Survival Prediction - Exploratory Data Analysis

This notebook explores the Titanic dataset to understand patterns in passenger survival.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide plot defaults: seaborn grid style and default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Load Data

In [None]:
# Read the train and test CSVs from the sibling data directory
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Report (rows, columns) for each split
n_train_rows, n_train_cols = train_df.shape
n_test_rows, n_test_cols = test_df.shape
print(f"Training set: {n_train_rows} passengers, {n_train_cols} features")
print(f"Test set: {n_test_rows} passengers, {n_test_cols} features")

In [None]:
# Preview the first five rows of the training frame (rendered as the cell output)
train_df.head(n=5)

## 2. Data Overview

In [None]:
# Column info and data types
# info() prints its summary (column names, non-null counts, dtypes) rather than
# returning a value, so this cell shows text output instead of a rendered table.
train_df.info()

In [None]:
# Basic statistics
# With default arguments describe() summarises the numeric columns only;
# text columns are left out of this table.
train_df.describe()

## 3. Missing Values Analysis

In [None]:
# Null count per column, and the same expressed as a percentage of all rows.
# `missing` is reused by the plotting cell that follows.
missing = train_df.isna().sum()
missing_pct = missing.div(len(train_df)).mul(100)

missing_df = pd.concat(
    [missing, missing_pct],
    axis=1,
    keys=['Missing Count', 'Missing %'],
)

# Show only columns that have nulls, worst first
has_nulls = missing_df['Missing Count'].gt(0)
missing_df.loc[has_nulls].sort_values(by='Missing %', ascending=False)

In [None]:
# Bar chart of null counts, restricted to columns with at least one null.
# Relies on `missing` computed in the previous cell.
cols_with_missing = missing.index[missing > 0].tolist()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=cols_with_missing, y=missing[cols_with_missing], ax=ax)
ax.set(title='Missing Values by Column', xlabel='Column', ylabel='Count')
fig.tight_layout()
plt.show()

## 4. Survival Analysis

In [None]:
# Fraction of passengers who survived (mean of the 0/1 Survived column)
survival_rate = train_df['Survived'].mean()
print(f"Overall survival rate: {survival_rate:.2%}")

# Passenger count per Survived value
# NOTE(review): seaborn 0.13+ warns when `palette` is passed without `hue`;
# confirm the target seaborn version before changing this call.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=train_df, x='Survived', palette='viridis', ax=ax)
ax.set(
    title='Survival Distribution',
    xlabel='Survived (0 = No, 1 = Yes)',
    ylabel='Count',
)

# Print each bar's height just above its top edge, centred on the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_height)}', (bar_center, bar_height),
                ha='center', va='bottom')
fig.tight_layout()
plt.show()

## 5. Survival by Sex

In [None]:
# Survival by sex: mean of the 0/1 Survived column within each Sex group
survival_by_sex = train_df.groupby('Sex')['Survived'].mean()
print("Survival rate by Sex:")
print(survival_by_sex)

# Explicit axes (instead of the pyplot state machine) so each panel is
# addressed directly.
fig, (ax_count, ax_rate) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: raw passenger counts split by survival outcome
sns.countplot(data=train_df, x='Sex', hue='Survived', palette='viridis', ax=ax_count)
ax_count.set_title('Survival Count by Sex')
# Relabel the legend using the handles seaborn registered for each hue level.
# Passing only labels=[...] makes matplotlib pair the labels with the first
# artists on the axes (two bars of the same hue level), so both legend entries
# would be drawn in the same colour.
handles, hue_labels = ax_count.get_legend_handles_labels()
survived_names = {'0': 'No', '1': 'Yes'}
ax_count.legend(
    handles=handles,
    labels=[survived_names.get(str(label), label) for label in hue_labels],
    title='Survived',
)

# Right panel: survival rate per group, on a fixed 0-1 scale
sns.barplot(x=survival_by_sex.index, y=survival_by_sex.values, palette='viridis', ax=ax_rate)
ax_rate.set_title('Survival Rate by Sex')
ax_rate.set_ylabel('Survival Rate')
ax_rate.set_ylim(0, 1)

fig.tight_layout()
plt.show()

## 6. Survival by Passenger Class

In [None]:
# Survival by passenger class: mean of the 0/1 Survived column within each Pclass
survival_by_class = train_df.groupby('Pclass')['Survived'].mean()
print("Survival rate by Pclass:")
print(survival_by_class)

# Explicit axes (instead of the pyplot state machine) so each panel is
# addressed directly.
fig, (ax_count, ax_rate) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: raw passenger counts split by survival outcome
sns.countplot(data=train_df, x='Pclass', hue='Survived', palette='viridis', ax=ax_count)
ax_count.set_title('Survival Count by Passenger Class')
# Relabel the legend using the handles seaborn registered for each hue level.
# Passing only labels=[...] makes matplotlib pair the labels with the first
# artists on the axes (bars of the same hue level), so both legend entries
# would be drawn in the same colour.
handles, hue_labels = ax_count.get_legend_handles_labels()
survived_names = {'0': 'No', '1': 'Yes'}
ax_count.legend(
    handles=handles,
    labels=[survived_names.get(str(label), label) for label in hue_labels],
    title='Survived',
)

# Right panel: survival rate per class, on a fixed 0-1 scale
sns.barplot(x=survival_by_class.index, y=survival_by_class.values, palette='viridis', ax=ax_rate)
ax_rate.set_title('Survival Rate by Passenger Class')
ax_rate.set_ylabel('Survival Rate')
ax_rate.set_xlabel('Pclass (1=1st, 2=2nd, 3=3rd)')
ax_rate.set_ylim(0, 1)

fig.tight_layout()
plt.show()

## 7. Age Distribution

In [None]:
# Age histograms with KDE overlay: all passengers (left), split by Survived (right)
fig, (ax_all, ax_split) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(data=train_df, x='Age', bins=30, kde=True, ax=ax_all)
ax_all.set(title='Age Distribution', xlabel='Age')

sns.histplot(
    data=train_df, x='Age', hue='Survived',
    bins=30, kde=True, palette='viridis', ax=ax_split,
)
ax_split.set(title='Age Distribution by Survival', xlabel='Age')

fig.tight_layout()
plt.show()

In [None]:
# Bucket Age into labelled bands and store them as a new column on train_df.
# pd.cut is called with its defaults, so each band is right-closed: (0, 12], (12, 18], ...
age_edges = [0, 12, 18, 35, 60, 100]
age_labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
train_df['AgeGroup'] = pd.cut(train_df['Age'], bins=age_edges, labels=age_labels)

# Mean of Survived per band (seaborn's barplot aggregates y within each x level)
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=train_df, x='AgeGroup', y='Survived', palette='viridis', ax=ax)
ax.set(title='Survival Rate by Age Group', ylabel='Survival Rate')
fig.tight_layout()
plt.show()

## 8. Fare Distribution

In [None]:
# Fare distribution (left) and fare split by survival outcome (right).
# Explicit axes are used so each panel is labelled directly.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(data=train_df, x='Fare', bins=50, kde=True, ax=ax_hist)
ax_hist.set_title('Fare Distribution')
# Titanic ticket prices were recorded in pounds sterling, not US dollars,
# so the unit on the axis is corrected from '$' to '£'.
ax_hist.set_xlabel('Fare (£)')

sns.boxplot(data=train_df, x='Survived', y='Fare', palette='viridis', ax=ax_box)
ax_box.set_title('Fare by Survival Status')
ax_box.set_xlabel('Survived (0 = No, 1 = Yes)')
# Give the fare axis the same unit as the histogram
ax_box.set_ylabel('Fare (£)')

fig.tight_layout()
plt.show()

## 9. Embarked Port Analysis

In [None]:
# Survival by embarkation port: mean of the 0/1 Survived column per Embarked value
# (groupby leaves out rows whose Embarked is missing)
survival_by_embarked = train_df.groupby('Embarked')['Survived'].mean()
print("Survival rate by Embarked:")
print(survival_by_embarked)

# Explicit axes (instead of the pyplot state machine) so each panel is
# addressed directly.
fig, (ax_count, ax_rate) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: raw passenger counts split by survival outcome
sns.countplot(data=train_df, x='Embarked', hue='Survived', palette='viridis', ax=ax_count)
ax_count.set_title('Survival Count by Embarkation Port')
ax_count.set_xlabel('Port (C=Cherbourg, Q=Queenstown, S=Southampton)')
# Relabel the legend using the handles seaborn registered for each hue level.
# Passing only labels=[...] makes matplotlib pair the labels with the first
# artists on the axes (bars of the same hue level), so both legend entries
# would be drawn in the same colour.
handles, hue_labels = ax_count.get_legend_handles_labels()
survived_names = {'0': 'No', '1': 'Yes'}
ax_count.legend(
    handles=handles,
    labels=[survived_names.get(str(label), label) for label in hue_labels],
    title='Survived',
)

# Right panel: survival rate per port, on a fixed 0-1 scale
sns.barplot(x=survival_by_embarked.index, y=survival_by_embarked.values, palette='viridis', ax=ax_rate)
ax_rate.set_title('Survival Rate by Embarkation Port')
ax_rate.set_ylabel('Survival Rate')
ax_rate.set_ylim(0, 1)

fig.tight_layout()
plt.show()

## 10. Family Size Impact

In [None]:
# FamilySize = SibSp + Parch + 1, stored on train_df because the correlation
# cell reads this column (the +1 presumably counts the passenger — confirm).
train_df['FamilySize'] = train_df['SibSp'].add(train_df['Parch']).add(1)

# Mean of Survived for each family size
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=train_df, x='FamilySize', y='Survived', palette='viridis', ax=ax)
ax.set(
    title='Survival Rate by Family Size',
    xlabel='Family Size',
    ylabel='Survival Rate',
)
fig.tight_layout()
plt.show()

## 11. Correlation Analysis

In [None]:
# Work on a separate frame with Sex encoded as 0/1 so it can enter the
# correlation matrix; train_df itself is left untouched.
numeric_df = train_df.assign(Sex=train_df['Sex'].map({'male': 0, 'female': 1}))

# Columns to correlate (FamilySize comes from the family-size cell above)
corr_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
corr_matrix = numeric_df.loc[:, corr_cols].corr()

# Annotated heatmap with a diverging colormap centred on zero
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True, fmt='.2f',
    cmap='RdBu_r', center=0,
    square=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
plt.show()

## 12. Key Insights Summary

Based on the EDA, here are the key findings:

1. **Overall Survival Rate**: About 38% of passengers survived.

2. **Sex**: Women had a much higher survival rate (~74%) than men (~19%), which is consistent with the "women and children first" evacuation protocol.

3. **Passenger Class**: First-class passengers had the highest survival rate (~63%), followed by 2nd class (~47%), and 3rd class (~24%).

4. **Age**: Children (age 12 and under) had higher survival rates than the other age groups. There is also a slight overall trend of younger passengers surviving more often.

5. **Fare**: Higher fare (indicating higher class) correlates with better survival.

6. **Family Size**: Medium-sized families (2-4 members) had better survival rates than solo travelers or very large families.

7. **Embarked**: Passengers from Cherbourg (C) had the highest survival rate, possibly correlated with class distribution.

8. **Missing Data**: In the training set, Age (~20%), Cabin (~77%), and Embarked (~0.2%) have missing values that must be handled (e.g., imputed or dropped) before modeling.