# Titanic EDA (Exploratory Data Analysis)

*Goal:* Explore who survived the Titanic disaster and why, using the Kaggle Titanic dataset.

*We will:*
1. Load and preview the data
2. Clean and prepare (missing values, data types, new features)
3. Explore patterns (univariate and bivariate analysis)
4. Visualize insights
5. Summarize findings

*Dataset:* `data/train.csv` (Kaggle Titanic), loaded below as `../data/train.csv` because the notebook is expected to live in `notebooks/`.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Apply seaborn's default theme to the plots drawn below.
sns.set_theme()
# Ask the inline plotting backend to render figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [None]:
# The path is relative to the notebook's location (expects the notebook in notebooks/).
train_csv = '../data/train.csv'
df = pd.read_csv(train_csv)

# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Column dtypes and non-null counts (df.info() prints its own report).
df.info()

# Missing values per column, largest first. Printed explicitly because a
# notebook only renders the value of a cell's *last* expression; a bare
# expression in the middle of a cell is evaluated and silently discarded.
print(df.isna().sum().sort_values(ascending=False))

# Summary statistics for every column (numeric and non-numeric), transposed so
# each original column becomes a row. Kept as the final expression so it is
# rendered as a table.
df.describe(include='all').T


In [None]:
# Age: fill gaps with the median age (only touched when something is missing).
if df['Age'].isna().any():
    median_age = df['Age'].median()
    df['Age'] = df['Age'].fillna(median_age)

# Embarked: fill gaps with the most frequent port.
if df['Embarked'].isna().any():
    most_common_port = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(most_common_port)

# Cabin: keep only a 0/1 "cabin was recorded" flag, then drop the raw column.
if 'Cabin' in df:
    cabin_known = df['Cabin'].notna()
    df['HasCabin'] = cabin_known.astype(int)
    df = df.drop(columns='Cabin')

# Confirm what is still missing after the clean-up.
remaining_missing = df.isna().sum()
print(remaining_missing)


In [None]:
# Store the discrete descriptors as pandas categoricals.
for column in ('Pclass', 'Sex', 'Embarked'):
    df[column] = df[column].astype('category')


In [None]:
# Family aboard = siblings/spouses + parents/children + the passenger.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Names look like "Surname, Title. Given names": capture the honorific between
# the comma and the first period. The optional leading "the " is skipped so
# that "the Countess" is extracted as "Countess" and is therefore caught by the
# rare-title list below instead of surviving as its own category.
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame.
df['Title'] = df['Name'].str.extract(r',\s*(?:the\s+)?([^.]+?)\s*\.', expand=False)

# Fold French/abbreviated variants into their common English equivalents.
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Group the infrequent honorifics into a single "Rare" bucket.
rare = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
df['Title'] = df['Title'].replace({t: 'Rare' for t in rare})
df['Title'] = df['Title'].astype('category')

# Fare divided by the family size computed above.
df['FarePerPerson'] = df['Fare'] / df['FamilySize']

# Preview the engineered columns next to their inputs.
df[['Name', 'Title', 'FamilySize', 'IsAlone', 'Fare', 'FarePerPerson']].head()


In [None]:
# Mean of the 0/1 Survived column, expressed as a percentage.
survival_rate = 100 * df['Survived'].mean()
print(f"Overall survival rate: {survival_rate:.2f}%")

# Passenger counts per sex and per class, shown together as a pair.
sex_counts = df['Sex'].value_counts()
class_counts = df['Pclass'].value_counts()
(sex_counts, class_counts)


In [None]:
# One 30-bin histogram per numeric column of interest.
for column in ('Age', 'Fare'):
    values = df[column].dropna()
    plt.figure(figsize=(6, 4))
    plt.hist(values, bins=30)
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()


In [None]:
# Count plot for each categorical column, paired with its figure title.
count_plots = [
    ('Pclass', 'Passenger Class Counts'),
    ('Sex', 'Sex Counts'),
    ('Embarked', 'Embarkation Port Counts'),
]
for column, heading in count_plots:
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=column)
    plt.title(heading)
    plt.show()


In [None]:
# Row-normalised crosstab: percentage of each sex that did / did not survive.
pct_by_sex = pd.crosstab(df['Sex'], df['Survived'], normalize='index') * 100
print(pct_by_sex)

# Bar height is the mean of the 0/1 Survived column within each sex.
plt.figure(figsize=(6, 4))
sns.barplot(data=df, x='Sex', y='Survived', estimator=np.mean)
plt.title('Survival Rate by Sex')
plt.ylabel('Survival Rate')
plt.show()

In [None]:
# Bar height is the mean of the 0/1 Survived column within each passenger class.
plt.figure(figsize=(6, 4))
sns.barplot(data=df, x='Pclass', y='Survived', estimator=np.mean)
plt.title('Survival Rate by Passenger Class')
plt.show()

In [None]:
# Age distribution split by survival outcome.
plt.figure(figsize=(6, 4))
sns.boxplot(data=df, x='Survived', y='Age')
plt.title('Age vs Survival')
plt.show()

# Bucket ages into labelled bands using the bin edges below, then plot the
# mean survival per band.
age_bins = [0, 12, 18, 30, 45, 60, 80]
age_labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'MiddleAge', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)

plt.figure(figsize=(8, 4))
sns.barplot(data=df, x='AgeGroup', y='Survived', estimator=np.mean)
plt.title('Survival Rate by Age Group')
plt.xticks(rotation=30)
plt.show()

In [None]:
# Mean survival for each family size.
plt.figure(figsize=(6, 4))
sns.barplot(data=df, x='FamilySize', y='Survived', estimator=np.mean)
plt.title('Survival Rate by Family Size')
plt.show()

# Mean survival for solo travellers vs. those with family; relabel the 0/1 ticks.
plt.figure(figsize=(6, 4))
sns.barplot(data=df, x='IsAlone', y='Survived', estimator=np.mean)
plt.title('Survival Rate: Alone vs Not Alone')
plt.xticks([0, 1], ['Not Alone', 'Alone'])
plt.show()

# Mean survival per title, with the bars in alphabetical order.
title_order = sorted(df['Title'].unique())
plt.figure(figsize=(8, 4))
sns.barplot(data=df, x='Title', y='Survived', estimator=np.mean, order=title_order)
plt.title('Survival Rate by Title')
plt.xticks(rotation=30)
plt.show()

In [None]:
# Grouped bars: survival by passenger class, split by sex.
grid = sns.catplot(
    data=df,
    x='Pclass',
    y='Survived',
    hue='Sex',
    kind='bar',
    height=4,
    aspect=1.4,
)
# With no row/col faceting the grid holds a single Axes; title it directly.
grid.ax.set_title('Survival by Class and Sex')
plt.show()

In [None]:
from scipy.stats import chi2_contingency, ttest_ind

# Chi-square test of independence on the Sex x Survived contingency table.
# chi2_contingency returns (statistic, p-value, dof, expected frequencies);
# only the p-value (index 1) is reported, so the rest is not unpacked.
table = pd.crosstab(df['Sex'], df['Survived'])
p = chi2_contingency(table)[1]
print('Chi-square p-value (Sex vs Survived):', p)

# Compare the ages of survivors and non-survivors. equal_var=False selects
# Welch's t-test (no equal-variance assumption); nan_policy='omit' ignores any
# missing ages instead of propagating NaN into the result.
age_surv = df.loc[df['Survived'] == 1, 'Age']
age_nsurv = df.loc[df['Survived'] == 0, 'Age']
age_test = ttest_ind(age_surv, age_nsurv, equal_var=False, nan_policy='omit')
print('T-test p-value (Age diff):', age_test.pvalue)

## Summary of Findings
- Overall survival rate: 38.38%
- Females had higher survival rates.
- 1st class passengers survived more often than 3rd class.
- Passengers traveling alone had a lower survival rate than those traveling with family.
- Children tended to survive more.
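
**Limitations:** Missing `Age` values were filled with the median, which concentrates many ages at a single value and can distort the age-group and t-test results; missing `Embarked` values were filled with the most common port; `Cabin` was reduced to a `HasCabin` flag and then dropped.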
