# In [None]:
# Titanic Survival Prediction - Capstone Two Project

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the Titanic passenger data that ships as a seaborn example dataset.
# NOTE(review): seaborn is already imported as `sns` in the import section at
# the top of the file; this second import is redundant but harmless.
import seaborn as sns
df = sns.load_dataset(name='titanic')

# Show the first five rows as a quick sanity check of what was loaded
print(df.head(n=5))

# ----------------------
# Data Wrangling
# ----------------------

# Discard the columns that are not kept as model inputs
df = df.drop(columns=['deck', 'embark_town', 'alive', 'class', 'who', 'adult_male', 'alone'])

# Fill gaps: median for age, most frequent value for embarked.
# NOTE(review): both statistics are computed on the full frame, i.e. before
# the train/test split performed later in this file, so held-out rows
# contribute to the imputed values.
df = df.fillna({'age': df['age'].median(), 'embarked': df['embarked'].mode()[0]})
df = df.astype({'embarked': str})

# One-hot encode sex and embarked; drop_first removes the first level of each
# so the remaining dummy columns are not redundant with one another
df = pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)

# ----------------------
# EDA
# ----------------------
# Each seaborn call below draws on the current axes and returns it, so the
# title is set directly on the returned Axes object.

# 1. Survivor counts split by gender (hue is the `sex_male` dummy column)
sns.countplot(data=df, x='survived', hue='sex_male').set_title('Survival by Gender')
plt.show()

# 2. Passenger counts per class (pclass), split by survival outcome
sns.countplot(data=df, x='pclass', hue='survived').set_title('Survival by Passenger Class')
plt.show()

# 3. Age histogram (30 bins, with a KDE overlay), split by survival outcome
sns.histplot(data=df, x='age', hue='survived', bins=30, kde=True).set_title('Age Distribution by Survival')
plt.show()

# ----------------------
# Features & Target
# ----------------------

# Target is the `survived` column; every other column is a feature
y = df['survived']
X = df.drop(columns='survived')

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features. The scaler is applied to every column of X (the
# dummy columns included) and is fitted on the training split only, so the
# test split is transformed with the training statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------
# Models
# ----------------------

# Candidate classifiers, keyed by the display name used in the printed
# reports and in the comparison table.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

# One row per model: [name, accuracy, precision, recall, f1]
metrics = []

for name, model in models.items():
    # Train on the scaled training split, then score on the scaled test split
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    metrics.append([name, acc, prec, rec, f1])
    # The report is embedded in the f-string on purpose: passing it as a
    # second print() argument would insert the default separator space right
    # after the newline and push the report's header line one column out of
    # alignment with the rows beneath it.
    print(f"\n{name} Classification Report:\n{classification_report(y_test, y_pred)}")

# ----------------------
# Model Comparison Table
# ----------------------

# Tabulate the per-model score rows collected in `metrics`, one row per model
metrics_df = pd.DataFrame(
    data=metrics,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1'],
)

# Heading and table are emitted on consecutive lines
print("\nModel Comparison:", metrics_df, sep="\n")

# ----------------------
# Final Model (Random Forest)
# ----------------------

# Fit a fresh random forest (same seed as the one in the comparison above)
# on the scaled training split; fit() returns the estimator itself.
final_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_final = final_model.predict(X_test_scaled)

print(
    "\nFinal Model Performance (Random Forest):",
    classification_report(y_test, y_pred_final),
    sep="\n",
)

# Feature importance: label each importance with its column name from X and
# rank from most to least important
importances = final_model.feature_importances_
feature_importance = pd.Series(data=importances, index=X.columns).sort_values(ascending=False)
print("\nFeature Importance:", feature_importance, sep="\n")

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(8, 5))
sns.barplot(x=feature_importance, y=feature_importance.index).set_title('Feature Importance - Random Forest')
plt.show()
