# In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

# Load the Titanic dataset through seaborn's dataset loader
titanic = sns.load_dataset('titanic')

# Handle missing values: median for age, most frequent value for embarked.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection: that chained form is deprecated
# in recent pandas and, with Copy-on-Write enabled, only modifies a temporary
# object and leaves the NaNs in the DataFrame.
titanic['age'] = titanic['age'].fillna(titanic['age'].median())
titanic['embarked'] = titanic['embarked'].fillna(titanic['embarked'].mode()[0])

# Drop columns that are not used as features
titanic.drop(columns=['deck', 'embark_town', 'alive'], inplace=True)

# Encode categorical variables as integers so the estimators can consume them
titanic['sex'] = titanic['sex'].map({'male': 0, 'female': 1})
titanic['embarked'] = titanic['embarked'].map({'C': 0, 'Q': 1, 'S': 2})
titanic['class'] = titanic['class'].map({'First': 1, 'Second': 2, 'Third': 3})

# Drop the remaining columns that are not used as features
titanic.drop(columns=['who', 'adult_male', 'alone'], inplace=True)

# Separate features and target
X = titanic.drop(columns=['survived'])
y = titanic['survived']

# Train-test split: 30% held out, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Take explicit copies before writing scaled values back. The frames returned
# by train_test_split may be flagged by pandas as derived from X, in which
# case the column assignments below emit SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Standardize numerical features. The scaler is fit on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train[['age', 'fare']] = scaler.fit_transform(X_train[['age', 'fare']])
X_test[['age', 'fare']] = scaler.transform(X_test[['age', 'fare']])

# Initialize all models.
# SVC needs probability=True so predict_proba is available for the ROC/AUC
# computations below; its probability calibration shuffles the data
# internally, so it is seeded like the tree-based models to keep the reported
# AUC (and the "best model" choice derived from it) reproducible across runs.
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True, random_state=42),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Per-model results, keyed by the same names as `models`. Each entry holds the
# test-set predictions ('y_pred'), positive-class probabilities ('y_prob'),
# and the 'accuracy' and 'auc' scores.
results = {}

# Train and evaluate all models
for name, model in models.items():
    # Fit on the training split
    model.fit(X_train, y_train)

    # Hard class predictions and the probability of class 1 on the test split
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Store results for the plots and model comparison that follow
    results[name] = {
        'y_pred': y_pred,
        'y_prob': y_prob,
        'accuracy': accuracy_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_prob)
    }

    # Print classification report
    print(f"\n{name} Report:")
    print(classification_report(y_test, y_pred))

# Overlay the ROC curve of every evaluated model on a single figure
plt.figure(figsize=(12, 8))

for model_name, metrics in results.items():
    # roc_curve also returns the thresholds, which are not needed for the plot
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, metrics['y_prob'])
    curve_label = f'{model_name} (AUC = {metrics["auc"]:.2f})'
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)

# Dashed diagonal: the reference line labelled as a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

# Titles, axis labels, legend and grid, then render
plt.title("ROC Curve Comparison")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid(True)
plt.show()

# Plot confusion matrices
def plot_confusion_matrix(y_true, y_pred, title, labels=(0, 1)):
    """Draw a confusion-matrix heatmap for a binary survival classifier.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        title: Text placed before " Confusion Matrix" in the figure title.
        labels: Class labels, in the order (not survived, survived), used to
            index the matrix. Pinning them keeps the matrix 2x2 and aligned
            with the fixed tick labels even when one class is absent from
            both ``y_true`` and ``y_pred``. Pass ``None`` to let scikit-learn
            infer the labels from the data (the previous behaviour).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(8, 6))
    # confusion_matrix expects an array-like of labels (or None to infer them)
    label_order = None if labels is None else list(labels)
    cm = confusion_matrix(y_true, y_pred, labels=label_order)
    # fmt='d' prints the cell counts as plain integers
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=["Not Survived", "Survived"],
                yticklabels=["Not Survived", "Survived"])
    plt.title(f"{title} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Draw one confusion-matrix heatmap per evaluated model
for model_name, metrics in results.items():
    plot_confusion_matrix(y_test, metrics['y_pred'], model_name)

# Feature Importance Analysis

# Select the model with the highest test-set AUC; on a tie the model that was
# inserted into `results` first is kept
best_model_name = max(results, key=lambda candidate: results[candidate]['auc'])
print(f"\nBest performing model: {best_model_name}")

# Function to get feature importance
def get_feature_importance(model, feature_names):
    """Return per-feature importance scores for a fitted model, if available.

    The model is inspected by attribute rather than by concrete class, so any
    estimator exposing ``feature_importances_`` (e.g. decision trees and
    random forests) or ``coef_`` (e.g. logistic regression) is supported.

    Args:
        model: A fitted estimator.
        feature_names: Names of the features. Currently unused; kept so
            existing callers do not need to change.

    Returns:
        ``model.feature_importances_`` when the model provides it; otherwise
        the absolute values of the first row of ``model.coef_`` when the model
        provides that; otherwise ``None``.
    """
    tree_scores = getattr(model, 'feature_importances_', None)
    if tree_scores is not None:
        return tree_scores

    coefficients = getattr(model, 'coef_', None)
    if coefficients is not None:
        # The magnitude of a coefficient is used as the importance measure;
        # the sign only gives the direction of the effect.
        return np.abs(coefficients[0])

    # No importance measure exposed by this model
    return None

# Fetch the winning estimator and ask it for per-feature scores
best_model = models[best_model_name]
importance = get_feature_importance(best_model, X.columns)

# Models without an importance measure return None and are skipped
if importance is not None:
    plt.figure(figsize=(12, 6))

    # Pair each feature with its score and order from least to most important
    # so the largest bar is drawn last on the horizontal chart
    feature_importance = pd.DataFrame(
        {'feature': X.columns, 'importance': importance}
    ).sort_values(by='importance')

    bar_positions = range(len(importance))
    plt.barh(bar_positions, feature_importance['importance'])
    plt.yticks(bar_positions, feature_importance['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Feature Importance ({best_model_name})')
    plt.tight_layout()
    plt.show()