In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

import pickle
import warnings
warnings.filterwarnings('ignore')


: 

In [None]:
# Load the Titanic train/test CSVs from the first location that contains both files.
# Locations are tried in order: the /content/sample_data folder, then the working directory.
candidate_sources = [
    ('/content/sample_data/Titanic_train.csv',
     '/content/sample_data/Titanic_test.csv',
     "Loaded from local paths"),
    ('Titanic_train.csv',
     'Titanic_test.csv',
     "Loaded from current directory"),
]

for train_path, test_path, loaded_message in candidate_sources:
    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
    except FileNotFoundError:
        # At least one of the two files is missing here; try the next location.
        continue
    print(loaded_message)
    break
else:
    # Raise instead of calling exit(): exit() is a helper for interactive shells, whereas an
    # exception stops "Run All" at this cell with a traceback that names the actual problem.
    raise FileNotFoundError(
        "Titanic_train.csv / Titanic_test.csv not found. "
        "Please place Titanic_train.csv and Titanic_test.csv in current directory "
        "or update the paths in this cell."
    )

In [None]:
# Quick structural overview of the loaded data.
print("Data loaded. Train shape:", train_df.shape, "Test shape:", test_df.shape)
print("\nTrain info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in print()
# would append a stray "None" line after the report.
train_df.info()
print("\nTrain describe:")
print(train_df.describe())

In [None]:
# EDA: annotated heatmap of the pairwise correlations between numeric training columns
fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    train_df.corr(numeric_only=True),
    annot=True,
    cmap='coolwarm',
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Class balance of the target: number of passengers per Survived value
survival_ax = sns.countplot(data=train_df, x='Survived')
survival_ax.set_title('Survival Distribution')
plt.show()

In [None]:
# Passenger counts per Sex, split by the Survived value
sex_ax = sns.countplot(data=train_df, x='Sex', hue='Survived')
sex_ax.set_title('Survival by Sex')
plt.show()

In [None]:
# Data Preprocessing
def preprocess_data(df, is_train=True, le_sex=None, le_emb=None):
    """Impute, prune and encode a Titanic passenger frame for modelling.

    The function works on a copy, so the caller's frame is left untouched.
    Steps, in order:

    1. Fill missing ``Age`` with the median age of the row's (``Pclass``, ``Sex``)
       group, then with the overall median for anything still missing.
    2. Fill missing ``Embarked`` with the most frequent value.
    3. Fill missing ``Fare`` with the median (only when the column exists).
    4. Drop ``Cabin``, ``Name``, ``Ticket`` and ``PassengerId`` when present.
    5. Add ``FamilySize`` (``SibSp + Parch + 1``) and ``IsAlone`` (1 when
       ``FamilySize == 1``, else 0).
    6. Label-encode ``Sex`` and ``Embarked``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``Pclass``, ``Sex``, ``Age``, ``Embarked``,
        ``SibSp`` and ``Parch``.
    is_train : bool, default True
        Accepted for backward compatibility; it does not change the result.
    le_sex, le_emb : LabelEncoder, optional
        Already-fitted encoders for ``Sex`` and ``Embarked``. When given, they are
        applied with ``transform`` so the integer codes match the frame they were
        fitted on. When omitted, a new encoder is fitted on ``df`` itself.

    Returns
    -------
    tuple
        ``(processed_frame, sex_encoder, embarked_encoder)``; the encoders are the
        ones that were actually used (the supplied ones, or the freshly fitted ones).
    """
    df = df.copy()

    # Assign the filled column back instead of using a chained
    # ``df[col].fillna(..., inplace=True)``: the chained form acts on a temporary
    # object and does not update ``df`` under pandas Copy-on-Write.
    group_median_age = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median_age)
    # Groups whose ages are all missing have no median; fall back to the overall one.
    df['Age'] = df['Age'].fillna(df['Age'].median())

    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    if 'Fare' in df.columns:
        df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    df = df.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'], errors='ignore')

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    if le_sex is None:
        le_sex = LabelEncoder()
        df['Sex'] = le_sex.fit_transform(df['Sex'])
    else:
        df['Sex'] = le_sex.transform(df['Sex'])

    if le_emb is None:
        le_emb = LabelEncoder()
        df['Embarked'] = le_emb.fit_transform(df['Embarked'])
    else:
        df['Embarked'] = le_emb.transform(df['Embarked'])

    return df, le_sex, le_emb

In [None]:
# Preprocess both splits. The encoders returned for the training frame (le_sex, le_emb)
# define the integer codes the model is trained on.
train_processed, le_sex, le_emb = preprocess_data(train_df, True)
test_processed, le_sex_test, le_emb_test = preprocess_data(test_df, False)

# The test frame was encoded with its own encoders, whose codes only match the training
# codes when both frames contain exactly the same categories. Map the test codes back to
# their labels and re-encode them with the training encoders, so that a category always
# gets the training code (a label never seen in training raises a ValueError here instead
# of silently receiving a wrong code).
test_processed['Sex'] = le_sex.transform(
    le_sex_test.inverse_transform(test_processed['Sex'])
)
test_processed['Embarked'] = le_emb.transform(
    le_emb_test.inverse_transform(test_processed['Embarked'])
)

In [None]:
# Separate the target column (Survived) from the feature matrix
y = train_processed['Survived']
X = train_processed.drop(columns=['Survived'])

In [None]:
# Standardize the features: learn the scaling from X, then apply that same scaling
# to both the training features and the processed test frame.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
test_scaled = scaler.transform(test_processed)

In [None]:
# Model Building
# Split the *unscaled* features first, then refit the scaler on the training rows only, so
# the validation rows do not leak into the mean/variance used for standardization. This
# supersedes the scaler that was fitted on the full training frame in the previous cell.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Scale the test features with the same training-only statistics; selecting X.columns keeps
# the column set and order identical to what the scaler was fitted on.
test_scaled = scaler.transform(test_processed[X.columns])

In [None]:
# Fit a logistic-regression classifier on the training split
model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Validation-set outputs: probability of the positive class (second predict_proba column)
# and the hard class labels
val_class_probabilities = model.predict_proba(X_val)
y_pred_proba = val_class_probabilities[:, 1]
y_pred = model.predict(X_val)

In [None]:
# Evaluation: label-based metrics use the hard predictions, ROC-AUC uses the probabilities
print("\nModel Evaluation:")
label_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
)
for metric_name, metric_fn in label_metrics:
    print(f"{metric_name}: {metric_fn(y_val, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_val, y_pred_proba):.4f}")

In [None]:
# ROC Curve: validation ROC with the chance diagonal drawn as a dashed black line
fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
val_auc = roc_auc_score(y_val, y_pred_proba)

fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {val_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()

In [None]:
# Feature Importance: one row per feature, ordered by coefficient magnitude (largest first)
coefficients = model.coef_[0]
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients),
})
coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

In [None]:
# Print the coefficient table under its heading
print("\nFeature Coefficients:", coef_df, sep="\n")

In [None]:
# Horizontal bar chart of the first ten rows of the coefficient table
top_coefficients = coef_df.head(10)
fig, coef_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_coefficients, x='Coefficient', y='Feature', ax=coef_ax)
coef_ax.set_title('Top 10 Feature Coefficients')
fig.tight_layout()
plt.show()

In [None]:
# Score the scaled test set: positive-class probabilities and hard class labels
test_class_probabilities = model.predict_proba(test_scaled)
test_proba = test_class_probabilities[:, 1]
test_predictions = model.predict(test_scaled)

In [None]:
# Sanity check: how many predictions were produced and what the first few look like
prediction_preview = test_predictions[:10]
print("\nTest predictions shape:", test_predictions.shape)
print("Sample test predictions:", prediction_preview)

In [None]:
# Create submission file.
# Take PassengerId from the loaded test data so every prediction is paired with the id of
# the row it was computed from; the hard-coded 892.. range is kept only as a fallback for a
# test file that has no PassengerId column.
if 'PassengerId' in test_df.columns:
    passenger_ids = test_df['PassengerId'].to_numpy()
else:
    passenger_ids = range(892, 892 + len(test_predictions))

submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': test_predictions
})
submission.to_csv('titanic_submission.csv', index=False)
print("\nPredictions saved to 'titanic_submission.csv'")