# Titanic: High-Accuracy Survival Prediction

This notebook combines advanced feature engineering, standardized feature scaling (`StandardScaler`), and a soft-voting ensemble of five classifiers to maximize prediction accuracy on the Titanic dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from dotenv import load_dotenv

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Notebook-wide display settings: white-grid theme for every figure and no
# truncation of wide DataFrames.
sns.set(style='whitegrid')
pd.options.display.max_columns = None

# Pull variables from a local .env file into the process environment
# (intended for the Kaggle API).
load_dotenv()

## 1. Data Loading

In [None]:
# Directory that holds train.csv / test.csv and receives every output file.
# Set TITANIC_DATA_DIR (in the shell or in the .env file loaded above) to run
# the notebook on another machine; the original location is the default.
data_path = os.environ.get(
    'TITANIC_DATA_DIR',
    r'D:\AI_Study\GitHub\DataScience\pandas\code\scikit-learn\data\titanic',
)
train_df = pd.read_csv(os.path.join(data_path, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_path, 'test.csv'))

# Both frames in one list so every feature step can be applied to each in turn.
full_data = [train_df, test_df]

## 2. Advanced Feature Engineering (V2)

In [None]:
# Fallback fill values come from the training set only and are computed once,
# ahead of the loop, so train and test are patched with the same numbers.
fare_fallback = train_df['Fare'].median()
age_fallback = train_df['Age'].median()

# Honorifics that are collapsed into a single 'Rare' bucket.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']

for dataset in full_data:
    # 1. Title Extraction: the letters between a space and a period in Name.
    #    Raw string, so `\.` is a regex escape rather than an invalid Python
    #    string escape.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    # 2. Family Features: the passenger plus siblings/spouses and parents/children.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype('int64')

    # 3. Cabin / Deck Extraction: first character of Cabin, 'N' when Cabin is missing.
    dataset['Deck'] = dataset['Cabin'].str.slice(0, 1).fillna('N')

    # 4. Fill Missing Values (More Granular)
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    dataset['Fare'] = dataset['Fare'].fillna(fare_fallback)

    # Grouped Age Imputation: only missing ages are replaced, first with the
    # median of the passenger's (Sex, Pclass, Title) group, then with the
    # training median when that group has no known age at all. Filling the
    # original column (instead of overwriting it with the transform result)
    # keeps known ages intact even for rows whose group key is null.
    group_median_age = dataset.groupby(['Sex', 'Pclass', 'Title'])['Age'].transform('median')
    dataset['Age'] = dataset['Age'].fillna(group_median_age).fillna(age_fallback)

    # 5. New Interaction: Fare per Person (FamilySize is always >= 1).
    dataset['Fare_per_Person'] = dataset['Fare'] / dataset['FamilySize']

    # 6. Interaction: Age * Class
    dataset['Age_Class'] = dataset['Age'] * dataset['Pclass']

# Numeric Mapping
# Category -> integer code tables, defined once because they are identical for
# both frames.
sex_mapping = {'female': 1, 'male': 0}
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
deck_mapping = {'N': 0, 'C': 1, 'B': 2, 'D': 3, 'E': 4, 'A': 5, 'F': 6, 'G': 7, 'T': 8}


def _encode_column(column, mapping, default=None):
    """Return `column` translated to integer codes through `mapping`.

    A column that is already numeric is returned untouched, so re-running the
    cell does not map the codes a second time (which would produce NaN).
    Values missing from `mapping` become `default` when one is given;
    without a default they stay NaN and the integer cast raises.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    codes = column.map(mapping)
    if default is not None:
        codes = codes.fillna(default)
    return codes.astype(int)


for dataset in full_data:
    dataset['Sex'] = _encode_column(dataset['Sex'], sex_mapping)
    # Titles outside the table (or missing) share code 0.
    dataset['Title'] = _encode_column(dataset['Title'], title_mapping, default=0)
    dataset['Embarked'] = _encode_column(dataset['Embarked'], embarked_mapping)
    # Deck letters outside the table fall back to 0, the code used for 'N'.
    dataset['Deck'] = _encode_column(dataset['Deck'], deck_mapping, default=0)

# Columns left out of the model matrices: identifiers, free text, and the
# SibSp / Parch counts that FamilySize already sums.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
train_processed, test_processed = (
    frame.drop(columns=drop_elements) for frame in (train_df, test_df)
)

print("Processed Columns:", list(train_processed.columns))
train_processed.head()

## 3. Scaling & Split

Scaling is important for models like SVC and Logistic Regression.

In [None]:
# Separate the target from the predictors.
X = train_processed.drop('Survived', axis=1)
y = train_processed['Survived']

# Fail early, with a readable message, if imputation left gaps; otherwise the
# error only surfaces later inside the estimators.
assert not X.isna().any().any(), "training features still contain NaN"
assert not test_processed[X.columns].isna().any().any(), "test features still contain NaN"

# Standardize every feature. The scaler is fit on the training rows and the
# same transformation is applied to the test rows, which are first put in the
# training column order.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_processed[X.columns])

# Hold-out split, stratified so both parts keep the overall survival rate.
# NOTE: the scaler above was fit on all training rows, so X_val is not a fully
# independent hold-out set.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

## 4. Modeling & Ensembling

In [None]:
# Base learners keyed by the short name the voting ensemble uses for them.
base_models = {
    'rf': RandomForestClassifier(
        n_estimators=500, max_depth=6, min_samples_split=10, random_state=42
    ),
    'gb': GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42
    ),
    'et': ExtraTreesClassifier(n_estimators=500, max_depth=6, random_state=42),
    'svc': SVC(probability=True, kernel='rbf', C=1.0, gamma='auto', random_state=42),
    'lr': LogisticRegression(solver='liblinear', random_state=42),
}
classifiers = list(base_models.items())

# Soft voting: the ensemble is configured to vote on predicted probabilities,
# which is why the SVC is created with probability=True.
vc = VotingClassifier(estimators=classifiers, voting='soft')

# 10-fold cross-validated accuracy of the whole ensemble on the scaled training data.
cv_scores = cross_val_score(estimator=vc, X=X_scaled, y=y, cv=10)
mean_cv_accuracy = cv_scores.mean()
print(f"Average 10-Fold CV Accuracy: {mean_cv_accuracy:.4f}")

# Final fit on every training row; this fitted model produces the submission.
vc.fit(X_scaled, y)

## 5. Final Submission & Data Output

In [None]:
# Predict the test rows and pair each prediction with its PassengerId.
predictions = vc.predict(test_scaled)
submission = test_df[["PassengerId"]].assign(Survived=predictions)
submission.to_csv(os.path.join(data_path, 'submission.csv'), index=False)

# Scaled feature matrices as labelled frames; only the training one carries
# the target column.
train_out = pd.DataFrame(X_scaled, columns=X.columns).assign(Survived=y.values)
test_out = pd.DataFrame(test_scaled, columns=X.columns)

# Write both refined frames into the data directory.
for refined_frame, refined_name in (
    (train_out, 'titanic_refined_train.csv'),
    (test_out, 'titanic_refined_test.csv'),
):
    refined_frame.to_csv(os.path.join(data_path, refined_name), index=False)
print("Refined CSVs and submission saved.")

## 6. Submit to Kaggle

In [None]:
!kaggle competitions submit -c titanic -f {os.path.join(data_path, 'submission.csv')} -m "Final High-Accuracy Submission"
!kaggle competitions submissions -c titanic