# Titanic: 1-2+ Score Improvement Approach (titanic_20260124)

This notebook builds upon a strong baseline (Top 3%) and adds refined Feature Engineering and Modeling techniques to squeeze out extra accuracy.

## Key Improvements
1.  **Ticket Frequency**: Adding a count of passengers sharing the same ticket.
2.  **Deck Extraction**: Utilizing `Cabin` information effectively rather than dropping it.
3.  **Refined Binning**: Optimized binning for Age and Fare.
4.  **Ensemble Optimization**: Tuning Voting Classifier weights.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Setting random seed for reproducibility.
# Passed as `random_state` to the estimators in the modeling cell that accept one.
SEED = 42

## 1. Load Data

In [None]:
import os

# Auto-detect environment: Kaggle kernel first, then a repository-relative
# folder, then the author's local checkout, and finally the working directory.
DATA_PATH = ''
if os.path.exists('/kaggle/input/titanic/train.csv'):
    DATA_PATH = '/kaggle/input/titanic/'
elif os.path.exists('data/titanic/train.csv'):
    DATA_PATH = 'data/titanic/'
else:
    # Fallback to user provided path structure.
    # NOTE: a raw string literal cannot end in a single backslash (it would
    # escape the closing quote and leave the string unterminated), so the
    # trailing separator is omitted; os.path.join below inserts it as needed.
    DATA_PATH = r'c:\Users\User\Desktop\github\datascience\scikit-learn\data\titanic'
    # If that doesn't exist, try local directory
    if not os.path.exists(DATA_PATH):
        DATA_PATH = './'

try:
    train_df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
    test_df = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
except FileNotFoundError as err:
    # Fail here with an actionable message; continuing would only surface
    # later as a NameError on train_df / test_df.
    raise FileNotFoundError(
        "Data file not found. Please ensure train.csv and test.csv are available."
    ) from err
else:
    print(f"Loaded data from {DATA_PATH}")

# Concatenate for processing: train rows first, then test rows, with a fresh
# 0..n-1 index so the frame can later be split back by position.
all_data = pd.concat([train_df, test_df], sort=True).reset_index(drop=True)
print(f"Combined shape: {all_data.shape}")

## 2. Feature Engineering

In [None]:
# 1. Family Survival (The most powerful feature)
# Based on S.Xu's approach
#
# Family_Survival starts at 0.5 ("no information") and is overwritten per
# passenger from the known outcomes of the *other* members of their group:
#   1 -> at least one other member has Survived == 1
#   0 -> otherwise, at least one other member has Survived == 0
# Rows whose Survived is NaN contribute nothing, because pandas max()/min()
# skip missing values.
all_data['Last_Name'] = all_data['Name'].apply(lambda full_name: str.split(full_name, ",")[0])
all_data['Family_Survival'] = 0.5


def _mark_from_companions(group, idx, passenger_id):
    """Set Family_Survival for one passenger from the rest of `group`.

    `group` is one groupby slice of all_data, `idx` is the passenger's index
    label inside it, and `passenger_id` selects the row of all_data to update.
    """
    companions = group.drop(idx)['Survived']
    target_row = all_data['PassengerId'] == passenger_id
    if companions.max() == 1.0:
        all_data.loc[target_row, 'Family_Survival'] = 1
    elif companions.min() == 0.0:
        all_data.loc[target_row, 'Family_Survival'] = 0


# Pass 1: families, identified by sharing both Last Name and Fare.
for _, family in all_data.groupby(['Last_Name', 'Fare']):
    if len(family) == 1:
        continue
    for idx, member in family.iterrows():
        _mark_from_companions(family, idx, member['PassengerId'])

# Pass 2: shared Ticket (catches families with different names or groups).
# Only passengers not already marked 1 are revisited.
for _, party in all_data.groupby('Ticket'):
    if len(party) == 1:
        continue
    for idx, member in party.iterrows():
        current = member['Family_Survival']
        if current == 0 or current == 0.5:
            _mark_from_companions(party, idx, member['PassengerId'])

print("Family Survival feature created.")

In [None]:
# 2. Deck (New Improvement)
# Keep the information in Cabin by reducing it to its first character (the
# deck letter); 'M' marks passengers with no Cabin value.
all_data['Deck'] = all_data['Cabin'].map(lambda cabin: 'M' if pd.isnull(cabin) else cabin[0])

# Coarse deck buckets. The rare 'T' deck is folded in with 'A', which in turn
# belongs to the A/B/C bucket. Anything not listed (e.g. 'M') is left as-is.
# This grouping (Z, Y, X, M) is experimental but often works better than raw
# A-G due to sparsity.
_deck_buckets = {
    'Z': ('T', 'A', 'B', 'C'),
    'Y': ('D', 'E'),
    'X': ('F', 'G'),
}
_deck_lookup = {
    letter: bucket
    for bucket, letters in _deck_buckets.items()
    for letter in letters
}
all_data['Deck'] = all_data['Deck'].replace(_deck_lookup)

# Label Encode Deck
deck_encoder = LabelEncoder()
all_data['Deck'] = deck_encoder.fit_transform(all_data['Deck'])
print("Deck feature created.")

In [None]:
# 3. Ticket Frequency (New Improvement)
# For every passenger, the number of rows in all_data carrying the same Ticket.
_ticket_counts = all_data['Ticket'].value_counts()
all_data['Ticket_Frequency'] = all_data['Ticket'].map(_ticket_counts)
print("Ticket Frequency created.")

In [None]:
# 4. Title, Age, Fare, Embarked, Sex, Family Size

# Title: the text between the comma and the first period of Name, stripped.
all_data['Title'] = all_data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

# Collapse uncommon titles into 'Rare'. The parser above keeps every word
# before the first period, so a name written "..., the Countess. of ..."
# yields 'the Countess' rather than 'Countess'; both spellings are listed so
# that title does not end up as its own category.
rare_titles = [
    'Lady', 'Countess', 'the Countess', 'Capt', 'Col', 'Don', 'Dr',
    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
all_data['Title'] = all_data['Title'].replace(rare_titles, 'Rare')
# Fold alternate spellings into Miss / Mrs.
all_data['Title'] = all_data['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Label Encode Title
all_data['Title'] = LabelEncoder().fit_transform(all_data['Title'])

# Age (Impute & Bin): fill missing ages with the median of the passenger's
# (Title, Pclass) group.
all_data['Age'] = all_data.groupby(['Title', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))
# A group with no known age at all has a NaN median and stays unfilled; fall
# back to the overall median so no NaN reaches AgeBin.
all_data['Age'] = all_data['Age'].fillna(all_data['Age'].median())
all_data['AgeBin'] = pd.qcut(all_data['Age'], 5, labels=False)

# Fare (Impute & Bin)
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].median())
all_data['FareBin'] = pd.qcut(all_data['Fare'], 13, labels=False) # 13 bins often cited in top kernels

# Embarked: missing ports are filled with 'S' before encoding.
all_data['Embarked'] = all_data['Embarked'].fillna('S')
all_data['Embarked'] = LabelEncoder().fit_transform(all_data['Embarked'])

# Sex
all_data['Sex'] = LabelEncoder().fit_transform(all_data['Sex'])

# Family Size: siblings/spouses + parents/children + the passenger.
all_data['FamilySize'] = all_data['SibSp'] + all_data['Parch'] + 1
# Group Family Size into 0 = alone, 1 = small (2-4), 2 = large (5+).
family_size = all_data['FamilySize']
all_data['FamilySize_Bin'] = 0
all_data.loc[family_size == 1, 'FamilySize_Bin'] = 0 # Alone
all_data.loc[(family_size > 1) & (family_size <= 4), 'FamilySize_Bin'] = 1 # Small
all_data.loc[family_size > 4, 'FamilySize_Bin'] = 2 # Large

print("Basic features processed.")

In [None]:
# Drop unused columns: raw text/identifier fields, the Last_Name helper, and
# the numeric inputs (SibSp, Parch, FamilySize, Age, Fare) that the binned
# features replace.
drop_cols = [
    'Name',
    'Ticket',
    'Cabin',
    'Last_Name',
    'PassengerId',
    'SibSp',
    'Parch',
    'FamilySize',
    'Age',
    'Fare',
]
all_data.drop(drop_cols, axis=1, inplace=True)

print(f"Final Columns: {all_data.columns.tolist()}")
all_data.head()

## 3. Modeling

In [None]:
# Split back: the first len(train_df) rows of all_data are the training
# passengers, the remainder are the test passengers.
n_train = len(train_df)
train = all_data.iloc[:n_train]
# Non-inplace drop returns a new frame; the original mutated a slice of
# all_data in place (chained assignment), which was only hidden by the global
# warnings filter.
test = all_data.iloc[n_train:].drop(columns=['Survived'])

X = train.drop(columns=['Survived'])
y = train['Survived'].astype(int)
X_test = test

# Scaling: fit on the training features only, then apply the same transform
# to the test features. Both X and X_test become NumPy arrays from here on.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

print("Data ready for training.")

In [None]:
# Base Models with reasonable hyperparameters.
# Settings shared by the two bagged-tree ensembles.
_forest_params = dict(
    n_estimators=500,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=2,
    oob_score=True,
    random_state=SEED,
)
# Settings shared by the two gradient-boosting models.
_boost_params = dict(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    random_state=SEED,
)

clf_rf = RandomForestClassifier(**_forest_params)
# ExtraTrees has bootstrap switched on explicitly alongside oob_score.
clf_et = ExtraTreesClassifier(bootstrap=True, **_forest_params)
clf_gb = GradientBoostingClassifier(**_boost_params)
# probability=True so the SVM can take part in soft voting.
clf_svc = SVC(kernel='rbf', gamma='scale', probability=True, random_state=SEED)
clf_knn = KNeighborsClassifier(n_neighbors=12)
clf_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', **_boost_params)

# Voting Ensemble
# Weighting usually gives a slight boost. We give more weight to gradient
# boosting methods and SVM which is distinct.
_weighted_members = [
    # (name, estimator, weight)
    ('rf', clf_rf, 1),
    ('et', clf_et, 1),
    ('gb', clf_gb, 2),
    ('svc', clf_svc, 2),
    ('knn', clf_knn, 1),
    ('xgb', clf_xgb, 2),
]
voting_clf = VotingClassifier(
    estimators=[(label, model) for label, model, _weight in _weighted_members],
    weights=[weight for _label, _model, weight in _weighted_members],  # Tuned weights
    voting='soft',
)

# 5-fold cross-validated accuracy of the whole ensemble.
scores = cross_val_score(voting_clf, X, y, scoring='accuracy', cv=5)
print(f"Voting CV Score: {scores.mean():.4f} (+/- {scores.std():.4f})")

# Final fit on all training rows for the submission.
voting_clf.fit(X, y)

## 4. Submission

In [None]:
# Predict labels for the scaled test features and pair each one with the
# PassengerId from the original test file (row order is unchanged).
predictions = voting_clf.predict(X_test)

output = test_df[['PassengerId']].copy()
output['Survived'] = predictions
output.to_csv('titanic_20260124_submission.csv', index=False)
print("Submission saved to titanic_20260124_submission.csv")