In [3]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from pathlib import Path

# --- 1. DATA LOADING ---
def load_titanic_data(data_dir=Path("datasets/titanic")):
    """Load the Titanic training and test sets as DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default ``datasets/titanic``
        Directory that contains ``train.csv`` and ``test.csv``. The default
        keeps the location this script has always read from.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training frame followed by the test frame.

    Raises
    ------
    FileNotFoundError
        Raised by ``pandas.read_csv`` when either file is missing.
    """
    data_dir = Path(data_dir)  # accept plain strings as well as Path objects
    train_path = data_dir / "train.csv"
    test_path = data_dir / "test.csv"
    return pd.read_csv(train_path), pd.read_csv(test_path)

train_data, test_data = load_titanic_data()
# Separate the label from the features so the target can never leak into X.
X_train = train_data.drop(columns=["Survived"])
y_train = train_data["Survived"].copy()

# --- 2. CUSTOM TRANSFORMERS ---

# A. Title Selector: Extracts Mr, Mrs, Master, etc.
class TitleSelector(BaseEstimator, TransformerMixin):
    """Derive a coarse honorific feature ('Title') from the 'Name' column.

    The title is the run of letters that follows a space and precedes a
    period in the name (e.g. "Braund, Mr. Owen" -> "Mr"). Uncommon titles
    are merged into 'Rare', and 'Mlle'/'Ms'/'Mme' are folded into
    'Miss'/'Mrs'. Names without such a pattern yield NaN.
    """

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame named 'Title', indexed like ``X``.

        ``X`` must be a DataFrame with a string 'Name' column; it is not
        modified.
        """
        # Raw string: "\." in a normal literal is an invalid escape sequence
        # (SyntaxWarning on recent Python versions).
        titles = X['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        # Group rare titles
        titles = titles.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                                 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        titles = titles.replace(['Mlle', 'Ms'], 'Miss')
        titles = titles.replace('Mme', 'Mrs')
        return titles.to_frame(name='Title')

# B. Family Size: Combines SibSp + Parch + 1 (Self)
class FamilySize(BaseEstimator, TransformerMixin):
    """Compute the size of a passenger's family group aboard.

    The value is ``SibSp + Parch + 1``; the ``+ 1`` counts the passenger.
    """

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame named 'FamilySize', indexed like ``X``.

        ``X`` must be a DataFrame with 'SibSp' and 'Parch' columns. The
        addition builds a new Series, so ``X`` is left untouched without
        needing a defensive copy of the whole frame.
        """
        family_size = X["SibSp"] + X["Parch"] + 1
        # Give the column an explicit name instead of the default label 0.
        return family_size.to_frame(name="FamilySize")

# C. Smart Age Imputer: Fills missing Age based on Title (not global median)
class SmartAgeImputer(BaseEstimator, TransformerMixin):
    """Impute missing 'Age' values with the median age of the row's title group.

    Expects a DataFrame with 'Name' and 'Age' columns. Titles are extracted
    from 'Name' and grouped the same way in ``fit`` and ``transform``.

    Attributes
    ----------
    age_map_ : pandas.Series
        Median age per title group seen during ``fit`` (index named 'Title').
    global_median_ : float
        Median of all ages seen during ``fit``. Used as a fallback for rows
        whose title is missing, was not seen during ``fit``, or whose title
        group had no known ages, so those rows do not stay NaN.
    """

    @staticmethod
    def _extract_titles(X):
        """Return the grouped honorific for every row of ``X['Name']``."""
        # Raw string: "\." in a normal literal is an invalid escape sequence.
        titles = X['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        titles = titles.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                                 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        titles = titles.replace(['Mlle', 'Ms'], 'Miss')
        titles = titles.replace('Mme', 'Mrs')
        return titles

    def fit(self, X, y=None):
        """Learn the per-title median ages and the overall median age."""
        temp_df = X[['Age']].assign(Title=self._extract_titles(X))
        self.age_map_ = temp_df.groupby("Title")["Age"].median()
        # Learned here (not in transform) so no statistic leaks from the
        # data being transformed.
        self.global_median_ = X['Age'].median()
        return self

    def transform(self, X):
        """Return a one-column 'Age' DataFrame with missing values filled.

        Known ages are passed through unchanged; ``X`` is not modified.
        """
        ages = X['Age']
        # Titles absent from age_map_ (or NaN titles) map to NaN here.
        by_title = self._extract_titles(X).map(self.age_map_)
        # Positional (array) replacement avoids index alignment entirely.
        filled = ages.where(ages.notna(), by_title.to_numpy(dtype=float))
        # Anything still missing falls back to the training-set median.
        filled = filled.fillna(self.global_median_)
        return filled.to_frame(name='Age')

# --- 3. PIPELINES ---

# A. Numerical features.
# Age is imputed by title group (SmartAgeImputer); Fare uses a plain median.
age_pipeline = Pipeline(steps=[
    ("smart_imputer", SmartAgeImputer()),
    ("scaler", StandardScaler()),
])

fare_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# B. Categorical features (Sex, Embarked): fill gaps with the mode, then one-hot.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# C. Title feature: extract the honorific from Name, then one-hot.
title_pipeline = Pipeline(steps=[
    ("extractor", TitleSelector()),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# D. Family-size feature: SibSp + Parch + 1, then standardize.
family_pipeline = Pipeline(steps=[
    ("calculator", FamilySize()),
    ("scaler", StandardScaler()),
])

# Master preprocessor: route each column group to its own sub-pipeline.
preprocess_pipeline = ColumnTransformer(transformers=[
    # Name is passed alongside Age so the imputer can derive the title.
    ("age_proc", age_pipeline, ["Name", "Age"]),
    ("fare_proc", fare_pipeline, ["Fare"]),
    ("cat_proc", cat_pipeline, ["Sex", "Embarked"]),
    ("title_proc", title_pipeline, ["Name"]),
    ("family_proc", family_pipeline, ["SibSp", "Parch"]),
    ("ord_proc", OrdinalEncoder(), ["Pclass"]),
])

# --- 4. MODEL & SEARCH ---

# Preprocessing and classifier chained into one estimator, so every CV fold
# re-fits the preprocessing on its own training split.
full_pipeline = Pipeline(steps=[
    ("preprocessor", preprocess_pipeline),
    ("model", RandomForestClassifier(random_state=42)),
])

# Search space, deliberately restricted to limit overfitting:
# shallow trees (depth capped at 12) and larger split/leaf minimums.
param_dist = dict(
    model__n_estimators=randint(200, 600),
    model__max_depth=[5, 8, 10, 12],
    model__min_samples_split=[5, 10, 15],
    model__min_samples_leaf=[2, 4, 8],
    model__max_features=["sqrt", "log2"],
)

# 50 random draws from the space, each scored by 5-fold CV accuracy.
random_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

print("Starting training...")
random_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated accuracy.
print(f"Best CV Score: {random_search.best_score_:.4f}")
print("Best Parameters:", random_search.best_params_)

# --- 5. SUBMISSION ---
# best_estimator_ is the full pipeline, so the raw test frame can be passed
# straight in.
final_model = random_search.best_estimator_
test_predictions = final_model.predict(test_data)

# Two-column frame: PassengerId plus the predicted label.
submission = test_data[["PassengerId"]].assign(Survived=test_predictions)

submission.to_csv("submission_optimized.csv", index=False)
print("Saved 'submission_optimized.csv'. Ready for Kaggle!")

Starting training...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Score: 0.8339
Best Parameters: {'model__max_depth': 5, 'model__max_features': 'log2', 'model__min_samples_leaf': 4, 'model__min_samples_split': 15, 'model__n_estimators': 585}
Saved 'submission_optimized.csv'. Ready for Kaggle!
