# imports 

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# get data

In [2]:
# Load the train and test splits; both paths are relative to the notebook's working directory.
train_data = pd.read_csv("data/train.csv")
# NOTE(review): test_data is loaded here but is not used by any cell visible in this section.
test_data = pd.read_csv("data/test.csv")

# Feature engineering

In [3]:
class TitanicFeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive basic passenger features from the raw Titanic columns.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    depends only on the rows it is given.

    Columns read: ``SibSp``, ``Parch``, ``Name``, ``Cabin``, ``Age``, ``Fare``
    (``Ticket`` must also be present because it is dropped).
    Columns added: ``FamilySize``, ``IsAlone``, ``Title``, ``CabinDeck``,
    ``AgeGroup``, ``FarePerPerson``.
    Columns removed: ``Ticket``, ``Name``, ``Cabin``.
    """

    # Raw title -> coarse group. Titles that are not listed fall back to 'Other'.
    # 'Dona' is the feminine form of 'Don' and is grouped the same way.
    TITLE_MAPPING = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Rev': 'Other', 'Dr': 'Other', 'Col': 'Other', 'Major': 'Other',
        'Mlle': 'Miss', 'Countess': 'Royalty', 'Ms': 'Miss', 'Lady': 'Royalty',
        'Jonkheer': 'Royalty', 'Don': 'Royalty', 'Dona': 'Royalty',
        'Sir': 'Royalty', 'Mme': 'Mrs', 'Capt': 'Other',
    }

    # Right-closed age intervals and the label assigned to each of them.
    AGE_BINS = [0, 12, 18, 35, 60, np.inf]
    AGE_LABELS = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the scikit-learn API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw passenger frame containing the columns listed on the class.

        Returns
        -------
        pandas.DataFrame
            New frame; the input is never modified.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()

        # 1. Family size: siblings/spouses + parents/children + the passenger.
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1

        # 2. IsAlone: 1 when the passenger travels without family, else 0.
        X['IsAlone'] = np.where(X['FamilySize'] == 1, 1, 0)

        # 3. Coarse title extracted from the name.
        X['Title'] = X['Name'].apply(self.extract_title)

        # 4. Deck letter (first character of the cabin), 'Unknown' when missing.
        X['CabinDeck'] = X['Cabin'].apply(self.extract_deck)

        # 5. Age group. include_lowest=True keeps an age of exactly 0 inside the
        #    first bin instead of turning it into NaN; missing ages stay NaN.
        X['AgeGroup'] = pd.cut(
            X['Age'],
            bins=self.AGE_BINS,
            labels=self.AGE_LABELS,
            include_lowest=True,
        )

        # 6. Fare split evenly across the family (FamilySize is at least 1
        #    for non-negative SibSp/Parch, so no division by zero).
        X['FarePerPerson'] = X['Fare'] / X['FamilySize']

        # Free-text columns are no longer needed once the features are derived.
        return X.drop(columns=['Ticket', 'Name', 'Cabin'])

    def extract_title(self, name):
        """Map a raw ``'Surname, Title. Given names'`` string to a title group.

        Returns ``'Other'`` for missing or non-string names, for names without
        a comma, and for titles that are not in ``TITLE_MAPPING``.
        """
        if not isinstance(name, str):
            return 'Other'
        parts = name.split(',')
        if len(parts) < 2:
            return 'Other'
        words = parts[1].split('.')[0].split()
        if not words:
            return 'Other'
        # Use the last word so multi-word titles such as 'the Countess'
        # resolve to their mapping key ('Countess').
        return self.TITLE_MAPPING.get(words[-1], 'Other')

    def extract_deck(self, cabin):
        """Return the first character of ``cabin``, or ``'Unknown'`` if it is missing or blank."""
        if pd.isnull(cabin):
            return 'Unknown'
        text = str(cabin).strip()
        return text[0] if text else 'Unknown'



In [4]:
class ExtendedTitanicFeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive the basic and extended passenger features from the raw Titanic columns.

    Columns read: ``SibSp``, ``Parch``, ``Name``, ``Cabin``, ``Age``, ``Fare``,
    ``Ticket``, ``Sex``, ``Pclass``.
    Columns added: ``FamilySize``, ``IsAlone``, ``Title``, ``CabinDeck``,
    ``AgeGroup``, ``FarePerPerson``, ``FareBin``, ``TicketFrequency``,
    ``Surname``, ``SurnameGroup``, ``IsChild``, ``IsMother``, ``Sex_Pclass``,
    ``HasCabin``.
    Columns removed: ``Ticket``, ``Name``, ``Cabin``.

    ``fit`` learns the fare quartile edges (``fare_bin_edges_``) so that
    ``FareBin`` labels mean the same fare ranges on every frame transformed
    afterwards.
    """

    # Raw title -> coarse group. Titles that are not listed fall back to 'Other'.
    # 'Dona' is the feminine form of 'Don' and is grouped the same way.
    TITLE_MAPPING = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Rev': 'Other', 'Dr': 'Other', 'Col': 'Other', 'Major': 'Other',
        'Mlle': 'Miss', 'Countess': 'Royalty', 'Ms': 'Miss', 'Lady': 'Royalty',
        'Jonkheer': 'Royalty', 'Don': 'Royalty', 'Dona': 'Royalty',
        'Sir': 'Royalty', 'Mme': 'Mrs', 'Capt': 'Other',
    }

    # Right-closed age intervals and the label assigned to each of them.
    AGE_BINS = [0, 12, 18, 35, 60, np.inf]
    AGE_LABELS = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']

    # One label per fare quartile, lowest to highest.
    FARE_LABELS = ['Low', 'Medium', 'High', 'Premium']

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the fare quartile edges from ``X`` and return ``self``.

        The outer edges are widened to +/- infinity so fares outside the
        fitted range still land in the lowest or highest bin.
        """
        _, edges = pd.qcut(X['Fare'], len(self.FARE_LABELS), retbins=True)
        edges = np.asarray(edges, dtype=float)
        edges[0], edges[-1] = -np.inf, np.inf
        self.fare_bin_edges_ = edges
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw passenger frame containing the columns listed on the class.

        Returns
        -------
        pandas.DataFrame
            New frame; the input is never modified.
        """
        X = X.copy()

        # 1. Family size: siblings/spouses + parents/children + the passenger.
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1

        # 2. IsAlone: 1 when the passenger travels without family, else 0.
        X['IsAlone'] = np.where(X['FamilySize'] == 1, 1, 0)

        # 3. Coarse title extracted from the name.
        X['Title'] = X['Name'].apply(self.extract_title)

        # 4. Deck letter (first character of the cabin), 'Unknown' when missing.
        X['CabinDeck'] = X['Cabin'].apply(self.extract_deck)

        # 5. Age group. include_lowest=True keeps an age of exactly 0 inside the
        #    first bin instead of turning it into NaN; missing ages stay NaN.
        X['AgeGroup'] = pd.cut(
            X['Age'],
            bins=self.AGE_BINS,
            labels=self.AGE_LABELS,
            include_lowest=True,
        )

        # 6. Fare split evenly across the family.
        X['FarePerPerson'] = X['Fare'] / X['FamilySize']

        # 7. Fare bin. Use the edges learned in fit so a label covers the same
        #    fare range on every frame; fall back to per-batch quartiles only
        #    when transform is called on an unfitted instance.
        edges = getattr(self, 'fare_bin_edges_', None)
        if edges is None:
            X['FareBin'] = pd.qcut(X['Fare'], len(self.FARE_LABELS), labels=self.FARE_LABELS)
        else:
            X['FareBin'] = pd.cut(X['Fare'], bins=edges, labels=self.FARE_LABELS)

        # 8. Ticket frequency: how many rows of *this frame* share the ticket.
        #    NOTE: batch-dependent by construction (a one-row frame always gives 1).
        X['TicketFrequency'] = X.groupby('Ticket')['Ticket'].transform('count')

        # 9. Surname (text before the first comma) and its frequency in this
        #    frame; the vectorised accessor leaves missing names as NaN.
        X['Surname'] = X['Name'].str.split(',').str[0]
        X['SurnameGroup'] = X.groupby('Surname')['Surname'].transform('count')

        # 10. Child / mother indicators. A missing age compares False, so
        #     passengers with unknown age get IsChild == 0.
        X['IsChild'] = np.where(X['Age'] < 16, 1, 0)
        X['IsMother'] = np.where(
            (X['Sex'] == 'female') & (X['Parch'] > 0) & (X['SibSp'] == 0), 1, 0
        )

        # 11. Interaction feature, e.g. 'female_1'.
        X['Sex_Pclass'] = X['Sex'] + '_' + X['Pclass'].astype(str)

        # 12. Whether a cabin was recorded at all.
        X['HasCabin'] = np.where(X['Cabin'].isnull(), 0, 1)

        # Free-text columns are no longer needed once the features are derived.
        return X.drop(columns=['Ticket', 'Name', 'Cabin'])

    def extract_title(self, name):
        """Map a raw ``'Surname, Title. Given names'`` string to a title group.

        Returns ``'Other'`` for missing or non-string names, for names without
        a comma, and for titles that are not in ``TITLE_MAPPING``.
        """
        if not isinstance(name, str):
            return 'Other'
        parts = name.split(',')
        if len(parts) < 2:
            return 'Other'
        words = parts[1].split('.')[0].split()
        if not words:
            return 'Other'
        # Use the last word so multi-word titles such as 'the Countess'
        # resolve to their mapping key ('Countess').
        return self.TITLE_MAPPING.get(words[-1], 'Other')

    def extract_deck(self, cabin):
        """Return the first character of ``cabin``, or ``'Unknown'`` if it is missing or blank."""
        if pd.isnull(cabin):
            return 'Unknown'
        text = str(cabin).strip()
        return text[0] if text else 'Unknown'



In [5]:
# Instantiate the transformer
feature_engineer = ExtendedTitanicFeatureEngineer()
# Apply the feature engineering transformation to the training data
transformed_data = feature_engineer.fit_transform(train_data)
# Display the transformed frame (the whole frame is the cell's output, not just its first rows)
transformed_data


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,...,AgeGroup,FarePerPerson,FareBin,TicketFrequency,Surname,SurnameGroup,IsChild,IsMother,Sex_Pclass,HasCabin
0,1,0,3,male,22.0,1,0,7.2500,S,2,...,Young Adult,3.62500,Low,1,Braund,2,0,0,male_3,0
1,2,1,1,female,38.0,1,0,71.2833,C,2,...,Adult,35.64165,Premium,1,Cumings,1,0,0,female_1,1
2,3,1,3,female,26.0,0,0,7.9250,S,1,...,Young Adult,7.92500,Medium,1,Heikkinen,1,0,0,female_3,0
3,4,1,1,female,35.0,1,0,53.1000,S,2,...,Young Adult,26.55000,Premium,2,Futrelle,2,0,0,female_1,1
4,5,0,3,male,35.0,0,0,8.0500,S,1,...,Young Adult,8.05000,Medium,1,Allen,2,0,0,male_3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S,1,...,Young Adult,13.00000,Medium,1,Montvila,1,0,0,male_2,0
887,888,1,1,female,19.0,0,0,30.0000,S,1,...,Young Adult,30.00000,High,1,Graham,3,0,0,female_1,1
888,889,0,3,female,,1,2,23.4500,S,4,...,,5.86250,High,2,Johnston,2,0,0,female_3,0
889,890,1,1,male,26.0,0,0,30.0000,C,1,...,Young Adult,30.00000,High,1,Behr,1,0,0,male_1,1


In [6]:
# Define the pipeline: custom feature engineering followed by column-wise preprocessing.
pipeline = Pipeline(steps=[
    ('feature_engineering', TitanicFeatureEngineer()),  # Custom feature engineering
    ('preprocessing', ColumnTransformer([
        # Numeric columns: fill missing values (Age has gaps) with the median
        # before scaling; StandardScaler alone would pass NaN straight through.
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]), ['Age', 'Fare', 'FamilySize', 'FarePerPerson']),
        # Categorical columns: categories not seen during fit are encoded as
        # all zeros instead of raising when new data is transformed.
        ('cat', OneHotEncoder(handle_unknown='ignore'),
         ['Sex', 'Embarked', 'Title', 'CabinDeck', 'AgeGroup', 'IsAlone']),
    ]))
])

# Fit the pipeline on the training data and keep the transformed feature matrix
X_transformed = pipeline.fit_transform(train_data)


In [7]:
# Define the pipeline with the extended feature engineering
pipeline_extended = Pipeline(steps=[
    ('feature_engineering', ExtendedTitanicFeatureEngineer()),  # Custom feature engineering
    ('preprocessing', ColumnTransformer([
        # Numeric columns: fill missing values (Age has gaps) with the median
        # before scaling; StandardScaler alone would pass NaN straight through.
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]), ['Age', 'Fare', 'FamilySize', 'FarePerPerson', 'TicketFrequency', 'SurnameGroup']),
        # Categorical columns: categories not seen during fit are encoded as
        # all zeros instead of raising when new data is transformed.
        ('cat', OneHotEncoder(handle_unknown='ignore'),
         ['Sex', 'Embarked', 'Title', 'CabinDeck', 'AgeGroup', 'FareBin',
          'IsAlone', 'IsChild', 'IsMother', 'Sex_Pclass', 'HasCabin']),
    ]))
])

# Fit the pipeline on the training data and keep the transformed feature matrix
X_extended_transformed = pipeline_extended.fit_transform(train_data)