In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Input: the cleaned training set written by the previous pipeline step,
# resolved to an absolute path.
train_path = Path("../data/processed/train_cleaned.csv").resolve()

# Output folder for processed data; created (with parents) if it is missing.
out_dir = Path("../data/processed")
out_dir.mkdir(exist_ok=True, parents=True)

# Destination of the feature-engineered training set.
train_out_path = out_dir.joinpath("train_features.csv")


In [3]:
# Load the cleaned training set that the feature helpers below operate on.
train_df = pd.read_csv(filepath_or_buffer=train_path)

CabinDeck is included as a candidate feature because the deck letter may encode where a passenger was berthed on the ship and, indirectly, their ticket class, both of which could relate to survival.

In [4]:
def create_cabindeck_feat(df):
    """Return a copy of ``df`` with a ``CabinDeck`` column added.

    The deck is the first ASCII letter found in ``Cabin``, upper-cased.
    Rows where no letter can be extracted (including missing cabins) are
    labelled ``"Unknown"``. The input frame is not modified.
    """
    result = df.copy()

    # First alphabetic character of the cabin string; NaN when there is none.
    first_letter = result["Cabin"].str.extract(r"([A-Za-z])", expand=False)
    deck = first_letter.str.upper()

    result["CabinDeck"] = deck.fillna("Unknown")
    return result

FamilySize is included as a candidate feature because traveling alone versus with family may capture behavior or access patterns that could relate to survival.

In [5]:
def create_familysize_feat(df):
    """Return a copy of ``df`` with a ``FamilySize`` column added.

    ``FamilySize`` is ``SibSp + Parch + 1``; the ``+ 1`` counts the
    passenger themselves. The input frame is not modified.
    """
    result = df.copy()
    relatives_aboard = result["SibSp"] + result["Parch"]
    result["FamilySize"] = relatives_aboard + 1
    return result

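AgeBin is included as a candidate feature because survival may vary by life stage rather than linearly with age. It classifies each passenger into one of five age groups: Child (up to 12), Teen (over 12 to 19), Adult (over 19 to 39), Middle Aged (over 39 to 59), and Senior (over 59).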

In [6]:
def age_binning(df):
    """Return a copy of ``df`` with a categorical ``AgeBin`` column added.

    Ages are bucketed with right-inclusive edges:
    ``[0, 12]`` Child, ``(12, 19]`` Teen, ``(19, 39]`` Adult,
    ``(39, 59]`` Middle Aged, ``(59, inf)`` Senior.
    Missing ages (and ages below 0) yield a missing ``AgeBin``.

    The input frame is left untouched, matching the other feature helpers
    in this notebook, which also work on a copy.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()

    # Define age bins and labels
    bins = [0, 12, 19, 39, 59, float("inf")]
    labels = ['Child', 'Teen', 'Adult', 'Middle Aged', 'Senior']

    # include_lowest=True makes the first interval closed on the left,
    # so an age of exactly 0 is classified as 'Child' instead of missing.
    df['AgeBin'] = pd.cut(
        df['Age'],
        bins=bins,
        labels=labels,
        include_lowest=True
    )
    return df

In [21]:
# Add the engineered columns (AgeBin, CabinDeck, FamilySize).
train_df = age_binning(train_df)
train_df = create_cabindeck_feat(train_df)
train_df = create_familysize_feat(train_df)

# Fix the category order so that drop_first=True drops 'S' as the Embarked baseline.
# NOTE(review): values outside S/C/Q (including missing) become NaN here and
# therefore encode as all zeros, same as the 'S' baseline -- confirm the cleaned
# data has no missing Embarked.
train_df['Embarked'] = pd.Categorical(train_df["Embarked"], categories=['S','C','Q'], ordered=False)

# One-hot encode Pclass, AgeBin, Embarked and Title.
# Only these columns are handed to get_dummies, so the result holds nothing but
# indicator columns. Calling get_dummies on the full frame also returns every
# non-encoded column, and concatenating that back onto train_df duplicated all
# of them (including the Survived target) in the exported file.
encoded_cols = ["Pclass", "AgeBin", "Embarked", "Title"]
train_df_dummies = pd.get_dummies(train_df[encoded_cols], columns=encoded_cols, dtype=int, drop_first=True)

# Keep the original columns and append the indicator columns next to them.
train_df_encode = pd.concat([train_df, train_df_dummies], axis=1)
assert train_df_encode.columns.is_unique, "duplicate column names in encoded frame"

train_df_encode.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,AgeBin_Teen,AgeBin_Adult,AgeBin_Middle Aged,AgeBin_Senior,Embarked_C,Embarked_Q,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,...,0,1,0,0,0,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,...,0,1,0,0,1,0,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,...,0,1,0,0,0,0,1,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,...,0,1,0,0,0,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,...,0,1,0,0,0,0,0,1,0,0
5,6,0,3,"Moran, Mr. James",0,32.37,0,0,330877,8.4583,...,0,1,0,0,0,1,0,1,0,0
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,...,0,0,1,0,0,0,0,1,0,0
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,...,0,0,0,0,0,0,0,0,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,...,0,1,0,0,0,0,0,0,1,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,...,1,0,0,0,1,0,0,0,1,0


In [22]:
# Persist the feature-engineered training set; the row index is not written.
train_df_encode.to_csv(path_or_buf=train_out_path, index=False)
