In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Absolute location of the cleaned training set that this notebook reads.
train_path = Path("../data/processed", "train_cleaned.csv").resolve()

# Directory that receives the engineered output; created on demand so the
# final write cannot fail on a missing folder.
out_dir = Path("../data/processed")
out_dir.mkdir(exist_ok=True, parents=True)

# Destination file for the feature-engineered training set.
train_out_path = out_dir.joinpath("train_features.csv")


In [3]:
# Load the cleaned training set from the CSV at train_path into a DataFrame.
train_df = pd.read_csv(train_path)

CabinDeck is included as a candidate feature because the deck letter may encode where on the ship a passenger was berthed and, indirectly, their class, both of which could relate to survival.

In [4]:
def create_cabindeck_feat(df):
    """Return a copy of ``df`` with a ``CabinDeck`` column added.

    ``CabinDeck`` is the first ASCII letter found in ``Cabin``, upper-cased.
    Rows where ``Cabin`` is missing or contains no letter get ``"Unknown"``.
    The input frame is left untouched.
    """
    result = df.copy()

    # First alphabetic character of the cabin string; NaN when there is none.
    deck_letter = result["Cabin"].str.extract(r"([A-Za-z])", expand=False)

    # Normalise case, then label every missing deck explicitly.
    deck_letter = deck_letter.str.upper()
    result["CabinDeck"] = deck_letter.fillna("Unknown")
    return result

FamilySize is included as a candidate feature because traveling alone versus with family may capture behavior or access patterns that could relate to survival.

In [5]:
def create_familysize_feat(df):
    """Return a copy of ``df`` with a ``FamilySize`` column added.

    ``FamilySize`` is ``SibSp + Parch + 1``; the ``+ 1`` counts the passenger
    themself, so a passenger with no relatives aboard has a size of 1.
    The input frame is left untouched.
    """
    featured = df.copy()
    relatives_aboard = featured["SibSp"] + featured["Parch"]
    featured["FamilySize"] = relatives_aboard + 1
    return featured

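AgeBin is a new column that classifies passengers into age groups: Child (up to 12), Teen (over 12 to 19), Adult (over 19 to 39), Middle Aged (over 39 to 59), and Senior (over 59).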

In [6]:
def age_binning(df):
    """Return a copy of ``df`` with a categorical ``AgeBin`` column added.

    Ages are bucketed into right-closed intervals:
    ``[0, 12]`` Child, ``(12, 19]`` Teen, ``(19, 39]`` Adult,
    ``(39, 59]`` Middle Aged and ``(59, inf)`` Senior.
    Missing or negative ages yield a missing ``AgeBin``.

    The input frame is not modified, matching the other feature helpers in
    this notebook, so re-running a cell never changes an earlier frame.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # Bin edges and the label assigned to each interval between them.
    bins = [0, 12, 19, 39, 59, float("inf")]
    labels = ['Child', 'Teen', 'Adult', 'Middle Aged', 'Senior']

    # include_lowest=True closes the first interval on the left so that an
    # age of exactly 0 is labelled 'Child' instead of becoming missing.
    df['AgeBin'] = pd.cut(
        df['Age'],
        bins=bins,
        labels=labels,
        include_lowest=True
    )
    return df

In [19]:
# Derive the engineered columns (AgeBin, CabinDeck, FamilySize).
train_df = age_binning(train_df)
train_df = create_cabindeck_feat(train_df)
train_df = create_familysize_feat(train_df)

# One-hot encode Pclass, AgeBin, Embarked and Title.
# drop_first=True omits the first level of each feature, so that level is
# represented by all of the feature's indicator columns being 0.
train_df_dummies = pd.get_dummies(
    train_df,
    columns=["Pclass", "AgeBin", "Embarked", "Title"],
    dtype=int,
    drop_first=True,
)

# get_dummies returns the whole frame (encoded columns replaced by their
# indicators), so concatenating it wholesale would duplicate every
# non-encoded column. Append only the newly created indicator columns, which
# keeps the original categorical columns alongside their encodings.
dummy_cols = [col for col in train_df_dummies.columns if col not in train_df.columns]
train_df_encode = pd.concat([train_df, train_df_dummies[dummy_cols]], axis=1)

# Guard against duplicated column names ending up in the saved CSV.
assert train_df_encode.columns.is_unique, "duplicate column names after encoding"

train_df_encode[["Pclass", "Pclass_2", "Pclass_3"]].head(10)

Unnamed: 0,Pclass,Pclass_2,Pclass_3
0,3,0,1
1,1,0,0
2,3,0,1
3,1,0,0
4,3,0,1
5,3,0,1
6,1,0,0
7,3,0,1
8,3,0,1
9,2,1,0


In [None]:
# Write the encoded training frame to train_out_path; index=False leaves the
# row index out of the CSV.
train_df_encode.to_csv(train_out_path, index=False)
