In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Inputs: the cleaned train/test splits, resolved to absolute paths.
train_path, test_path = (
    Path(raw_location).resolve()
    for raw_location in (
        "../data/processed/train_cleaned.csv",
        "../data/processed/test_cleaned.csv",
    )
)

# Outputs: the feature-augmented splits are written into the processed-data
# folder, which is created here if it does not exist yet.
out_dir = Path("../data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

train_out_path = out_dir.joinpath("train_features.csv")
test_out_path = out_dir.joinpath("test_features.csv")

In [3]:
# Read both cleaned splits with pandas' default CSV settings.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

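CabinDeck (the deck letter taken from the Cabin value, or "Unknown" when no letter is available) is included as a candidate feature because the deck may encode where a passenger was located on the ship and their class, both of which could relate to survival.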

In [4]:
def create_cabindeck_feat(df):
    """Return a copy of ``df`` with a ``CabinDeck`` column added.

    ``CabinDeck`` is the first ASCII letter found in each ``Cabin`` value,
    upper-cased. Rows where no letter is found (including missing ``Cabin``
    values) receive the string ``"Unknown"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string-valued ``Cabin`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    # expand=False makes extract return a Series rather than a one-column frame.
    first_letter = df["Cabin"].str.extract(r"([A-Za-z])", expand=False)
    deck = first_letter.str.upper().fillna("Unknown")
    # assign() works on a copy, so the caller's frame is not modified.
    return df.assign(CabinDeck=deck)

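FamilySize (SibSp + Parch + 1, i.e. the passenger plus the siblings/spouses and parents/children travelling with them) is included as a candidate feature because traveling alone versus with family may capture behavior or access patterns that could relate to survival.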

In [5]:
def create_familysize_feat(df):
    """Return a copy of ``df`` with a ``FamilySize`` column added.

    ``FamilySize`` is ``SibSp + Parch + 1``; the ``+ 1`` counts the passenger
    themselves, so a passenger travelling alone has a family size of 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing numeric ``SibSp`` and ``Parch`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    family_size = df["SibSp"] + df["Parch"] + 1
    # assign() works on a copy, so the caller's frame is not modified.
    return df.assign(FamilySize=family_size)

In [6]:
# Apply the same two feature builders, in the same order, to both splits.
train_df = train_df.pipe(create_cabindeck_feat).pipe(create_familysize_feat)
test_df = test_df.pipe(create_cabindeck_feat).pipe(create_familysize_feat)

# Preview the first rows to confirm the new columns are present.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,CabinDeck,FamilySize
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,Unknown,2
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,2,C,2
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Unknown,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,2,C,2
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,Unknown,1


In [7]:
# Write both splits without the index column. The same output paths are
# written again at the end of the notebook, once AgeBin has been added.
for frame, destination in ((train_df, train_out_path), (test_df, test_out_path)):
    frame.to_csv(destination, index=False)

Create a new column, AgeBin, that classifies each passenger into one of five age groups: Child, Teen, Adult, Middle Aged, or Senior.

In [8]:
def age_binning(
    df,
    bins=(0, 12, 19, 39, 59, float("inf")),
    labels=('Child', 'Teen', 'Adult', 'Middle Aged', 'Senior'),
):
    """Return a copy of ``df`` with a categorical ``AgeBin`` column added.

    Ages are bucketed with ``pandas.cut`` using right-closed intervals, so
    with the default edges an age of exactly 12 is ``'Child'`` and 19 is
    ``'Teen'``. ``include_lowest=True`` makes the first interval include its
    left edge, so an age of 0 is binned as well. Missing ages, and ages that
    fall outside the bin edges, produce a missing ``AgeBin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``Age`` column.
    bins : sequence of numbers, optional
        Monotonically increasing bin edges. Defaults to
        ``0, 12, 19, 39, 59, inf``.
    labels : sequence of str, optional
        One label per interval (``len(bins) - 1`` entries).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched, matching the other
        feature builders in this notebook.
    """
    # Work on a copy so the caller's frame is never mutated in place.
    df = df.copy()

    df['AgeBin'] = pd.cut(
        df['Age'],
        bins=list(bins),
        labels=list(labels),
        include_lowest=True
    )
    return df

In [9]:
# Add the AgeBin column to both splits.
train_df = train_df.pipe(age_binning)
test_df = test_df.pipe(age_binning)

# Preview the first rows to confirm AgeBin is present.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,CabinDeck,FamilySize,AgeBin
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,Unknown,2,Adult
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,2,C,2,Adult
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Unknown,1,Adult
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,2,C,2,Adult
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,Unknown,1,Adult


In [10]:
# Write the final feature sets (now including AgeBin) without the index
# column, overwriting any earlier files at the same paths.
for frame, destination in ((train_df, train_out_path), (test_df, test_out_path)):
    frame.to_csv(destination, index=False)