## **Imports**

In [20]:
import os
import sys

import pandas as pd
from sklearn.model_selection import train_test_split

## **Paths & Global Variables**

In [21]:
# Project root that contains the `titanic` package. Set the TITANIC_ROOT
# environment variable to run this notebook on another machine; the original
# local checkout path is kept as the fallback so existing runs are unchanged.
ROOT_PATH = os.environ.get(
    "TITANIC_ROOT",
    r"C:\Users\mario\OneDrive\Documents\Work\Side Hustles\Kaggle\titanic",
)

# Re-running this cell must not stack duplicate entries on sys.path.
if ROOT_PATH not in sys.path:
    sys.path.append(ROOT_PATH)

# Kept below the sys.path update: presumably the package is only importable
# once ROOT_PATH is on the path.
from titanic.config import PROCESSED_DATA_DIR, MODELS_DIR

## **Reading Data**

In [22]:
# Load the processed dataset through an explicit path instead of os.chdir(),
# so reading a file does not change the kernel's working directory as a
# hidden side effect for every later cell.
final_df = pd.read_parquet(
    os.path.join(PROCESSED_DATA_DIR, "titanic_processed.parquet")
)

# Show the first row as a quick check of the loaded columns.
final_df.head(1)

Unnamed: 0_level_0,siblings_spouses,parents_children,num_rooms,age,fare,survived,passenger_class,full_name,ticket,cabin,...,cabin_level_1_F,cabin_level_1_G,cabin_level_1,full_name_target,surname_target,title_target,first_name_target,cabin_target,ticket_target,cabin_level_2_target
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,22.0,7.25,False,Lower,"Braund, Mr. Owen Harris",A/5 21171,Unknown,...,False,True,G,0.260287,0.260287,0.112709,0.260038,0.205975,0.260038,0.201034


In [23]:
# Summarise every column's dtype and non-null count for the loaded frame.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1307 entries, 1 to 1309
Data columns (total 50 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   siblings_spouses      1307 non-null   int64   
 1   parents_children      1307 non-null   int64   
 2   num_rooms             1307 non-null   int64   
 3   age                   1307 non-null   float64 
 4   fare                  1307 non-null   float64 
 5   survived              1307 non-null   bool    
 6   passenger_class       1307 non-null   category
 7   full_name             1307 non-null   string  
 8   ticket                1307 non-null   string  
 9   cabin                 1307 non-null   string  
 10  surname               1307 non-null   string  
 11  title                 1307 non-null   string  
 12  first_name            1307 non-null   string  
 13  cabin_level_2         1307 non-null   string  
 14  split                 1307 non-null   object  
 15  age_scale

In [24]:
# List all column names; the feature groups below are picked from these.
final_df.columns

Index(['siblings_spouses', 'parents_children', 'num_rooms', 'age', 'fare',
       'survived', 'passenger_class', 'full_name', 'ticket', 'cabin',
       'surname', 'title', 'first_name', 'cabin_level_2', 'split',
       'age_scaled', 'age_min_max', 'age_group', 'fare_log', 'age_group_ord',
       'passenger_class_ord', 'full_name_lcode', 'surname_lcode',
       'title_lcode', 'first_name_lcode', 'cabin_lcode', 'ticket_lcode',
       'cabin_level_2_lcode', 'sex_lcode', 'embarked_lcode',
       'cabin_level_1_lcode', 'sex_male', 'sex', 'embarked_Q', 'embarked_S',
       'embarked', 'cabin_level_1_B', 'cabin_level_1_C', 'cabin_level_1_D',
       'cabin_level_1_E', 'cabin_level_1_F', 'cabin_level_1_G',
       'cabin_level_1', 'full_name_target', 'surname_target', 'title_target',
       'first_name_target', 'cabin_target', 'ticket_target',
       'cabin_level_2_target'],
      dtype='object')

# **Training, Testing, Validation Split**

In [25]:
# Integer count columns (the "d" presumably stands for discrete).
numerical_d_features = ["siblings_spouses", "parents_children", "num_rooms"]

# Preview the first row of this group.
final_df.loc[:, numerical_d_features].head(1)

Unnamed: 0_level_0,siblings_spouses,parents_children,num_rooms
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,0


In [26]:
# Raw float-valued columns (the "c" presumably stands for continuous).
numerical_c_features = ["age", "fare"]

# Preview the first row of this group.
final_df.loc[:, numerical_c_features].head(1)

Unnamed: 0_level_0,age,fare
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,22.0,7.25


In [27]:
# Transformed counterparts of the raw age / fare columns, as named in the
# processed frame.
numerical_c_scaled_features = ["age_scaled", "age_min_max", "fare_log"]

# Preview the first row of this group.
final_df.loc[:, numerical_c_scaled_features].head(1)

Unnamed: 0_level_0,age_scaled,age_min_max,fare_log
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-0.560196,0.273456,2.110213


In [28]:
# Boolean columns. "survived" is the same column this notebook later uses as
# the prediction target.
binary_features = ["survived"]

# Preview the first row of this group.
final_df.loc[:, binary_features].head(1)

Unnamed: 0_level_0,survived
passenger_id,Unnamed: 1_level_1
1,False


In [29]:
# Ordered categories still holding their human-readable labels.
ordinal_features = ["passenger_class", "age_group"]

# Preview the first row of this group.
final_df.loc[:, ordinal_features].head(1)

Unnamed: 0_level_0,passenger_class,age_group
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Lower,Young Adult


In [30]:
# The "_ord" columns of the ordered categories (presumably their integer
# encodings).
ordinal_encoded_features = ["passenger_class_ord", "age_group_ord"]

# Preview the first row of this group.
final_df.loc[:, ordinal_encoded_features].head(1)

Unnamed: 0_level_0,passenger_class_ord,age_group_ord
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,0


In [31]:
# Unordered text / categorical columns in their raw form.
nominal_features = [
    "full_name", "ticket", "cabin",
    "surname", "title", "first_name",
    "cabin_level_2", "sex", "embarked",
]

# Preview the first row of this group.
final_df.loc[:, nominal_features].head(1)

Unnamed: 0_level_0,full_name,ticket,cabin,surname,title,first_name,cabin_level_2,sex,embarked
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,"Braund, Mr. Owen Harris",A/5 21171,Unknown,Braund,Mr,Owen Harris,0,male,S


In [32]:
# The "_lcode" columns (presumably label-encoded versions of the nominal ones).
# sex_lcode, embarked_lcode and cabin_level_1_lcode exist in the frame but are
# deliberately left out of this group:
#   "sex_lcode", "embarked_lcode", "cabin_level_1_lcode"
label_encoded_features = [
    "full_name_lcode", "surname_lcode", "title_lcode", "first_name_lcode",
    "cabin_lcode", "ticket_lcode", "cabin_level_2_lcode",
]

# Preview the first row of this group.
final_df.loc[:, label_encoded_features].head(1)

Unnamed: 0_level_0,full_name_lcode,surname_lcode,title_lcode,first_name_lcode,cabin_lcode,ticket_lcode,cabin_level_2_lcode
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,155,100,12,853,185,719,0


In [33]:
# Boolean indicator columns. "embarked_C" and "cabin_level_1_A" are not among
# the frame's columns (presumably the dropped reference levels), so they stay
# out of the list.
one_hot_encoded_features = [
    "embarked_Q", "embarked_S",
    "sex_male",
    "cabin_level_1_B", "cabin_level_1_C", "cabin_level_1_D",
    "cabin_level_1_E", "cabin_level_1_F", "cabin_level_1_G",
]

# Preview the first row of this group.
final_df.loc[:, one_hot_encoded_features].head(1)

Unnamed: 0_level_0,embarked_Q,embarked_S,sex_male,cabin_level_1_B,cabin_level_1_C,cabin_level_1_D,cabin_level_1_E,cabin_level_1_F,cabin_level_1_G
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,False,True,True,False,False,False,False,False,True


In [34]:
# The "_target" columns (presumably target-encoded versions of the
# high-cardinality nominal columns).
target_encoded_features = [
    "full_name_target", "surname_target", "title_target", "first_name_target",
    "cabin_target", "ticket_target", "cabin_level_2_target",
]

# Five rows rather than one, to show how the values vary across passengers.
final_df.loc[:, target_encoded_features].head(5)

Unnamed: 0_level_0,full_name_target,surname_target,title_target,first_name_target,cabin_target,ticket_target,cabin_level_2_target
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.260287,0.260287,0.112709,0.260038,0.205975,0.260038,0.201034
2,0.260287,0.0,0.4784,0.260038,0.0,0.260038,0.0
3,0.260038,0.260038,0.477756,0.260038,0.205282,0.260287,0.207034
4,0.260287,0.0,0.513829,0.260038,0.0,0.0,0.0
5,0.260038,1.0,0.106086,0.252014,0.197838,0.260287,0.201034


In [35]:
# Column the model has to predict.
target_variable = "survived"

# Feature groups that feed the model. Groups left commented out are excluded
# from this run; uncomment one to add its columns.
feature_groups = [
    numerical_d_features,
    # numerical_c_features,
    numerical_c_scaled_features,
    # binary_features,
    # ordinal_features,
    ordinal_encoded_features,
    # nominal_features,
    # label_encoded_features,
    one_hot_encoded_features,
    # target_encoded_features,
]
selected_features = [column for group in feature_groups for column in group]

# Row masks taken from the "split" column of the processed frame.
is_train = final_df["split"] == "train"
is_test = final_df["split"] == "test"

X = final_df.loc[is_train, selected_features]
y = final_df.loc[is_train, target_variable]

X_test = final_df.loc[is_test, selected_features]
y_test = final_df.loc[is_test, target_variable]

# Hold out 20% of the "train" rows for validation; the seed is fixed so the
# split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Row counts per split; features and labels should line up pairwise.
for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(len(features), len(labels))

711 711
178 178
418 418


## **Positivity Rate**

In [36]:
# Share of positive labels in each part of the (unstratified) hold-out split.
# The mean of a boolean Series is the fraction of True values.
print("Train positivity rate: ")
print(y_train.mean())

# This rate is computed on the validation labels (y_val), not on the test
# split, so the message names it accordingly.
print("Validation positivity rate: ")
print(y_val.mean())

Train positivity rate: 
0.38115330520393814
Test positivity rate: 
0.38764044943820225


## **Stratified Sampling**

In [37]:
# Redo the hold-out split, stratifying on y so that the train and validation
# parts keep the same share of positive labels despite the class imbalance.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The mean of a boolean Series is the fraction of True values.
print("Stratified train positivity rate: ")
print(y_train.mean())

# Computed on the validation labels (y_val), so the message says "validation"
# rather than "test".
print("Stratified validation positivity rate: ")
print(y_val.mean())

Stratified train positivity rate: 
0.38255977496483823
Stratified test positivity rate: 
0.38202247191011235


In [38]:
# Persist every split next to the processed dataset. Explicit paths replace
# os.chdir(), so writing files does not change the kernel's working directory.
# The label Series are converted with to_frame() because to_parquet is a
# DataFrame method.
split_outputs = {
    "X_train": X_train,
    "X_val": X_val,
    "X_test": X_test,
    "y_train": y_train.to_frame(),
    "y_val": y_val.to_frame(),
    "y_test": y_test.to_frame(),
}

# File names stay "<split name>.parquet", as before.
for split_name, split_frame in split_outputs.items():
    split_frame.to_parquet(
        os.path.join(PROCESSED_DATA_DIR, f"{split_name}.parquet")
    )