In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [5]:
# Load the training set and discard the free-text Name column, which is not used as a feature.
train = pd.read_csv("data/train.csv").drop(columns='Name')

# Features are every remaining column except the target; the target is `Transported`.
train_X = train.drop(columns='Transported')
train_y = train['Transported']
train_X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


### Missing Data Handling

In [6]:
def handle_missing(data): # for those that do not need manipulation
    """Fill missing values, in place, for the columns that need no further parsing.

    Imputation rules:
      * ``CryoSleep`` / ``VIP``: missing -> ``False``, then cast to ``bool``.
      * ``HomePlanet`` / ``Destination``: missing -> most frequent value of the column.
      * ``Age``: missing -> median of the column.
      * ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``: missing -> 0.

    ``Cabin`` is not touched here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for flag in ('CryoSleep', 'VIP'):
        # Cast explicitly: relying on fillna to downcast an object column to bool is
        # deprecated pandas behaviour, and without the downcast these columns would
        # stay object-typed and be one-hot encoded under different names later on.
        data[flag] = data[flag].fillna(False).astype(bool)

    for categorical in ('HomePlanet', 'Destination'):
        most_frequent = data[categorical].mode().iloc[0]
        data[categorical] = data[categorical].fillna(most_frequent)

    data['Age'] = data['Age'].fillna(data['Age'].median())

    numerical_var = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    data[numerical_var] = data[numerical_var].fillna(0)

    return data

# handle_missing fills train_X in place, so its return value is not needed here.
handle_missing(train_X)
# Remaining nulls per column; Cabin is not filled by handle_missing.
train_X.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

### Pipeline

In [7]:
from sklearn.preprocessing import StandardScaler

# function to split passenger into group and id columns
def split_passenger(data, col):
    """Split an identifier column of the form ``<group>_<id>`` into numeric columns.

    Adds ``Group`` and ``Id`` columns to ``data`` (values that cannot be parsed as
    numbers become NaN) and removes the original ``col``. ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the identifier column.
    col : str
        Name of the identifier column to split on ``'_'``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    parts = data[col].str.split('_', expand=True)
    # The conversion result has to be assigned; computing it without assignment
    # leaves Group and Id as strings.
    data[['Group', 'Id']] = parts.apply(pd.to_numeric, errors='coerce')
    data.drop(col, inplace=True, axis=1)
    return data

# function to create binary variable on whether passenger is alone or in a group, also handles those with missing passengerid
def passenger_groups(data, col1, col2):
    """Replace the group / id columns with a single 0/1 "travels in a group" flag.

    A row is flagged 1 when its ``col1`` group contains more than one non-null
    ``col2`` value and the row's own ``col2`` is not null; otherwise 0. The flag is
    stored in a new column named ``f'{col1}_True'`` and both ``col1`` and ``col2``
    are dropped. ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the two columns.
    col1 : str
        Column identifying the group.
    col2 : str
        Column identifying the member within the group.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Number of non-null col2 entries in each row's group, aligned to data's index.
    counts = data.groupby(col1)[col2].transform('count')
    # Vectorized instead of a row-wise apply with counts.loc[row label]: faster, and
    # it does not break when the index contains duplicate labels.
    in_group = (counts > 1) & data[col2].notna()
    data[f'{col1}_True'] = in_group.astype(int)
    data.drop([col1, col2], inplace=True, axis=1)
    return data

def dummies(data, list_cols):
    """Return a new frame in which ``list_cols`` are replaced by their encoded form.

    The selected columns are passed through ``pd.get_dummies`` with
    ``drop_first=True`` and the whole encoded frame is cast to int (this includes
    any selected column that ``get_dummies`` passes through unchanged). The
    encoded columns are appended after the remaining columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    list_cols : list of str
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``list_cols``, followed by the encoded columns.
    """
    encoded = pd.get_dummies(data=data[list_cols], drop_first=True).astype(int)
    untouched = data.drop(columns=list_cols)
    return pd.concat([untouched, encoded], axis=1)

def continuous_log_normal(data, list_cols):
    """Log-transform and standardize the given columns of ``data`` in place.

    Each column in ``list_cols`` is replaced by ``log1p`` of its values and then
    standardized with a ``StandardScaler`` that is created and fitted inside this
    call, on the data passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns; modified in place.
    list_cols : list of str
        Columns to transform.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    logged = np.log1p(data[list_cols])
    data[list_cols] = StandardScaler().fit_transform(logged)
    return data

def age_categories(data, col):
    """Replace an age column with two boolean bucket flags, in place.

    Adds ``Young_True`` (age <= 15) and ``Old_True`` (age > 45) to ``data`` and then
    drops ``col``. Missing ages compare as False for both flags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the age column; modified in place.
    col : str
        Name of the age column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    ages = data[col]
    data['Young_True'] = ages.le(15)
    data['Old_True'] = ages.gt(45)
    data.drop(columns=col, inplace=True)
    return data

def cabin_separator(data, col):
    """Turn a ``deck/number/side`` cabin column into model-ready features.

    Steps:
      1. Split ``col`` on ``'/'`` into ``Deck``, ``Number`` and ``Side`` and drop ``col``
         (these changes are made on ``data`` itself).
      2. Parse ``Number`` as numeric, fill missing values with the median, and
         standardize it with a ``StandardScaler`` fitted inside this call.
      3. Encode ``Deck`` and ``Side`` via ``pd.get_dummies(drop_first=True)`` cast to int.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the cabin column; modified in place by steps 1 and 2.
    col : str
        Name of the cabin column.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``data`` without ``Deck`` / ``Side``, followed by their encoded columns.
    """
    data[['Deck', 'Number', 'Side']] = data[col].str.split('/', expand=True)
    data.drop(columns=col, inplace=True)

    # Cabin number: numeric parse -> median imputation -> standardization.
    cabin_number = pd.to_numeric(data['Number'], errors='coerce')
    cabin_number = cabin_number.fillna(cabin_number.median())
    data['Number'] = StandardScaler().fit_transform(cabin_number.to_frame())

    # Deck and side: encoded columns appended after the remaining columns.
    encoded = pd.get_dummies(data[['Deck', 'Side']], drop_first=True).astype(int)
    remaining = data.drop(columns=['Deck', 'Side'])
    return pd.concat([remaining, encoded], axis=1)
    

In [8]:
def modify_data(df):
    """Run the full feature-engineering pipeline and return the transformed frame.

    Steps, in order: split ``PassengerId`` into group / id, derive the in-group
    flag, encode ``HomePlanet`` / ``Destination`` / ``CryoSleep`` / ``VIP``,
    log-scale and standardize the spending columns, bucket ``Age``, and expand
    ``Cabin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with missing values already handled (``Cabin`` may still contain
        nulls). It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features.
    """
    # Several helpers mutate their argument in place; working on a copy keeps the
    # caller's frame from being left half-transformed.
    df = df.copy()
    df = split_passenger(df, 'PassengerId')
    df = passenger_groups(df, 'Group', 'Id')
    df = dummies(df, ['HomePlanet', 'Destination', 'CryoSleep', 'VIP'])
    df = continuous_log_normal(df, ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])
    df = age_categories(df, 'Age')
    df = cabin_separator(df, 'Cabin')
    return df

# Rebind train_X to its engineered version. The result no longer has a PassengerId
# column, so this line cannot be re-run without first rebuilding train_X.
train_X = modify_data(train_X)

In [9]:
# Preview the first rows of the engineered training features.
train_X.head()

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group_True,CryoSleep,VIP,HomePlanet_Europa,HomePlanet_Mars,...,Old_True,Number,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_S
0,-0.638181,-0.65008,-0.622995,-0.664035,-0.640034,0,0,0,1,0,...,False,-1.177238,1,0,0,0,0,0,0,0
1,1.090491,0.13504,0.646081,1.614565,0.745894,0,0,0,0,0,...,False,-1.177238,0,0,0,0,1,0,0,1
2,0.753511,2.139858,-0.622995,2.518191,0.784254,1,0,1,1,0,...,True,-1.177238,0,0,0,0,0,0,0,1
3,-0.638181,1.790516,1.6825,2.264863,1.277886,1,0,0,1,0,...,False,-1.177238,0,0,0,0,0,0,0,1
4,1.464342,0.80338,1.333879,1.62492,-0.240051,0,0,0,0,0,...,False,-1.175264,0,0,0,0,1,0,0,1


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

# Logistic regression with scikit-learn's default hyperparameters,
# fitted on the engineered training features.
log = LogisticRegression()
log.fit(train_X, train_y)

In [11]:
test = pd.read_csv("data/test.csv")
test_X = test.drop('Name', axis=1) # drop name as it is irrelevant here

# Same preparation as for the training features.
handle_missing(test_X)
test_X = modify_data(test_X)

# The dummy columns are derived from whatever categories occur in the frame being
# encoded, so the test frame can end up with a different column set than the
# training frame. Align to the training columns (same names, same order); a dummy
# column absent from the test data is filled with 0.
# NOTE(review): modify_data fits new StandardScalers on the frame it is given, so
# the test features are standardized with test-set statistics, not training ones.
test_X = test_X.reindex(columns=train_X.columns, fill_value=0)
predictions = log.predict(test_X)

In [18]:
# Submission table: the original passenger ids from the raw test file, paired with
# the model's prediction for each row.
submission = test[['PassengerId']].assign(Transported=predictions)

In [19]:
# Write the submission to disk; index=False leaves the DataFrame index out of the file.
submission.to_csv('log_submission.csv', index=False)