In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Dummy Submission (All-Positive)

In [2]:
def dummy_submit():
    """Write an all-positive baseline submission to ``submission.csv``.

    Reads the competition's sample submission file, sets every row's
    ``Transported`` value to ``True`` and saves the result without the
    index column.
    """
    sample = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
    all_positive = sample.assign(Transported=True)
    all_positive.to_csv('submission.csv', index=False)
    
# Baseline only, not a real submission: keep the call below commented out unless you want the all-True file written.
# dummy_submit()

# Pre-processing

In [3]:
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv') # Does NOT contain y -> 'Transported'

# Replace whitespace-only cells with NaN.
# DataFrame.replace returns a new frame rather than modifying in place, so the
# result has to be assigned back -- otherwise the cleanup is silently discarded.
train_df = train_df.replace(r'^\s+$', np.nan, regex=True)
test_df = test_df.replace(r'^\s+$', np.nan, regex=True)

def splitData(dataframe):
    """Derive group and cabin features from the raw identifier columns.

    Adds five columns to ``dataframe`` in place and returns the same object:

    * ``GroupId`` / ``IndividualId`` -- the parts of ``PassengerId`` before
      and after the underscore.
    * ``Deck`` / ``Number`` / ``Side`` -- the three slash-separated parts of
      ``Cabin``; rows with a missing cabin get NaN in all three.

    ``GroupId`` and ``Number`` are parsed into numbers because the notebook
    treats them as numeric features (median imputation, scaling); text that
    cannot be parsed becomes NaN. ``IndividualId``, ``Deck`` and ``Side``
    stay as text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing string columns ``PassengerId`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object with the new columns added.
    """
    # Split each source column once and reuse the pieces.
    id_parts = dataframe["PassengerId"].str.split("_")
    dataframe["GroupId"] = pd.to_numeric(id_parts.str[0], errors="coerce")
    dataframe["IndividualId"] = id_parts.str[1]

    cabin_parts = dataframe["Cabin"].str.split("/")
    dataframe["Deck"] = cabin_parts.str[0]
    dataframe["Number"] = pd.to_numeric(cabin_parts.str[1], errors="coerce")
    dataframe["Side"] = cabin_parts.str[2]

    return dataframe

# Add the derived PassengerId / Cabin columns to both frames.
train_df, test_df = splitData(train_df), splitData(test_df)

In [4]:
# Column roles: which columns each preprocessing step operates on.
numeric_features = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "GroupId",
    "Number",
]
categorical_features = [
    "HomePlanet",
    "Destination",
    "Deck",
]
ordinal_features = []
binary_features = [
    "CryoSleep",
    "VIP",
    "Side",
]
# do not include these features in modeling
drop_features = [
    "Name",
    "PassengerId",
    "IndividualId",
    "Cabin",
]
# do not apply any transformation
passthrough_features = []

target = "Transported"

# Training data: the label column is separated from the modelling features.
y_train = train_df[target]
X_train = train_df.drop(columns=[target] + drop_features)

# test_df does NOT contain target, so only the unused columns are removed.
X_test = test_df.drop(columns=drop_features)

In [5]:
def fillNaN(dataframe):
    """Run the three imputation passes over ``dataframe`` and return the result.

    The passes run in a fixed order -- numeric, then categorical, then
    binary -- each receiving the frame returned by the previous one.
    """
    for impute in (fillNumeric, fillCategorical, fillBinary):
        dataframe = impute(dataframe)
    return dataframe

def fillNumeric(dataframe, features=None):
    """Impute missing values in numeric columns with each column's median.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to impute; the listed columns are overwritten in place.
    features : list of str, optional
        Columns to impute. Defaults to the notebook-level
        ``numeric_features`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    columns = numeric_features if features is None else features
    for item in columns:
        # Some numeric features (e.g. pieces cut out of "PassengerId" or
        # "Cabin") can arrive as text. Coerce first so the median is computed
        # on numbers; text that cannot be parsed becomes NaN and is imputed
        # along with the genuinely missing values.
        values = pd.to_numeric(dataframe[item], errors="coerce")
        dataframe[item] = values.fillna(values.median())
    return dataframe

def fillCategorical(dataframe, features=None):
    """Impute missing values in categorical columns with the most frequent value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to impute; the listed columns are overwritten in place.
    features : list of str, optional
        Columns to impute. Defaults to the notebook-level
        ``categorical_features`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    columns = categorical_features if features is None else features
    for item in columns:
        # Series.mode() returns a Series (there may be ties). Passing that
        # Series straight to fillna aligns on the index, so only rows whose
        # label matches a label in the mode Series would be filled. Use the
        # first mode as a scalar fill value instead.
        modes = dataframe[item].mode()
        if modes.empty:
            # Every value is missing: there is no mode to fill with.
            continue
        dataframe[item] = dataframe[item].fillna(modes.iloc[0])
    return dataframe

def fillBinary(dataframe, features=None):
    """Impute missing values in binary columns with the most frequent value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to impute; the listed columns are overwritten in place.
    features : list of str, optional
        Columns to impute. Defaults to the notebook-level
        ``binary_features`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    columns = binary_features if features is None else features
    for item in columns:
        # Series.mode() returns a Series; handing it to fillna would align on
        # the index and leave almost every missing row untouched. Fill with
        # the first mode as a scalar instead.
        modes = dataframe[item].mode()
        if modes.empty:
            # Every value is missing: there is no mode to fill with.
            continue
        dataframe[item] = dataframe[item].fillna(modes.iloc[0])
    return dataframe

# Each frame is snapshotted to CSV, imputed from its own column statistics,
# then snapshotted again so the effect of the imputation can be compared.

# Training features
X_train.to_csv('X_train_before.csv', index=False)
X_train = fillNaN(X_train)
X_train.to_csv('X_train_after.csv', index=False)

# Test features
X_test.to_csv('X_test_before.csv', index=False)
X_test = fillNaN(X_test)
X_test.to_csv('X_test_after.csv', index=False)

### Create Column Transformer

In [6]:
# Categorical columns: fill any remaining gaps with the literal "unknown",
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_encoder = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown"),
    OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
)

# Binary columns: one-hot encode, dropping one column when a feature has
# exactly two categories.
binary_encoder = OneHotEncoder(sparse_output=False, drop="if_binary")

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (categorical_encoder, categorical_features),
    (binary_encoder, binary_features),
    remainder='passthrough',  # Keep non-transformed columns
)