In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Dummy Submission (All-Positive)

In [None]:
def dummy_submit(
    sample_path='../input/spaceship-titanic/sample_submission.csv',
    output_path='submission.csv',
    value=True,
):
    """
    Write a baseline submission that predicts the same value for every row.

    Reads the sample submission file, overwrites its "Transported" column
    with ``value``, and writes the result as CSV without the index column.

    Parameters
    ----------
    sample_path : str or path-like
        CSV file to use as the submission template.
    output_path : str or path-like
        Where the submission CSV is written.
    value : bool
        Constant prediction assigned to every row of "Transported".

    Returns
    ----------
        None
    """
    submission_df = pd.read_csv(sample_path)
    submission_df['Transported'] = value
    submission_df.to_csv(output_path, index=False)
    
# Baseline only (not a real model submission): uncomment the call below to write the all-True submission file.
# dummy_submit()

# Pre-processing

In [None]:
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')

# DataFrame.replace returns a new frame rather than modifying in place, so the
# result has to be assigned back; otherwise whitespace-only cells never become
# NaN and slip past the dropna() below.
train_df = train_df.replace(r'^\s+$', np.nan, regex=True)

# TODO: Change if necessary
train_df = train_df.dropna()  # Drop every row that contains at least one NaN

# Split "PassengerId" into "GroupId" & "IndividualId" (split once, reuse the parts)
passenger_parts = train_df["PassengerId"].str.split("_")
# GroupId is later scaled as a numeric feature, so convert it explicitly
# instead of relying on the scaler to coerce strings to floats.
train_df["GroupId"] = pd.to_numeric(passenger_parts.str[0])
train_df["IndividualId"] = passenger_parts.str[1]

# Split "Cabin" into "Deck", "Number", & "Side"
cabin_parts = train_df["Cabin"].str.split("/")
train_df["Deck"] = cabin_parts.str[0]
# Number is also used as a numeric feature downstream; same explicit conversion.
train_df["Number"] = pd.to_numeric(cabin_parts.str[1])
train_df["Side"] = cabin_parts.str[2]

# A bare expression in the middle of a cell is evaluated and discarded,
# so print the shape to actually show it.
print(f"train_df shape: {train_df.shape}")
train_df.sort_index()

### Create Column Transformer

In [None]:
# Column groups: each list names the columns that receive one kind of preprocessing.
numeric_features = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "GroupId",
    "Number",
]
categorical_features = [
    "HomePlanet",
    "Destination",
    "Deck",
]
ordinal_features = []
binary_features = [
    "CryoSleep",
    "VIP",
    "Side",
]
# Columns excluded from modeling altogether.
drop_features = [
    "Name",
    "PassengerId",
    "IndividualId",
    "Cabin",
]
# Columns that would be handed to the model untouched (none at the moment).
passthrough_features = []

target = "Transported"

# Categorical columns: fill gaps with the literal "unknown", then one-hot encode.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="unknown"),
    OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
)

# Two-valued columns: one-hot encode with drop="if_binary".
binary_encoder = OneHotEncoder(sparse_output=False, drop="if_binary")

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (categorical_pipeline, categorical_features),
    (binary_encoder, binary_features),
    remainder="passthrough",  # Keep any column not listed above as-is
)

# Features are everything except the target and the explicitly dropped columns.
X_train = train_df.drop(columns=[target] + drop_features)
y_train = train_df[target]

print(f"X train: \n{X_train}\n")
print(f"y train: \n{y_train}\n")

In [None]:
# from class code
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarize every metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate``

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)" with three
        decimals, indexed by the keys returned from ``cross_validate``
    """

    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values by iteration instead of integer-indexing the Series:
    # both Series are labelled by metric name, and integer keys on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
results_dict = {}

# Fixed seed so the decision tree produces the same scores on every
# Restart & Run All; without it the tree's results may vary between runs.
RANDOM_STATE = 42

models = {
    "decision tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "kNN": KNeighborsClassifier(),
    "RBF SVM": SVC(),
}

# `item` and `testpipline` keep their original names in case later cells refer to them.
for item, estimator in models.items():
    # Bundle preprocessing with the estimator so cross-validation evaluates both together.
    testpipline = make_pipeline(preprocessor, estimator)

    results_dict[f"preprocessing w/ {item}"] = mean_std_cross_val_scores(testpipline, X_train, y_train)

pd.DataFrame(results_dict)