In [201]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Dummy Submission (All-Positive)

In [202]:
def dummy_submit():
    submission_df = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
    submission_df['Transported'] = True
    submission_df.to_csv('submission.csv', index=False)
    
# This is dummy, not real submit, comment out when you don't want dummy
# dummy_submit()

# Pre-processing

In [224]:
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv') # Does NOT contain y -> 'Transported'

# Replaces whitespace with NaN
train_df.replace(r'^\s+$', np.nan, regex=True) 
test_df.replace(r'^\s+$', np.nan, regex=True)

def splitData(dataframe):
    # Split "PassengerId" into "GroupId" & "IndividualId"
    dataframe["GroupId"] = dataframe["PassengerId"].str.split("_").str[0]
    dataframe["IndividualId"] = dataframe["PassengerId"].str.split("_").str[1]

    dataframe["GroupId"] = pd.to_numeric(dataframe["GroupId"], errors="ignore", downcast="integer")
    dataframe["IndividualId"] = pd.to_numeric(dataframe["IndividualId"], errors="ignore", downcast="integer")

    # Split "Cabin" into "Deck", "Number", & "Side"
    dataframe["Deck"] = dataframe["Cabin"].str.split("/").str[0]
    dataframe["Number"] = dataframe["Cabin"].str.split("/").str[1]
    dataframe["Side"] = dataframe["Cabin"].str.split("/").str[2]

    dataframe.shape
    dataframe.sort_index()

    return dataframe

train_df = splitData(train_df)
test_df = splitData(test_df)

In [287]:
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "GroupId", "Number"] 
# categorical_features = ["HomePlanet", "Destination", "Deck", "CryoSleep", "VIP", "Side"]
# drop_features = ["Name", "PassengerId", "IndividualId", "Cabin"]  # do not include these features in modeling
categorical_features = ["HomePlanet", "Destination", "Deck", "CryoSleep", "VIP", "Side", "IndividualId"]
drop_features = ["Name", "PassengerId", "Cabin"]  # do not include these features in modeling
passthrough_features = [] # do not apply any transformation

target = "Transported"

X_train = train_df.drop(columns=[target, *drop_features])
y_train = train_df[target]

X_test = test_df.drop(columns=[*drop_features]) # test_df does NOT contain target

In [270]:
def fillVIP(dataframe):
    BEFORE = dataframe["VIP"].isnull().sum() # The print statement below doesn't like this inside it.
    print(f"Number of empty VIP cells BEFORE: { BEFORE }")

    # No passenger under the age 18 is a VIP
    ageVIPIndex = dataframe[(dataframe["VIP"].isnull() == True) & (dataframe["Age"] < 18)][["VIP"]].index
    dataframe["VIP"][ageVIPIndex] = False

    # No passenger from Earth is a VIP
    earthVIPIndex = dataframe[(dataframe["VIP"].isnull() == True) & (dataframe["HomePlanet"] == "Earth")][["VIP"]].index
    dataframe["VIP"][earthVIPIndex] = False

    AFTER = dataframe["VIP"].isnull().sum()
    print(f"Number of empty VIP cells AFTER: { AFTER }")

In [278]:
def fillGroup(dataframe):
    HPBEFORE = dataframe["HomePlanet"].isnull().sum()
    DESTBEFORE = dataframe["Destination"].isnull().sum()
    print(f"Number of empty HomePlanet cells BEFORE: { HPBEFORE }")
    print(f"Number of empty Destination cells BEFORE: { DESTBEFORE }")

    # A passenger in a group will have the same HomePlanet and Destination as everyone else in their group
    homePlanetIndex = dataframe[(dataframe["HomePlanet"].isnull() == True)].index
    destinationIndex = dataframe[(dataframe["Destination"].isnull() == True)].index

    for i in homePlanetIndex:
        if (dataframe["GroupId"][i] == dataframe["GroupId"][i-1]):
            dataframe.at[i, "HomePlanet"] = dataframe.at[i-1, "HomePlanet"]
        elif (dataframe["GroupId"][i] == dataframe["GroupId"][i+1]):
            dataframe.at[i, "HomePlanet"] = dataframe.at[i+1, "HomePlanet"]
    for i in destinationIndex:
        if (dataframe["GroupId"][i] == dataframe["GroupId"][i-1]):
            dataframe.at[i, "Destination"] = dataframe.at[i-1, "Destination"]
        elif (dataframe["GroupId"][i] == dataframe["GroupId"][i+1]):
            dataframe.at[i, "Destination"] = dataframe.at[i+1, "Destination"]        

    # A passenger in a group will have the same Cabin as everyone else in their group
    DECKBEFORE = dataframe["Deck"].isnull().sum()
    print(f"Number of empty Cabin cells BEFORE: { DECKBEFORE }")
    
    deckIndex = dataframe[(dataframe["Deck"].isnull() == True)].index

    for i in deckIndex:
        if (dataframe["GroupId"][i] == dataframe["GroupId"][i-1]):
            dataframe.at[i, "Deck"] = dataframe.at[i-1, "Deck"]
            dataframe.at[i, "Number"] = dataframe.at[i-1, "Number"]
            dataframe.at[i, "Side"] = dataframe.at[i-1, "Side"]
        elif (dataframe["GroupId"][i] == dataframe["GroupId"][i+1]):
            dataframe.at[i, "Deck"] = dataframe.at[i+1, "Deck"]
            dataframe.at[i, "Number"] = dataframe.at[i+1, "Number"]
            dataframe.at[i, "Side"] = dataframe.at[i+1, "Side"]            
    
    HPAFTER = dataframe["HomePlanet"].isnull().sum()
    DESTAFTER = dataframe["Destination"].isnull().sum()
    DECKAFTER = dataframe["Deck"].isnull().sum()
    print(f"Number of empty HomePlanet cells AFTER: { HPAFTER }")
    print(f"Number of empty Destination cells AFTER: { DESTAFTER }")
    print(f"Number of empty Cabin cells AFTER: { DECKAFTER }")

In [285]:
def fillSpend(df):
    CRYOBEFORE = df["CryoSleep"].isnull().sum()
    print(f"Number of empty CryoSleep cells BEFORE: { CRYOBEFORE }")

    # If people are spending, they are not asleep
    isNotAsleep = df[(df["CryoSleep"].isnull() == True) & (
            (df["Spa"] > 0) | 
            (df["RoomService"] > 0) | 
            (df["FoodCourt"] > 0) | 
            (df["VRDeck"] > 0) | 
            (df["ShoppingMall"] > 0)) 
        ].index
    df["CryoSleep"][isNotAsleep] = False
    
    CRYOAFTER = df["CryoSleep"].isnull().sum()
    print(f"Number of empty CryoSleep cells AFTER: { CRYOAFTER }")

    # If people are aspleep, they are spending 0 money
    stores = ["Spa", "RoomService", "FoodCourt", "VRDeck", "ShoppingMall"]
    filledNull = 0
    for store in stores:
        storeBEFORE = df[store].isnull().sum()
        print(f"Number of empty { store } cells BEFORE: { storeBEFORE }")

        isAsleep = df[(df["CryoSleep"] == True) & (df[store].isnull() == True) ].index
        filledNull += isAsleep.shape[0]
        df[store][isAsleep] = 0

        storeAFTER = df[store].isnull().sum()
        print(f"Number of empty { store } cells AFTER: { storeAFTER }")

    # print(f"There are {df["CryoSleep"].isnull().sum()} null values left in sleep")
    print(f"Filled {filledNull} null spendings")

    # A passenger under the age of 13 won't have any money
    for store in stores:
        spendIndex = df[(df[store].isnull() == True) & (df["Age"] < 13)].index
        df[store][spendIndex] = 0

In [288]:
X_trainCPY = X_train.copy()
X_testCPY = X_test.copy()

fillVIP(X_trainCPY)
fillGroup(X_trainCPY)
fillSpend(X_trainCPY)

fillVIP(X_testCPY)
fillGroup(X_testCPY)
fillSpend(X_testCPY)

Number of empty VIP cells BEFORE: 203
Number of empty VIP cells AFTER: 76
Number of empty HomePlanet cells BEFORE: 201
Number of empty Destination cells BEFORE: 182
Number of empty Cabin cells BEFORE: 199
Number of empty HomePlanet cells AFTER: 111
Number of empty Destination cells AFTER: 103
Number of empty Cabin cells AFTER: 99
Number of empty CryoSleep cells BEFORE: 217
Number of empty CryoSleep cells AFTER: 98
Number of empty Spa cells BEFORE: 183
Number of empty Spa cells AFTER: 118
Number of empty RoomService cells BEFORE: 181
Number of empty RoomService cells AFTER: 113
Number of empty FoodCourt cells BEFORE: 183
Number of empty FoodCourt cells AFTER: 113
Number of empty VRDeck cells BEFORE: 188
Number of empty VRDeck cells AFTER: 126
Number of empty ShoppingMall cells BEFORE: 208
Number of empty ShoppingMall cells AFTER: 112
Filled 361 null spendings
Number of empty VIP cells BEFORE: 93
Number of empty VIP cells AFTER: 49
Number of empty HomePlanet cells BEFORE: 87
Number of em

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["VIP"][ageVIPIndex] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["VIP"][earthVIPIndex] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["CryoSleep"][isNotAsleep] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[store][isAsleep] = 0
A value is trying to be set o

### Create Column Transformer

In [276]:
preprocessor = make_column_transformer(
    (make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy="mean"),
        StandardScaler() 
        ), numeric_features 
    ),
    (make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy="most_frequent"), 
        OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        ), categorical_features
    ),
    remainder='passthrough'  # Keep non-transformed columns
)

In [273]:


# from class code
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [277]:
knn = False

if knn :
    testpipeline = make_pipeline(preprocessor, KNeighborsClassifier())
else:
    testpipeline = make_pipeline(preprocessor, SVC())

testpipeline.fit(X_trainCPY, y_train)
y_test = testpipeline.predict(X_testCPY)

submission_df = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
submission_df['Transported'] = y_test
submission_df.to_csv('submission.csv', index=False)