In [70]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Dummy Submission (All-Positive)

In [71]:
def dummy_submit():
    submission_df = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
    submission_df['Transported'] = True
    submission_df.to_csv('submission.csv', index=False)

# This is dummy, not real submit, comment out when you don't want dummy
# dummy_submit()

# Pre-processing

In [72]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv') # Does NOT contain y -> 'Transported'

# Replaces whitespace with NaN
train_df.replace(r'^\s+$', np.nan, regex=True)
test_df.replace(r'^\s+$', np.nan, regex=True)

def splitData(dataframe):
    # Split "PassengerId" into "GroupId" & "IndividualId"
    dataframe["GroupId"] = dataframe["PassengerId"].str.split("_").str[0]
    dataframe["IndividualId"] = dataframe["PassengerId"].str.split("_").str[1]

    # Split "Cabin" into "Deck", "Number", & "Side"
    dataframe["Deck"] = dataframe["Cabin"].str.split("/").str[0]
    dataframe["Number"] = dataframe["Cabin"].str.split("/").str[1]
    dataframe["Side"] = dataframe["Cabin"].str.split("/").str[2]

    dataframe.shape
    dataframe.sort_index()

    return dataframe

train_df = splitData(train_df)
test_df = splitData(test_df)

In [73]:
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "GroupId", "Number"]
categorical_features = ["HomePlanet", "Destination", "Deck", "CryoSleep", "VIP", "Side"]
drop_features = ["Name", "PassengerId", "IndividualId", "Cabin"]  # do not include these features in modeling
passthrough_features = [] # do not apply any transformation

target = "Transported"

X_train = train_df.drop(columns=[target, *drop_features])
y_train = train_df[target]

X_test = test_df.drop(columns=[*drop_features]) # test_df does NOT contain target

In [74]:
def fillSpend(dataframe, feature):
    spend_index = dataframe[
      (dataframe[feature].isnull() == True) &
      ((dataframe["Age"]<13) | (dataframe["CryoSleep"] == True))
    ].index
    dataframe[feature][spend_index] = 0


testTrain = train_df
testTest = test_df
AllData = pd.concat([train_df, test_df], ignore_index=True)

def run(df):
  print(df["Spa"].isnull().sum())
  print(df["VRDeck"].isnull().sum())
  print(df["ShoppingMall"].isnull().sum())
  print(df["FoodCourt"].isnull().sum())
  print(df["RoomService"].isnull().sum())

  spend_list = ["Spa", "RoomService", "FoodCourt", "VRDeck", "ShoppingMall"]
  for i in spend_list:
      fillSpend(df, i)

  print("--------")
  print(df["Spa"].isnull().sum())
  print(df["VRDeck"].isnull().sum())
  print(df["ShoppingMall"].isnull().sum())
  print(df["FoodCourt"].isnull().sum())
  print(df["RoomService"].isnull().sum(),"\n")


print("First, train_df")
run(testTrain)

print("Next, test_df")
run(testTest)

First, train_df
183
188
208
183
181
--------
114
107
103
106
107 

Next, test_df
101
80
98
106
82
--------
52
43
60
65
55 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[feature][spend_index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[feature][spend_index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[feature][spend_index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[feature][spend_index] = 0
A value is trying to be 

### Create Column Transformer

In [17]:
preprocessor = make_column_transformer(
    (make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy="mean"),
        StandardScaler()
        ), numeric_features
    ),
    (make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
        OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        ), categorical_features
    ),
    remainder='passthrough'  # Keep non-transformed columns
)

In [27]:


# from class code
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    fillSpend(std_scores)
    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [12]:
knn = False

if knn :
    testpipeline = make_pipeline(preprocessor, KNeighborsClassifier())
else:
    testpipeline = make_pipeline(preprocessor, SVC())

testpipeline.fit(X_train, y_train)
y_test = testpipeline.predict(X_test)

submission_df = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
submission_df['Transported'] = y_test
submission_df.to_csv('submission.csv', index=False)