In [None]:
import numpy as np
import pandas as pd
import os
import warnings

# Silence every warning category for the rest of the session
# (this also hides deprecation notices raised by the libraries used below).
warnings.filterwarnings('ignore')
# Render floats with a thousands separator and three decimals when frames are displayed.
pd.options.display.float_format = '{:,.3f}'.format
# Seed used for `random_state` further down the notebook.
SEED = 13

## Load Data

In [None]:
# Read the raw competition files
train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Training set: the 'Transported' column is the output, everything else is input
y_train = train_df['Transported'].copy()
X_train = train_df.drop(columns=['Transported'])

# Testing set: work on a copy so that test_df keeps its original columns
X_test = test_df.copy()

# Both input frames, so preprocessing steps can loop over them
Xs = [X_train, X_test]

## Preprocess Data

### Fill in missing values

In [None]:
def fill_journey(df):
    """
    Fills nulls in 'HomePlanet' and 'Destination' as below:

    Home Planet:
    a. Fill with the most common non-null 'HomePlanet' of the passenger's group
    b. If all group members have null values, it's filled with the overall mode

    Destination:
    a. Fill with the most common non-null 'Destination' of the passenger's group
    b. If all group members have null values, it's filled with the overall mode

    Ties between equally common values resolve to the first one in sorted
    order (the order returned by `Series.mode`).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Group', 'HomePlanet' and 'Destination'. Not modified.

    Returns
    -------
    pd.DataFrame
        Columns ['HomePlanet', 'Destination'], carrying the same index as `df`.
        A column that is entirely null is returned unchanged.
    """
    filled = df[['HomePlanet', 'Destination']].copy()

    for col in ['HomePlanet', 'Destination']:
        known = df.loc[df[col].notna(), ['Group', col]]
        if known.empty:
            # No observed value to learn from
            continue

        # Group mode is computed from observed values only, so the overall
        # fallback value can never outvote what the group actually recorded.
        group_mode = known.groupby('Group')[col].agg(lambda s: s.mode().iloc[0])
        filled[col] = filled[col].fillna(df['Group'].map(group_mode))

        # Groups without any observed value fall back to the overall mode
        filled[col] = filled[col].fillna(known[col].mode().iloc[0])

    return filled

In [None]:
def fill_cryosleep(df):
    """
    Fills nulls in 'CryoSleep' as below:

    a. For non-spenders, 'CryoSleep' = True as CryoSleep customers don't spend
    b. For spenders, 'CryoSleep' = False

    Rows whose total spend is null (any spending column missing) match neither
    rule and keep their null.

    Side effects: `df` is modified in place. A 'TotalSpent' column (sum of the
    five spending columns) is added and the nulls of 'CryoSleep' are filled.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'CryoSleep', 'RoomService', 'FoodCourt', 'ShoppingMall',
        'Spa' and 'VRDeck'.

    Returns
    -------
    pd.DataFrame
        Single-column frame ['CryoSleep'].
    """
    # Calculate 'TotalSpent' by person
    df['TotalSpent'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']

    missing = df['CryoSleep'].isna()

    # The comparisons need their own parentheses: `&` binds tighter than
    # `==` / `>`, so `a & b == 0` would be evaluated as `(a & b) == 0`.
    df.loc[missing & (df['TotalSpent'] == 0.0), ['CryoSleep']] = True
    df.loc[missing & (df['TotalSpent'] > 0.0), ['CryoSleep']] = False

    return df[['CryoSleep']]

In [None]:
def fill_cabin(df):
    """
    Fills nulls in 'Cabin' as below:

    a. Fill with the most common non-null 'Cabin' of the passenger's group
    b. If all group members have null values, it's filled with the overall mode

    Ties between equally common values resolve to the first one in sorted
    order (the order returned by `Series.mode`). Rows with a null 'Group'
    take the overall mode.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Group' and 'Cabin'. Not modified.

    Returns
    -------
    pd.DataFrame
        Single-column frame ['Cabin'], carrying the same index as `df`.
        Returned unchanged when every 'Cabin' is null.
    """
    known = df.loc[df['Cabin'].notna(), ['Group', 'Cabin']]
    if known.empty:
        # No observed value to learn from
        return df[['Cabin']].copy()

    # Group mode is computed from observed values only, so the overall
    # fallback value can never outvote what the group actually recorded.
    cabin_per_group = known.groupby('Group')['Cabin'].agg(lambda s: s.mode().iloc[0])
    cabin = df['Cabin'].fillna(df['Group'].map(cabin_per_group))

    # Groups without any observed value fall back to the overall mode
    cabin = cabin.fillna(known['Cabin'].mode().iloc[0])

    return cabin.to_frame('Cabin')

In [None]:
def fill_VIP(df):
    """
    Fills nulls in 'VIP' as below:

    a. Fill with the most common non-null 'VIP' of the passenger's group
    b. If all group members have null values, it's filled with the overall mode

    Ties between equally common values resolve to the first one in sorted
    order (the order returned by `Series.mode`). Rows with a null 'Group'
    take the overall mode.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Group' and 'VIP'. Not modified.

    Returns
    -------
    pd.DataFrame
        Single-column frame ['VIP'], carrying the same index as `df`.
        Returned unchanged when every 'VIP' is null.
    """
    known = df.loc[df['VIP'].notna(), ['Group', 'VIP']]
    if known.empty:
        # No observed value to learn from
        return df[['VIP']].copy()

    # Group mode is computed from observed values only, so the overall
    # fallback value can never outvote what the group actually recorded.
    VIP_per_group = known.groupby('Group')['VIP'].agg(lambda s: s.mode().iloc[0])
    vip = df['VIP'].fillna(df['Group'].map(VIP_per_group))

    # Groups without any observed value fall back to the overall mode
    vip = vip.fillna(known['VIP'].mode().iloc[0])

    return vip.to_frame('VIP')

### Feature engineering

In [None]:
# Numerical features whose nulls are replaced with the column mean
mean_filled_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age']

for X in Xs:
    # Fill null values for numerical features with mean
    # (each frame uses its own column means)
    X[mean_filled_cols] = X[mean_filled_cols].fillna(X[mean_filled_cols].mean())

    # Getting group: the part of 'PassengerId' before the underscore
    X['Group'] = X['PassengerId'].str.split('_').str[0].astype(int)

    # New feature 'GroupSize': number of passengers sharing the group.
    # value_counts() is computed once and mapped, instead of once per row.
    X['GroupSize'] = X['Group'].map(X['Group'].value_counts())

    # New feature 'WithGroup': 1 when travelling with others, 0 when alone
    X['WithGroup'] = (X['GroupSize'] > 1).astype(int)

    # Fill categorical nulls. These helpers need 'Group', and fill_cryosleep
    # needs the spending columns filled above; fill_cryosleep also adds a
    # 'TotalSpent' column to X.
    X[['HomePlanet', 'Destination']] = fill_journey(X)
    X[['CryoSleep']] = fill_cryosleep(X)
    X[['VIP']] = fill_VIP(X)
    X[['Cabin']] = fill_cabin(X)

    # Splitting Cabin into Deck, Num, and Side
    cabin_parts = X['Cabin'].str.split('/', expand=True)
    X['Deck'] = cabin_parts[0]
    X['Num'] = cabin_parts[1].astype(int)
    X['Side'] = cabin_parts[2]

    # Drop redundant features. This must happen in place: rebinding X would
    # leave X_train / X_test (the objects held in Xs) untouched.
    X.drop(['PassengerId', 'Cabin', 'Name'], axis=1, inplace=True)

display(X_train.head(5))

In [None]:
# Simpler preprocessing: mean for numerical nulls, mode for categorical nulls.
# Every step is guarded so the cell can run on the raw frames or on frames
# that an earlier cell has already processed (where 'Cabin', 'PassengerId'
# and 'Name' are gone and nothing is left to fill).
simple_mean_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age', 'Num']
simple_mode_cols = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP', 'Deck', 'Side']

for X in Xs:
    # Splitting Cabin into Deck, Num, and Side
    if 'Cabin' in X.columns:
        X[['Deck', 'Num', 'Side']] = X['Cabin'].str.split('/', expand=True)

    # Getting group. Also add new features 'GroupSize' and 'WithGroup'
    if 'PassengerId' in X.columns:
        X['Group'] = X['PassengerId'].str.split('_').str[0].astype(int)
    # value_counts() is computed once and mapped, instead of once per row
    X['GroupSize'] = X['Group'].map(X['Group'].value_counts())
    # Set both outcomes explicitly; assigning only the 0 case leaves the
    # remaining rows null when the column does not exist yet.
    X['WithGroup'] = (X['GroupSize'] > 1).astype(int)

    # Fill nan values for numerical features with their mean
    X['Num'] = X['Num'].astype(float)
    X[simple_mean_cols] = X[simple_mean_cols].fillna(X[simple_mean_cols].mean())

    # Fill nan values for categorical features with their mode.
    # mode() returns a Series; passing it to fillna() directly aligns on the
    # index and would only ever fill row 0, so take its first value.
    for col in simple_mode_cols:
        modes = X[col].mode()
        if not modes.empty:
            X[col] = X[col].fillna(modes.iloc[0])

    # Drop redundant features (in place, so X_train / X_test are updated)
    X.drop(['PassengerId', 'Cabin', 'Name'], axis=1, inplace=True, errors='ignore')

display(X_train.head(5))

### Normalization

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

scaler = StandardScaler()
label_enc = LabelEncoder()

# Get categorical features
categ_cols = ['CryoSleep', 'VIP', 'Side', 'WithGroup']

# Encode categorical features.
# The encoder is fitted once per column on the values of both frames, so a
# given category always maps to the same integer in train and test.
for col in categ_cols:
    label_enc.fit(pd.concat([X_train[col], X_test[col]]))
    X_train[col] = label_enc.transform(X_train[col])
    X_test[col] = label_enc.transform(X_test[col])

# Encode output
y_train = label_enc.fit_transform(y_train)

# Get numerical columns
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group', 'GroupSize', 'Num']

# Normalize numerical columns (statistics come from the training set only)
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# One hot encoding for rest of categorical columns (non-ordered columns)
to_onehot = ['HomePlanet', 'Destination', 'Deck']

X_train = pd.get_dummies(X_train, columns=to_onehot)
X_test = pd.get_dummies(X_test, columns=to_onehot)

# get_dummies only creates columns for categories present in the frame it is
# given. Give the test matrix exactly the training columns, in the same order:
# categories missing from test become all-zero columns, categories unseen in
# training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

X_train.head(5)

## Training

### Finding best hyperparameters for LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Two sub-grids because not every solver supports every penalty
log_reg_grid = [
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 500, 1000, 1500]
    },
    {
        'penalty': ['l2'],
        'solver': ['liblinear', 'sag', 'saga', 'newton-cg', 'lbfgs'],
        'max_iter': [100, 500, 1000, 1500]
    },
]

# random_state makes the stochastic solvers repeatable between runs
log_reg_optimal = GridSearchCV(
    LogisticRegression(random_state=SEED),
    log_reg_grid,
    scoring='accuracy',
)
log_reg_optimal.fit(X_train, y_train)
print(log_reg_optimal.best_score_)
print(log_reg_optimal.best_params_)

# Output of an earlier, unseeded run:
# 0.7883759495453978
# {'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}

### Finding best hyperparameters for SVC

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Search space: regularisation strength x kernel x kernel coefficient
svc_grid = [
    dict(
        C=[0.25, 0.5, 0.75, 1, 1.25, 1.5],
        kernel=['linear', 'poly', 'rbf'],
        gamma=['scale', 'auto'],
    )
]

# Exhaustive search, scored by accuracy
svc_optimal = GridSearchCV(
    estimator=SVC(random_state=SEED),
    param_grid=svc_grid,
    scoring='accuracy',
)
svc_optimal.fit(X_train, y_train)
print(svc_optimal.best_score_)
print(svc_optimal.best_params_)

# Output of an earlier run:
# 0.7950397454823227
# {'C': 0.75, 'gamma': 'scale', 'kernel': 'rbf'}

### Finding best hyperparameters for MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

mlp_layer_options = [
    (512,256,128,), (128,64,32,),
]

# 'learning_rate' and 'momentum' are only read by the 'sgd' solver, so they
# are searched only there. Varying them under the default 'adam' solver
# refits the same configuration several times.
mlp_grid = [
    {
        'solver': ['adam'],
        'hidden_layer_sizes': mlp_layer_options,
    },
    {
        'solver': ['sgd'],
        'hidden_layer_sizes': mlp_layer_options,
        'learning_rate': ['constant', 'invscaling'],
        'momentum': [0.9, 0.85, 0.8]
    },
]

# random_state fixes weight initialisation and batch shuffling between runs
mlp_optimal = GridSearchCV(MLPClassifier(random_state=SEED), mlp_grid, cv=3, scoring='accuracy')
mlp_optimal.fit(X_train, y_train)
print(mlp_optimal.best_score_)
print(mlp_optimal.best_params_)

# Output of earlier, unseeded runs.
#
# Wider grid (also searched activation, solver and more layer layouts):
# 0.7939751740841658
# {'activation': 'logistic', 'hidden_layer_sizes': (512, 256, 128), 'learning_rate': 'constant', 'momentum': 0.8, 'solver': 'adam'}
#
# Grid over hidden_layer_sizes / learning_rate / momentum with the default solver:
# 0.7562821677263094
# {'hidden_layer_sizes': (128, 64, 32), 'learning_rate': 'invscaling', 'momentum': 0.8}

### Training models with best hyperparameters found

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Candidate models. Every estimator that accepts a random_state gets SEED so
# the ranking below is repeatable between runs.
MLA = [
    LogisticRegression(
        max_iter=500,
        penalty='l1', 
        solver='saga',
        random_state=SEED,
    ),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    # NOTE(review): the SVC grid-search output recorded in this notebook
    # reports gamma='scale'; 'auto' is kept as written here. Confirm which
    # one is intended.
    SVC(
        C=0.75,
        gamma='auto',
        kernel='rbf',
        random_state=SEED,
    ),
    XGBClassifier(random_state=SEED),
    # learning_rate and momentum are only read by the 'sgd' solver, so they
    # have no effect with solver='adam'.
    MLPClassifier(
        activation='logistic',
        learning_rate='invscaling',
        momentum=0.8,
        hidden_layer_sizes=(128, 64, 32),
        solver='adam',
        random_state=SEED,
    )
]

# Setting up the table to compare the performances of each model
MLA_cols = ['Model', 'Accuracy']

# Collect one record per model and build the frame once
# (instead of growing it row by row).
comparison_records = []
for model in MLA:
    cv_results = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    comparison_records.append({
        'Model': model.__class__.__name__,
        'Accuracy': cv_results.mean(),
    })

MLA_compare = (
    pd.DataFrame(comparison_records, columns=MLA_cols)
    .sort_values(by=['Accuracy'], ascending=False)
)
MLA_compare

### Create ensemble using the best 3 models found

In [None]:
from sklearn.ensemble import VotingClassifier

# Best models found.
# These names are rebound from the fitted grid-search objects to fresh,
# unfitted estimators. random_state=SEED keeps the ensemble score and the
# submissions repeatable between runs.
mlp_optimal = MLPClassifier(
    activation='logistic',
    learning_rate='invscaling',
    momentum=0.8,
    hidden_layer_sizes=(128, 64, 32),
    solver='adam',
    random_state=SEED,
)
svc_optimal = SVC(
    C=0.75,
    gamma='auto',
    kernel='rbf',
    random_state=SEED,
)
log_reg_optimal = LogisticRegression(
    max_iter=500,
    penalty='l1', 
    solver='saga',
    random_state=SEED,
)

# Define ensemble: each model casts one vote, the majority label wins
hard_ensemble = VotingClassifier(
    estimators=[
        ('MLP', mlp_optimal),
        ('SVC', svc_optimal),
        ('LogReg', log_reg_optimal)
    ],
    voting = 'hard'
)

# Return accuracy scores
hard_cross_val = cross_val_score(hard_ensemble, X_train, y_train, scoring='accuracy')
print('Hard voting ensemble score:' , hard_cross_val.mean())

## Submission

In [None]:
# Function to predict entries
def predict(model):
    """
    Fits `model` on the notebook-level X_train / y_train, predicts X_test and
    returns the result as a submission table.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X)
        Fitted in place by this call.

    Returns
    -------
    pd.DataFrame
        Columns 'PassengerId' (taken from test_df) and 'Transported'
        (predictions cast to bool).
    """
    model.fit(X_train, y_train)

    submission = pd.DataFrame(
        {
            'PassengerId': test_df['PassengerId'],
            'Transported': model.predict(X_test),
        }
    )

    # The encoded labels go back to True/False
    return submission.assign(Transported=submission["Transported"].astype(bool))

### Submit predictions using best models 

In [None]:
# One submission file per model, written in this order
submission_targets = (
    (svc_optimal, 'submission_svc.csv'),
    (mlp_optimal, 'submission_mlp.csv'),
    (log_reg_optimal, 'submission_log_reg.csv'),
    (hard_ensemble, 'submission_ensemble.csv'),
)

for estimator, csv_name in submission_targets:
    predict(estimator).to_csv(csv_name, index=False)