<a href="https://www.kaggle.com/code/vasilistimoudas/spaceship-titanic?scriptVersionId=145121499" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Introduction

This is a Kaggle competition, and it's available at https://www.kaggle.com/competitions/spaceship-titanic

I live in the year 2912 and I have to solve a cosmic mystery. The Spaceship Titanic was an interstellar passenger liner launched a month ago with almost 13,000 passengers on board. The vessel set out on its maiden voyage transporting emigrants from our solar system to three newly habitable exoplanets orbiting nearby stars. The Spaceship Titanic collided with a spacetime anomaly hidden within a dust cloud and almost half of the passengers were transported to an alternate dimension. The goal of this competition is to predict which passengers were transported, using records recovered from the spaceship's damaged computer system.

In this competition, I have a binary classification problem, and I will use several machine learning algorithms to predict which passengers were transported. In the end, I will evaluate each algorithm and select the one with the highest prediction accuracy.

<span style="font-size:16px;">**Vasileios Panagiotis Timoudas**</span>
<br>Github: https://github.com/vasilis-timoudas
<br>Linkedin: https://www.linkedin.com/in/vasileios-timoudas 

# Import Libraries and Load Data

In [None]:
# Import libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

In [None]:
# Load the raw competition files: one frame for training, one for testing
train_data_full, test_data_full = (
    pd.read_csv('../input/spaceship-titanic/train.csv'),
    pd.read_csv('../input/spaceship-titanic/test.csv'),
)

In [None]:
# Work on copies so the frames loaded from disk stay untouched
train_data, test_data = train_data_full.copy(), test_data_full.copy()

# Data General Overview

In [None]:
# Report (rows, columns) for both data sets
for set_label, frame in (('Train', train_data), ('Test', test_data)):
    print(f'{set_label} set shape:', frame.shape)

In [None]:
# Preview the first rows of the train data to see the raw column layout
train_data.head()

In [None]:
# Print the column-level summary (info()) of the train data
train_data.info()

In [None]:
# Print the summary statistics for the numerical columns in the train data,
# transposed so that each feature is a row
train_data.describe().T

In [None]:
# Print the number of unique values in each column of the train data
train_data.nunique()

In [None]:
# Print the number of unique values in each object-dtype column of the train data
train_data.select_dtypes(include='object').nunique()

In [None]:
def find_missing_values(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with the
        columns ``number_missing`` (count of missing entries),
        ``percentage_missing`` (share of rows, rounded to 2 decimals) and
        ``column_type`` (the column's dtype). Rows are sorted by
        ``column_type``.
    """
    null_mask = df.isna()

    # Only columns with at least one missing entry make it into the summary
    incomplete = df.columns[null_mask.any()].tolist()
    null_counts = null_mask[incomplete].sum()

    summary = pd.DataFrame(null_counts, columns=['number_missing'])
    summary['percentage_missing'] = (100 * summary['number_missing'] / len(df)).round(2)
    summary['column_type'] = df[incomplete].dtypes

    return summary.sort_values(by='column_type')

In [None]:
# Missing-value summary for the train data
find_missing_values(train_data)

In [None]:
# Missing-value summary for the test data
find_missing_values(test_data)

In [None]:
# Report the count and share of fully duplicated rows in each data set
for set_label, frame in (('train', train_data), ('test', test_data)):
    n_duplicates = frame.duplicated().sum()
    pct_duplicates = np.round(100 * n_duplicates / len(frame), 1)
    print(f'Duplicates in {set_label} set: {n_duplicates}, ({pct_duplicates}%)')

In [None]:
# Plot one histogram per column of the train data that DataFrame.hist can draw
train_data.hist(figsize=(15, 8))

In [None]:
# Plot the correlation map of the train data; numeric_only=True restricts
# corr() to the columns pandas treats as numeric, and each cell is annotated
# with the coefficient rounded to two decimals
sns.heatmap(train_data.corr(numeric_only=True), annot=True, cmap='YlGnBu', linewidths=0.5, fmt='.2f')

# EDA

In [None]:
# Share of transported vs. not-transported passengers in the train data
pie_fig, pie_ax = plt.subplots(figsize=(5, 5))

target_counts = train_data['Transported'].value_counts()
target_counts.plot.pie(
    ax=pie_ax,
    startangle=0,
    explode=[0.02, 0.02],  # pull both wedges slightly apart
    autopct='%1.1f%%',
)
pie_ax.set_title("Transported", fontweight='bold')

plt.show()

In [None]:
# The features that I will plot
expense_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# One row per feature, two columns: full histogram (left) and a zoomed-in
# version with a KDE overlay (right). squeeze=False keeps `axes` 2-D even
# when only one feature is listed.
fig, axes = plt.subplots(
    len(expense_features), 2,
    figsize=(10, 4 * len(expense_features)),
    squeeze=False,
)

for (ax_full, ax_zoom), exp_feature in zip(axes, expense_features):
    # Left subplot: full count range.
    # `ax=` is seaborn's parameter for choosing the target axes; the plot no
    # longer depends on which axes happens to be "current".
    sns.histplot(data=train_data, x=exp_feature, ax=ax_full, bins=30, kde=False, hue='Transported')
    ax_full.set_title(exp_feature, fontweight='bold')
    ax_full.margins(x=0)
    ax_full.margins(y=0)

    # Right subplot: same data with a KDE, y-axis capped at 100 so the small
    # bars are visible
    sns.histplot(data=train_data, x=exp_feature, ax=ax_zoom, bins=30, kde=True, hue='Transported')
    ax_zoom.set_title(exp_feature, fontweight='bold')
    ax_zoom.set_ylim(0, 100)
    ax_zoom.margins(x=0)
    ax_zoom.margins(y=0)

fig.tight_layout()
plt.show()

In [None]:
# Age distribution split by the target, one bin per year of age
age_fig, age_ax = plt.subplots(figsize=(10, 4))

sns.histplot(data=train_data, x='Age', hue='Transported', binwidth=1, kde=True, ax=age_ax)

# Title, axis label and x-range
age_ax.set_title('Age Distribution', fontweight='bold')
age_ax.set_xlabel('Age (years)')
age_ax.set_xlim([0, 80])

# No horizontal padding around the data
age_ax.margins(x=0)

age_fig.tight_layout()
plt.show()

In [None]:
# Categorical features
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# One stacked subplot per feature; squeeze=False keeps `axes` an array even
# for a single feature
fig, axes = plt.subplots(len(categorical_features), 1, figsize=(10, 16), squeeze=False)

for ax, cat_feature in zip(axes.ravel(), categorical_features):
    # `ax=` is seaborn's parameter for choosing the target axes; the plot no
    # longer depends on which axes happens to be "current".
    sns.countplot(data=train_data, x=cat_feature, hue='Transported', ax=ax)
    ax.set_title(cat_feature, fontweight='bold')

fig.tight_layout()
plt.show()

# Data Preprocessing

In [None]:
# Split PassengerId on '_' into two separate columns, gggg and pp, and store
# both as floats. The same steps are applied to the train and the test frame.
for frame in (train_data, test_data):
    frame[['gggg', 'pp']] = frame['PassengerId'].str.split('_', expand=True)

for id_part in ('gggg', 'pp'):
    for frame in (train_data, test_data):
        frame[id_part] = frame[id_part].astype(float)

In [None]:
# Split Cabin on '/' into three separate columns (deck, num, side) and store
# num as float. The same steps are applied to the train and the test frame.
for frame in (train_data, test_data):
    frame[['deck', 'num', 'side']] = frame['Cabin'].str.split('/', expand=True)

for frame in (train_data, test_data):
    frame['num'] = frame['num'].astype(float)

In [None]:
# Expenditure = row-wise total of the five spending columns, stored as float
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (train_data, test_data):
    frame['Expenditure'] = frame[spend_cols].sum(axis=1)

for frame in (train_data, test_data):
    frame['Expenditure'] = frame['Expenditure'].astype(float)

In [None]:
# Numerical columns: int64 / float64 dtypes.
# Categorical columns: object dtype with fewer than 10 distinct values.
numeric_kinds = ('int64', 'float64')
numerical_cols = [
    col for col, kind in train_data.dtypes.items() if kind in numeric_kinds
]
categorical_cols = [
    col for col, kind in train_data.dtypes.items()
    if kind == 'object' and train_data[col].nunique() < 10
]

# Restrict both frames to the selected features; the train frame also keeps
# the target column
my_cols = numerical_cols + categorical_cols
selected_train = train_data[my_cols]
train_data = selected_train.join(train_data['Transported'])
test_data = test_data[my_cols]

In [None]:
# I decided not to use the Expenditure feature: remove it from the list of
# numerical columns and from both frames
numerical_cols = [col for col in numerical_cols if col != 'Expenditure']
train_data = train_data.drop(columns='Expenditure')
test_data = test_data.drop(columns='Expenditure')

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numerical data: fill missing values with the column mean, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical data: fill missing values with the most frequent category, then
# one-hot encode (categories unseen during fit are ignored at transform time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
from sklearn.model_selection import train_test_split

# Separate target from predictors; the boolean target is encoded as 0/1
X = train_data.drop(columns='Transported')
y = train_data['Transported'].astype(int)

# Train-validation split (80% / 20%).
# random_state makes the split, and therefore the hold-out ranking of the
# classifiers, reproducible between runs. stratify=y keeps the class balance
# identical in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Model Selection

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# The classifiers that I will use, keyed by the display name used in the
# result tables. Every estimator that accepts a seed gets a fixed
# random_state so that the comparison between models is reproducible from run
# to run.
classifiers = {
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(),
    'Gaussian Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    # verbose=0 silences CatBoost's per-iteration training log
    'CatBoost': CatBoostClassifier(iterations=100, verbose=0, random_state=42),
}

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

def find_classifiers_accuracy(classifiers=classifiers,
                              preprocessor=preprocessor,
                              X_train=X_train,
                              X_valid=X_valid,
                              y_train=y_train,
                              y_valid=y_valid,
                              X=X,
                              y=y,
                              is_cross_val=False):
    """Score every classifier and return the results ranked by accuracy.

    Each classifier is wrapped in a pipeline together with ``preprocessor``.

    Parameters
    ----------
    classifiers : dict
        Mapping of display name -> estimator.
    preprocessor : transformer
        Preprocessing step placed in front of every classifier.
    X_train, y_train : training part of the hold-out split
        (used when ``is_cross_val`` is false).
    X_valid, y_valid : validation part of the hold-out split
        (used when ``is_cross_val`` is false).
    X, y : full training data (used when ``is_cross_val`` is true).
    is_cross_val : bool, default False
        False: fit on the training part and report accuracy on the validation
        part. True: report the mean score of a 5-fold stratified
        cross-validation on ``X`` / ``y``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Classifier`` and ``Accuracy`` (hold-out) or
        ``Mean Accuracy`` (cross-validation), sorted from best to worst with
        a fresh 0..n-1 index.

    Notes
    -----
    The defaults are bound to the notebook's globals when this function is
    defined, so the cell must be re-run after those objects are rebuilt.
    In hold-out mode the pipeline is fitted on the estimator objects that
    were passed in; presumably scikit-learn fits them in place rather than on
    clones, so they keep their fitted state afterwards (verify against the
    installed scikit-learn version).
    """
    accuracy_name = 'Mean Accuracy' if is_cross_val else 'Accuracy'

    # 5-fold splitter built once: it does not depend on the classifier and,
    # being seeded, gives every classifier the same folds
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    rows = []
    for name, clf in classifiers.items():
        # Bundle preprocessing and modeling code in a pipeline
        my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', clf)])

        if is_cross_val:
            score = cross_val_score(my_pipeline, X, y, cv=stratified_kfold).mean()
        else:
            my_pipeline.fit(X_train, y_train)
            y_pred = my_pipeline.predict(X_valid)
            # accuracy_score expects (y_true, y_pred)
            score = accuracy_score(y_valid, y_pred)

        rows.append((name, score))

    results_df = pd.DataFrame(rows, columns=['Classifier', accuracy_name])

    # Best classifier first, with a clean positional index
    results_df = results_df.sort_values(by=accuracy_name, ascending=False)
    return results_df.reset_index(drop=True)

In [None]:
# Rank the classifiers by their accuracy on the hold-out validation split
tts_classifiers_accuracy_df = find_classifiers_accuracy(is_cross_val=False)
tts_classifiers_accuracy_df

In [None]:
# Rank the classifiers by their mean cross-validation accuracy
cv_classifiers_accuracy_df = find_classifiers_accuracy(is_cross_val=True)
cv_classifiers_accuracy_df

In [None]:
# Both result tables are sorted best-first, so row 0 holds each winner
tts_best_classifier_name, tts_best_classifier_accuracy = (
    tts_classifiers_accuracy_df['Classifier'].iloc[0],
    tts_classifiers_accuracy_df['Accuracy'].iloc[0],
)
cv_best_classifier_name, cv_best_classifier_accuracy = (
    cv_classifiers_accuracy_df['Classifier'].iloc[0],
    cv_classifiers_accuracy_df['Mean Accuracy'].iloc[0],
)

# Keep the hold-out winner only if its accuracy is strictly higher than the
# best mean cross-validation accuracy; otherwise keep the CV winner.
# NOTE(review): this compares a single-split score with a 5-fold mean.
best_classifier_name = (
    tts_best_classifier_name
    if tts_best_classifier_accuracy > cv_best_classifier_accuracy
    else cv_best_classifier_name
)
best_classifier = classifiers[best_classifier_name]

# Model Training

In [None]:
# Get the best pipeline
best_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', best_classifier)])

# Fit on the full labelled data before predicting. Without this explicit fit
# the pipeline would only work through whatever fitted state earlier cells
# left on `preprocessor` / `best_classifier`, and the final model would never
# see the validation rows.
best_pipeline.fit(X, y)

# Get the predictions of the test data
y_pred = best_pipeline.predict(test_data)

# Submission

In [None]:
# Read sample submission file
submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')

# Add predictions as booleans (0 -> False, 1 -> True). Only the prediction
# column is converted, instead of running a frame-wide replace that would
# also rewrite any 0/1 value in the other columns.
submission['Transported'] = np.asarray(y_pred).astype(bool)

# Output to submission file
submission.to_csv('submission.csv', index=False)