# SPACE SHIP TITANIC

## 1. Library Imports

In [189]:
import numpy 
import sklearn
import pandas as pd
import xgboost as xgb

## 2. Data Loading & Separation

load the training and testing data, set 'PassengerId' as the index, and split the training data into data (X) and target (y). original 'PassengerId'  is saved for the final submission.

In [190]:
Data_full = pd.read_csv("train.csv", index_col = "PassengerId")
test_data = pd.read_csv("test.csv",index_col= "PassengerId")

ID = test_data.index # for sumbission at end

y = Data_full.Transported
X = Data_full.drop("Transported", axis=1)

## 3. Feature Engineering: Cabin

extract features from the 'Cabin' column. splits 'Cabin' into 'Deck', 'CabinNumber' and 'Side', maps the 'Deck' letters to numerical values, 'CabinNumber' is scaled in transformer, and "side" is maped to boolean. 

In [191]:
# column creation and manipulation

DeckMap = {'G':7, 'F':6, 'E':5, 'D':4, 'C':3, 'B':2, 'A':1, 'T':0}

for data_set in [X, test_data]:
    data_set[['Deck', 'CabinNumber', 'PortSide']] = data_set['Cabin'].str.split('/', expand=True)

    data_set["PortSide"]= data_set["PortSide"].map({"p":True, "S":False, })
    data_set["Deck"] = data_set["Deck"].map(DeckMap)

## Data Selection and Split

features are selected from all X data. data is split into train / test (80:20)

In [192]:
# train test split 

features = ["Age", "CryoSleep", "VIP", 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', "HomePlanet","Destination", 'Deck', 'CabinNumber', 'PortSide']
X = X[features]
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X, y)

# Data Preprocessing Pipelines

This section sets up preprocessing for different types of features in the dataset using scikit-learn Pipelines and a ColumnTransformer. Each type of column has a tailored transformation:

- **Spending columns** (`RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck`):  
  - Missing values are filled with 0
  - Values are scaled with StandardScaler

- **Numerical columns** (`Age`, `CabinNumber`):  
  - Missing values are filled with the **mean**.  
  - scaled using `StandardScaler`.

- **Boolean columns** (`CryoSleep`, `VIP`, `PortSide`):  
  - Missing values filled with the **most frequent value** (mode).  
  - not one-hot encoded.

- **Categorical columns** (`Destination`, `HomePlanet`):  
  - Missing values filled with the **most frequent** value.  
  - one-hot encoded.

- **Deck column** (`Deck`):  
  - Missing values filled with the median deck.  
  - scaled using `StandardScaler`.

Finally, all these transformations are combined in a **`ColumnTransformer`** (`preprocessor`) so that each type of feature is automatically processed when passed through the pipeline.


In [193]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
numericall_cols = ["Age", "CabinNumber"]
bool_cols = ["CryoSleep", "VIP", "PortSide"]           # True/False mode imputation only
catgorial_cols = ["Destination", "HomePlanet"]
Median_cols = ["Deck"]

spending_transformer = Pipeline([   # ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

numericall_transformer = Pipeline([        # ["Age", "CabinNumber"]
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

boolean_transformer = Pipeline([    #["CryoSleep", "VIP", "PortSide"]     
    ('imputer', SimpleImputer(strategy='most_frequent'))    # no one_hot_encode (try one hot encode later)
])

categorical_transformer = Pipeline([    # ["Destination", "HomePlanet"]
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

median_transformer = Pipeline([    # ["Deck"]
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('spending', spending_transformer, spending_cols),
    ('age', numericall_transformer, numericall_cols),
    ('bool', boolean_transformer, bool_cols),
    ('cat', categorical_transformer, catgorial_cols),
    ('median', median_transformer, Median_cols)
])

In [None]:
# train model
Model = Pipeline([
    ('scaler', sklearn.preprocessing.StandardScaler()),
    ('model', xgb.XGBClassifier(
        n_estimators= 300,
        learning_rate=0.15,          # 300 0.15, 2   
        max_depth=2,
        random_state=42,
        eval_metric='logloss'
    ))
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Model)
])

pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_valid)
print("MAE:")
print(sklearn.metrics.mean_absolute_error(y_valid, y_preds) * 100)

# runs 
# 1 : 31.4543
# 2: 28.408
# 3: 27.7449
# 4: 27.9839
# 5: 21.3891
# 6: 20.5151
# 7: 19.50321
# 8: 17.7552    SUMBIT: 0.7977
# 9: 19.8712    SUMBIT: 0.7928

MAE:
19.871205151793927


In [174]:
# optimize model params
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__model__n_estimators': [50, 100, 150, 200, 250, 350],
    'model__model__learning_rate': [0.09, 0.11, 0.13, 0.15, 0.17, 0.2],
    'model__model__max_depth': [2, 3, 4],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy')  # use accuracy for classification
grid_search.fit(X, y)

print("bbest params:", grid_search.best_params_)
print("best accuracy:", grid_search.best_score_)


bbest params: {'model__model__learning_rate': 0.09, 'model__model__max_depth': 2, 'model__model__n_estimators': 200}
best accuracy: 0.7933989922703885


In [195]:
# train Model on all data
pipeline.fit(X, y)
test_preds = pipeline.predict(test_data)

# generate sumbision
test_data["PassengerId"] = id
test_preds = test_preds.astype(bool)

output = pd.DataFrame({"PassengerId" : ID, "Transported": test_preds})
sumbission_file_name = f"submission_{pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
output.to_csv(sumbission_file_name, index=False)


