In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

X = train_data.drop(['Transported', 'PassengerId', 'Name', 'Cabin'], axis=1)
y = train_data['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


##### So what do we need to do?
##### We need an imputer for missing values in numerical collumns
##### We need to remove the collums that do dont matter i.e name ,transported
##### one hot encode 
##### put this all into a data pipeline

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
import pandas as pd

class Custom_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, feature_names=None):
        return feature_names

class Numerical_Imputer(Custom_Transformer):
    def transform(self,X,y=None):
        Imputer = SimpleImputer(strategy='median')
        Numerical = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
        X[Numerical] = Imputer.fit_transform(X[Numerical])
        return X

class Binary_converter(Custom_Transformer):
    def transform(self,X,y=None):
        binary_collums = ['CryoSleep']
        X[binary_collums] = X[binary_collums].applymap(lambda x: 1 if x == True else 0)
        return X

class OneHotEncode(Custom_Transformer):
    def transform(self,X,y=None):
        Ohe = OneHotEncoder(sparse_output=False)
        Categorical_columns = ['HomePlanet','Destination','VIP']
        X_encoded = Ohe.fit_transform(X[Categorical_columns])
        X_encoded_df = pd.DataFrame(X_encoded, columns=Ohe.get_feature_names_out(Categorical_columns),index=X.index)
        X = X.drop(columns=Categorical_columns)
        X = pd.concat([X,X_encoded_df],axis=1)
        return X

pipeline = Pipeline(steps=[
    ('numerical_imputer', Numerical_Imputer()),
    ('binary_converter', Binary_converter()),
    ('one_hot_encoder', OneHotEncode())
])

pipeline.fit(X_train, y_train)

xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

X_transformed = pipeline.fit_transform(X_train)
xgb.fit(X_transformed, y_train)
y_pred = xgb.predict(pipeline.transform(X_test))
y_pred_proba = xgb.predict_proba(pipeline.transform(X_test))[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

  X[binary_collums] = X[binary_collums].applymap(lambda x: 1 if x == True else 0)
  X[binary_collums] = X[binary_collums].applymap(lambda x: 1 if x == True else 0)


ROC AUC Score: 0.8714


  X[binary_collums] = X[binary_collums].applymap(lambda x: 1 if x == True else 0)
  X[binary_collums] = X[binary_collums].applymap(lambda x: 1 if x == True else 0)


In [7]:
print(f"ROC AUC Score: {roc_auc:.4f}")

ROC AUC Score: 0.8714
