In [31]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sketch

from fasteda import fast_eda
from math import sqrt
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from ydata_profiling import ProfileReport

In [32]:
data = pd.read_csv('kaggle/input/spaceship-titanic/train.csv')
# profile = ProfileReport(data, title="Spaceship Titanic Profiling Report", explorative=True, dark_mode=True)
#profile
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Impute data manually

In [41]:
def inpute_data(df):
    
    ## if the passenter has been in CryoSleep, then they have not used any of the services
    cols_to_update = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df.loc[df['CryoSleep'] == True, cols_to_update] = 0
    
    ## sum all of the passengers expenses
    df['TotalExpenses'] = df[cols_to_update].sum(axis=1)
    df['TotalExpenses'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
    
    ## split the passenger id into group and member id
    df['GroupID'] = df['PassengerId'].str.split('_').str[0]
    df['MemberID'] = df['PassengerId'].str.split('_').str[1]
    
    #split the name into first and last name
    df['Name'] = df['Name'].astype(str)
    #df['FName'] = df['Name'].apply(lambda x: x.split(' ')[0] if len(x.split(' ')) > 0 else None)
    df['LName'] = df['Name'].apply(lambda x: x.split(' ')[1] if len(x.split(' ')) > 1 else None)
    
    ## split the cabin colum into deck, cabin number, and cabin side columns
    df['Deck'] = df['Cabin'].str.split('/').str[0]
    df['CabinNumber'] = df['Cabin'].str.split('/').str[1]
    df['CabinSide'] = df['Cabin'].str.split('/').str[2]
    
    ## use the group id to see if the passenger is traveling solo
    group_counts = df['GroupID'].value_counts()
    df['Solo'] = df['GroupID'].map(group_counts) == 1
    
    ## drop the columns we no longer need
    df = df.drop(['Cabin', 'PassengerId', 'Name'], axis=1)
    
    return df

## Impute reminaing empty data with KNNImputer



In [34]:
def knnimpute_data(df):
    scaler = StandardScaler()

    # Scale the entire dataset
    data_scaled = scaler.fit_transform(df)

    # Initialize the imputer
    imputer = KNNImputer(n_neighbors=5)

    # Fit the imputer on the scaled data (just the fitting, not transforming)
    imputer.fit(data_scaled)

    chunk_size = 1000
    num_chunks = int(np.ceil(df.shape[0] / chunk_size))

    imputed_data_list = []

    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = (i + 1) * chunk_size
        
        # Transform the chunk based on the previously fitted imputer
        imputed_chunk = imputer.transform(data_scaled[start_idx:end_idx])
        imputed_data_list.append(imputed_chunk)
        
        # Print progress
        print(f"Imputed chunk {i + 1} of {num_chunks}")

    # Combine all chunks
    imputed = np.vstack(imputed_data_list)

    # If you need to revert the scaling to original scale after imputation
    imputed_original_scale = scaler.inverse_transform(imputed)

    # Convert back to DataFrame 
    return pd.DataFrame(imputed_original_scale, columns=df.columns)

In [35]:
## manually impute what we can
data = inpute_data(data)

encoder_dict = {}

for col in ['HomePlanet', 'Destination', 'LName', 'Deck', 'CabinSide']:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    encoder_dict[col] = encoder
    

## use knn to impute the remainder
data = knnimpute_data(data)
data.isna().sum()
data = data.astype(int)
print(data.dtypes)

Imputed chunk 1 of 9
Imputed chunk 2 of 9
Imputed chunk 3 of 9
Imputed chunk 4 of 9
Imputed chunk 5 of 9
Imputed chunk 6 of 9
Imputed chunk 7 of 9
Imputed chunk 8 of 9
Imputed chunk 9 of 9
HomePlanet       int64
CryoSleep        int64
Destination      int64
Age              int64
VIP              int64
RoomService      int64
FoodCourt        int64
ShoppingMall     int64
Spa              int64
VRDeck           int64
Transported      int64
TotalExpenses    int64
GroupID          int64
MemberID         int64
LName            int64
Deck             int64
CabinNumber      int64
CabinSide        int64
Solo             int64
dtype: object


In [36]:
data.head(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,TotalExpenses,GroupID,MemberID,LName,Deck,CabinNumber,CabinSide,Solo
0,1,0,2,39,0,0,0,0,0,0,0,0,1,1,1431,1,0,0,1
1,0,0,2,24,0,109,9,25,549,44,1,736,2,1,2109,5,0,1,1
2,1,0,2,58,1,43,3576,0,6715,49,0,10383,3,1,1990,0,0,1,0
3,1,0,2,33,0,0,1283,371,3329,193,0,5176,3,2,1990,0,0,1,0
4,0,0,2,16,0,303,70,151,565,2,1,1091,4,1,1778,5,1,1,1


In [37]:
data.columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported',
       'TotalExpenses', 'GroupID', 'MemberID', 'LName', 'Deck', 'CabinNumber',
       'CabinSide', 'Solo'],
      dtype='object')

## Create train and test sets

In [38]:
X = data.drop('Transported', axis=1)
y = data['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=20)

In [39]:
xgboost_model = XGBClassifier(objective='reg:squarederror')
xgboost_model.fit(X_train, y_train)

y_pred_xgboost = xgboost_model.predict(X_test)

as_xgboost = accuracy_score(y_test, y_pred_xgboost)

print(f"Accuracy Score: {as_xgboost}")

Accuracy Score: 0.7942528735632184


In [40]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def objective(space):
    clf = XGBClassifier(
        objective='reg:squarederror',
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        learning_rate=space['learning_rate'],
        gamma=space['gamma'],
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree']
    )

    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    # We aim to maximize accuracy, therefore we return its negative value as loss
    return {'loss': -accuracy, 'status': STATUS_OK}

space = {
    'n_estimators': hp.quniform('n_estimators', 50, 1000, 1),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'gamma': hp.uniform('gamma', 0, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
print(f"Best parameters: {best}")


 15%|█▌        | 15/100 [00:43<04:05,  2.89s/trial, best loss: -0.8218390804597702]


KeyboardInterrupt: 

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Create and train CatBoost model
catboost_model = CatBoostClassifier(verbose=0)  # verbose=0 suppresses the output during training
catboost_model.fit(X_train, y_train)

# Predict on test set
y_pred_catboost = catboost_model.predict(X_test)

# Calculate accuracy
as_catboost = accuracy_score(y_test, y_pred_catboost)

print(f"Accuracy Score: {as_catboost}")

In [None]:
import catboost
from catboost import Pool, CatBoostClassifier

# Sample data
train_data = Pool(data=X_train, label=y_train)

# Define the model
model = CatBoostClassifier(iterations=500)

# Specify hyperparameters and their possible values
grid = {
    'learning_rate': [0.01, 0.1],
    'depth': [3,4],
    'l2_leaf_reg': [9, 11],
    'iterations': [500],
    'random_strength': [0, 1],
    'bagging_temperature': [0, 1],
    'boosting_type': ['Ordered', 'Plain'],
    'bootstrap_type': ['Bayesian', 'Bernoulli'],
}


# Grid search
result = model.grid_search(grid, 
                           train_data, 
                           plot=True, # This will plot the results (optional)
                           verbose=False, # Set to True if you want to see the progress
                           partition_random_seed=0)

# You can print or save the results if you want
print(result['params'])
print(result['cv_results'])

In [None]:
opt_catboost_model = CatBoostClassifier(**result['params'], verbose=0)  # verbose=0 suppresses the output during training
opt_catboost_model.fit(X_train, y_train)

# Predict on test set
y_pred_opt_catboost = opt_catboost_model.predict(X_test)

# Calculate accuracy
as_opt_catboost = accuracy_score(y_test, y_pred_opt_catboost)

print(f"Accuracy Score: {as_opt_catboost}")

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from catboost import CatBoostClassifier, cv, Pool
from sklearn.metrics import accuracy_score

def objective(params):
    model = CatBoostClassifier(
        learning_rate=params['learning_rate'],
        depth=int(params['depth']),
        l2_leaf_reg=params['l2_leaf_reg'],
        iterations=500,  # You can adjust this
        eval_metric='Accuracy',
        logging_level='Silent'
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(acc)
    return {'loss': -acc, 'status': STATUS_OK}

space = {
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'depth': hp.quniform('depth', 4, 10, 1),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0, 10),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,  # Adjust based on how many iterations you want
            trials=Trials())

print("Best parameters:", best)

In [None]:
hyperopt_catboost_model = CatBoostClassifier(**best, verbose=0)  # verbose=0 suppresses the output during training
hyperopt_catboost_model.fit(X_train, y_train)

# Predict on test set
y_pred_hyperopt_catboost = hyperopt_catboost_model.predict(X_test)

# Calculate accuracy
as_hyperopt_catboost = accuracy_score(y_test, y_pred_hyperopt_catboost)

print(f"Accuracy Score: {as_hyperopt_catboost}")

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import accuracy_score

base_classifier = ExtraTreesClassifier(random_state=0)
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=150, max_samples=0.8, random_state=0, n_jobs=-1)
bagging_classifier.fit(X_train, y_train)


y_pred = bagging_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

base_classifier = KNeighborsClassifier(n_neighbors=20)
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=150, max_samples=0.8, random_state=0)

bagging_classifier.fit(X_train, y_train)

y_pred = bagging_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
from sklearn.ensemble import BaggingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Initialize CatBoost and Bagging classifier
base_classifier = CatBoostClassifier(**best, verbose=0)  # verbose=0 suppresses the training output
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=50, max_samples=0.8, random_state=0)

# Train the Bagging classifier
bagging_classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = bagging_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
raw_test_data = pd.read_csv('kaggle/input/spaceship-titanic/test.csv')
test_data = raw_test_data.copy()
print(raw_test_data.columns)

## manually impute what we can
test_data = inpute_data(test_data)


encoder_dict = {}

for col in ['HomePlanet', 'Destination', 'LName', 'Deck', 'CabinSide']:
    encoder = LabelEncoder()
    test_data[col] = encoder.fit_transform(test_data[col])
    encoder_dict[col] = encoder

## use knn to impute the remainder
test_data = knnimpute_data(test_data)
test_data.isna().sum()

In [None]:
if(as_catboost > as_xgboost):
    print("Using CatBoost")
    y_pred = catboost_model.predict(test_data)
else:
    print("Using XGBoost")
    y_pred = xgboost_model.predict(test_data)
print(y_pred)