In [14]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [15]:
# Importing the training dataset from the CSV
dataset = pd.read_csv('train.csv')
dataset = dataset.drop(['PassengerId', 'Name'], axis=1)
dataset['CryoSleep'] = dataset['CryoSleep'].astype(float)
dataset['VIP'] = dataset['VIP'].astype(float)
dataset['Transported'] = dataset['Transported'].astype(float)
label_encoders = {}
# Encoding categorical columns ('HomePlanet', 'Cabin', 'Destination') to numeric values
for column in ['HomePlanet', 'Cabin', 'Destination']:
    le = LabelEncoder()
    dataset[column] = le.fit_transform(dataset[column]).astype(float)
    label_encoders[column] = le
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   float64
 1   CryoSleep     8476 non-null   float64
 2   Cabin         8693 non-null   float64
 3   Destination   8693 non-null   float64
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   float64
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   float64
dtypes: float64(12)
memory usage: 815.1 KB


In [16]:
X = dataset.drop('Transported', axis=1)
y = dataset['Transported']
# Splitting the dataset into training and validation sets:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Convert training and validation sets into DMatrix format, which is optimized for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
# Define hyperparameters for the XGBoost model
param = {
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'eval_metric': ['logloss']
}
num_round = 15
evallist = [(dval, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)
y_pred_val = bst.predict(dval, ntree_limit=bst.best_ntree_limit)
y_pred_binary = [1 if prob > 0.5 else 0 for prob in y_pred_val]

accuracy = accuracy_score(y_val, y_pred_binary)
print(f"Accuracy: {accuracy * 100:.2f}%")

[0]	eval-logloss:0.58963	train-logloss:0.58147
[1]	eval-logloss:0.53236	train-logloss:0.51644
[2]	eval-logloss:0.49716	train-logloss:0.47635
[3]	eval-logloss:0.47519	train-logloss:0.44746
[4]	eval-logloss:0.45923	train-logloss:0.42662
[5]	eval-logloss:0.44981	train-logloss:0.41181
[6]	eval-logloss:0.44085	train-logloss:0.40050
[7]	eval-logloss:0.43577	train-logloss:0.38783
[8]	eval-logloss:0.43246	train-logloss:0.37933
[9]	eval-logloss:0.42852	train-logloss:0.37044
[10]	eval-logloss:0.42717	train-logloss:0.36639
[11]	eval-logloss:0.42706	train-logloss:0.36307
[12]	eval-logloss:0.42515	train-logloss:0.35456
[13]	eval-logloss:0.42385	train-logloss:0.35152
[14]	eval-logloss:0.42325	train-logloss:0.34877
Accuracy: 79.99%




In [18]:
# Importing the test dataset from the CSV
testset = pd.read_csv('test.csv')
testset = testset.drop(['PassengerId', 'Name'], axis=1)
testset['CryoSleep'] = testset['CryoSleep'].astype(float)
testset['VIP'] = testset['VIP'].astype(float)
testset['Transported'] = testset['Transported'].astype(float)
label_encoders = {}
for column in ['HomePlanet', 'Cabin', 'Destination']:
    le = LabelEncoder()
    testset[column] = le.fit_transform(testset[column]).astype(float)
    label_encoders[column] = le
testset.info()
dtest = xgb.DMatrix(testset.drop('Transported', axis=1))
y_pred_test = bst.predict(dtest)
y_pred_binary = [True if prob > 0.5 else False for prob in y_pred_test]

sample_submission_df = pd.read_csv('sample_submission.csv')
sample_submission_df['Transported'] = y_pred_binary
sample_submission_df.to_csv('sample_submission.csv', index=False)
sample_submission_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   float64
 1   CryoSleep     4184 non-null   float64
 2   Cabin         4277 non-null   float64
 3   Destination   4277 non-null   float64
 4   Age           4186 non-null   float64
 5   VIP           4184 non-null   float64
 6   RoomService   4195 non-null   float64
 7   FoodCourt     4171 non-null   float64
 8   ShoppingMall  4179 non-null   float64
 9   Spa           4176 non-null   float64
 10  VRDeck        4197 non-null   float64
 11  Transported   0 non-null      float64
dtypes: float64(12)
memory usage: 401.1 KB


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
