In [15]:
import pandas as pd
import os
import torch
import mlflow
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import random
from torch.utils.data import DataLoader
from skopt import BayesSearchCV
from skorch import NeuralNetClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from prepare_data import clean_df
from xgboost  import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score

In [16]:
# Seed every random number generator used in this notebook (torch, stdlib
# random, NumPy) with the same value so repeated runs give the same results.
SEED = 0
for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(SEED)

In [17]:
# Load the raw CSVs, fill missing Cabin/Age values in the training frame with
# pre-computed predictions, preprocess it, and split it into training and
# validation arrays.
DATA_PATH = 'data'
TRAIN_SIZE = 700  # rows used for training; every remaining row goes to validation

# Train
train_data_path = os.path.join(DATA_PATH,'train.csv')
train_df = pd.read_csv(train_data_path, sep=',')

# Test (raw: it is not passed through clean_df in this cell)
test_data_path = os.path.join(DATA_PATH,'test.csv')
test_df = pd.read_csv(test_data_path, sep=',')

# load predicted features (each file is indexed by its 'PassengerId' column)
# Cabin letters
cabin_chars_path = 'filled_null_cabin_chars.csv'
cabin_chars = pd.read_csv(cabin_chars_path, sep=',')
cabin_chars['value'] = (cabin_chars['value']).astype(str)
cabin_chars.set_index('PassengerId',inplace=True)

# Cabin numbers
cabin_numbers_path = 'filled_null_cabin_numbers.csv'
cabin_numbers = pd.read_csv(cabin_numbers_path, sep=',')
cabin_numbers['value'] = (cabin_numbers['value']).astype(str)
cabin_numbers.set_index('PassengerId',inplace=True)

# Age
age_value_path = 'filled_null_ages.csv'
age_value_chars = pd.read_csv(age_value_path, sep=',')
age_value_chars['value'] = (age_value_chars['value']).astype(float)
age_value_chars.set_index('PassengerId',inplace=True)

# Fill null values with predicted values. The cabin letter and number strings
# are concatenated element-wise (aligned on the shared index).
# NOTE(review): .loc selects by row label and train_df still carries the default
# integer labels from read_csv, so this relies on the 'PassengerId' values in
# the helper CSVs matching those row labels - confirm.
train_df.loc[cabin_chars.index, 'Cabin'] = cabin_chars['value'] + cabin_numbers['value']
train_df.loc[age_value_chars.index, 'Age'] = age_value_chars['value']

# Preprocess data
train_df = clean_df(train_df)

print(train_df.head())
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
train_df.info()

# Change df to np
train_df = train_df.to_numpy()
test_df = test_df.to_numpy()

# Split data; the validation size is derived from the actual row count so the
# split keeps working if preprocessing changes the number of rows.
val_size = len(train_df) - TRAIN_SIZE
train_set, val_set = torch.utils.data.random_split(train_df, [TRAIN_SIZE, val_size]) # to do: split before preprocessing

# Turn the Subset objects returned by random_split into plain arrays so they
# can be sliced by column.
train_set = np.array(train_set)
val_set = np.array(val_set)
# The last column is the label; everything before it is a feature.
X_train,y_train = train_set[:,:-1],train_set[:,-1]
# NOTE: despite the names, X_test / y_test hold the validation split.
X_test,y_test = val_set[:,:-1],val_set[:,-1]

# Create dataloaders
batach_size = 16
train_dataloader = DataLoader(dataset=train_set, batch_size=batach_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_set, batch_size=batach_size, shuffle=False)
test_dataloader = DataLoader(dataset=test_df, batch_size=batach_size, shuffle=False)

   Pclass  Sex       Age  SibSp  Parch    Ticket      Fare  Cabin_char_id  \
0     1.0  0.0  0.271174  0.125    0.0  0.006826  0.014151           0.75   
1     0.0  1.0  0.472229  0.125    0.0  0.005675  0.139136           0.25   
2     1.0  1.0  0.321438  0.000    0.0  0.999989  0.015469           0.50   
3     0.0  1.0  0.434531  0.125    0.0  0.036695  0.103644           0.25   
4     1.0  0.0  0.434531  0.000    0.0  0.120417  0.015713           0.75   

   Cabin_number  Embarked_C  Embarked_Q  Embarked_S  Survived  
0      0.439189         0.0         0.0         1.0       0.0  
1      0.574324         1.0         0.0         0.0       1.0  
2      0.621622         0.0         0.0         1.0       1.0  
3      0.831081         0.0         0.0         1.0       1.0  
4      0.304054         0.0         0.0         1.0       0.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
--- 

In [18]:
# X_train / y_train are already NumPy arrays, so the *_np names are plain aliases
X_train_np, y_train_np = X_train, y_train

# Baseline XGBoost classifier with hand-picked hyperparameters
baseline_params = dict(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=42)
xgb_model = XGBClassifier(**baseline_params)

# Fit on the training split (last expression, so the fitted estimator is displayed)
xgb_model.fit(X_train_np, y_train_np)

In [19]:
# Predict on the held-out validation split (stored as X_test / y_test)
y_pred = xgb_model.predict(X_test)

# Score the predictions
accuracy, conf_matrix = accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

# Report accuracy (two decimals) followed by the confusion matrix
print(f"Accuracy: {accuracy:.2f}", "Confusion Matrix:", conf_matrix, sep="\n")

Accuracy: 0.92
Confusion Matrix:
[[103   8]
 [  8  72]]


In [20]:
# Pick the compute device: use the GPU when torch reports one, otherwise fall
# back to the CPU so the search still runs on machines without CUDA.
# NOTE(review): torch's CUDA availability is used as a proxy for XGBoost's GPU
# support - confirm the installed XGBoost build is CUDA-enabled.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Define param space
param_space = {
    'learning_rate': (1e-3, 1e-1, 'log-uniform'),
    'n_estimators': [100,120,140,160,180,200],
    'max_depth': [1,2,3,4,5,6,7],
    'device': [device],

}

# Initialize model
xgb_model = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, device=device)

# Initialize Bayesian Optimization.
# random_state pins the search so that re-running this cell explores the same
# candidates instead of depending on the current global NumPy RNG state.
opt = BayesSearchCV(
    xgb_model,
    param_space,
    n_iter=64,
    cv=5,
    random_state=SEED
)

In [21]:
# Run the Bayesian hyperparameter search on the training split
opt.fit(X_train, y_train)
# Best hyperparameter combination found by the search (shown as the cell output)
opt.best_params_

OrderedDict([('device', 'cuda'),
             ('learning_rate', 0.02191987674009792),
             ('max_depth', 4),
             ('n_estimators', 160)])

In [22]:
# Build a classifier from the hyperparameters selected by the Bayesian search
best = opt.best_params_
xgb_model = XGBClassifier(
    learning_rate=best['learning_rate'],
    n_estimators=best['n_estimators'],
    max_depth=best['max_depth'],
    random_state=42,
)

# 4-fold cross-validated accuracy on the training split
scores = cross_val_score(xgb_model, X_train, y_train, cv=4, scoring='accuracy')
scores

array([0.94857143, 0.94857143, 0.91428571, 0.93142857])

In [23]:
# Re-create the classifier with the hyperparameters chosen by the search and
# fit it on the full training split
tuned_params = {
    key: opt.best_params_[key]
    for key in ('learning_rate', 'n_estimators', 'max_depth')
}
xgb_model = XGBClassifier(random_state=42, **tuned_params)
xgb_model.fit(X_train, y_train)

In [24]:
# Predict on the held-out validation split (stored as X_test / y_test)
y_pred = xgb_model.predict(X_test)

# Evaluate the tuned model
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report accuracy (two decimals) followed by the confusion matrix
report_lines = [f"Accuracy: {accuracy:.2f}", "Confusion Matrix:", str(conf_matrix)]
print("\n".join(report_lines))

Accuracy: 0.92
Confusion Matrix:
[[102   9]
 [  6  74]]


Accuracy: 0.82
[[111  22]
 [ 12  46]]

In [25]:
# Load the test CSV, fill missing Cabin/Age values with pre-computed
# predictions, preprocess it, and convert it to a NumPy array for prediction.
DATA_PATH = 'data'
test_data_path = os.path.join(DATA_PATH,'test.csv')
test_df = pd.read_csv(test_data_path, sep=',')

# load predicted features (each file is indexed by its 'PassengerId' column)
# Cabin letters
cabin_chars_path = 'filled_null_cabin_chars_test.csv'
cabin_chars = pd.read_csv(cabin_chars_path, sep=',')
# change type to str
cabin_chars['value'] = (cabin_chars['value']).astype(str)
# set index
cabin_chars.set_index('PassengerId',inplace=True)

# Cabin numbers
cabin_numbers_path = 'filled_null_cabin_numbers_test.csv'
cabin_numbers = pd.read_csv(cabin_numbers_path, sep=',')
cabin_numbers['value'] = (cabin_numbers['value']).astype(str)
cabin_numbers.set_index('PassengerId',inplace=True)

# Age
age_value_path = 'filled_null_ages_test.csv'
age_value_chars = pd.read_csv(age_value_path, sep=',')
age_value_chars['value'] = (age_value_chars['value']).astype(float)
age_value_chars.set_index('PassengerId',inplace=True)

# Fill null values. The cabin letter and number strings are concatenated
# element-wise (aligned on the shared index).
# NOTE(review): .loc selects by row label and test_df still carries the default
# integer labels from read_csv, so this relies on the 'PassengerId' values in
# the helper CSVs matching those row labels - confirm.
test_df.loc[cabin_chars.index, 'Cabin'] = cabin_chars['value'] + cabin_numbers['value']
test_df.loc[age_value_chars.index, 'Age'] = age_value_chars['value']

# Preprocess data
test_df = clean_df(test_df, test=True)

print(test_df.head())
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
test_df.info()

# change df to np
test_df = test_df.to_numpy()
test_set = np.array(test_df)

   Pclass  Sex       Age  SibSp     Parch    Ticket      Fare  Cabin_char_id  \
0     1.0  0.0  0.452723  0.000  0.000000  0.106700  0.015282       0.333333   
1     1.0  1.0  0.617566  0.125  0.000000  0.117134  0.013663       0.833333   
2     0.5  0.0  0.815377  0.000  0.000000  0.077475  0.018909       0.333333   
3     1.0  0.0  0.353818  0.000  0.000000  0.101619  0.016908       0.833333   
4     1.0  1.0  0.287881  0.125  0.111111  0.999995  0.023984       0.833333   

   Cabin_number  Embarked_C  Embarked_Q  Embarked_S  
0      0.174242         0.0         1.0         0.0  
1      0.325758         0.0         0.0         1.0  
2      0.303030         0.0         1.0         0.0  
3      0.166667         0.0         0.0         1.0  
4      0.204545         0.0         0.0         1.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0

In [26]:
# Predict labels for the preprocessed test set and convert them to a plain list
pred = xgb_model.predict(test_set)
outputs = pred.tolist()

# Re-read the raw test.csv to recover the PassengerId column
test_data_path = os.path.join(DATA_PATH,'test.csv')
test_df = pd.read_csv(test_data_path, sep=',')

# Assemble the answer frame: one row per passenger with its predicted label
answear = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': outputs,
})

In [27]:
# Display the answer frame (PassengerId and predicted Survived) for a visual check
answear

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [28]:
# Save the predictions to 'answear.csv' in the working directory;
# index=False leaves the DataFrame row index out of the file.
answear.to_csv('answear.csv', index=False)