In [None]:
from skopt import BayesSearchCV
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_val_score, StratifiedKFold, HalvingRandomSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB, CategoricalNB, ComplementNB, BernoulliNB
from sklearn.neighbors import  KNeighborsClassifier

from xgboost import XGBClassifier

# 0 - Load Dataset

In [None]:
# Read the training and test splits from CSV files given by relative path
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Display the training frame as loaded
train

In [None]:
# Summary statistics of the training frame via DataFrame.describe()
train.describe()

# 1 - Cleaning Data
Identify Features with Missing Values

In [None]:
# Number of missing values in each column of the training frame
train.isna().sum()

# 2 - Feature Selection
### Test Correlation of each Feature with Target Variable

##### Pclass

In [None]:
# Cross-tabulate passenger class against the survival outcome
contingency_table = pd.crosstab(index=train['Pclass'], columns=train['Survived'])

# Chi-square test of independence; only the statistic and the p-value are reported
chi2, p_value, _, _ = chi2_contingency(contingency_table)
print(f"Chi-square value: {chi2}")
print(f"P-value: {p_value}")

# Row-normalise the counts: share (in %) of each outcome within every Pclass value
survival_percentages = (
    contingency_table
    .div(contingency_table.sum(axis=1), axis=0)
    .mul(100)
)

print("\nSurvival percentages for each Pclass value:")
print(survival_percentages.round(2))

##### Name

I am dropping the 'Name' feature for the first ML model because using it would require additional preprocessing:
- sharing a last name could convey information (spouse, relatives, etc.), but I suspect that this information correlates with Cabin and/or Ticket
- first name is not relevant to the problem
- title (Master, Mr, Mrs, Ms) might be interesting but correlates with Sex

Steps to analyse:
1. Extract last name by splitting by ','
2. Extract title by splitting by '.'
3. Test correlation with other features and target variable


##### Sex

In [None]:
# Cross-tabulate sex against the survival outcome
contingency_table = pd.crosstab(index=train['Sex'], columns=train['Survived'])

# Chi-square test of independence; only the statistic and the p-value are reported
chi2, p_value, _, _ = chi2_contingency(contingency_table)
print(f"Chi-square value: {chi2}")
print(f"P-value: {p_value}")

# Row-normalise the counts: share (in %) of each outcome within every Sex value
survival_percentages = (
    contingency_table
    .div(contingency_table.sum(axis=1), axis=0)
    .mul(100)
)

print("\nSurvival percentages for each Sex value:")
print(survival_percentages.round(2))

##### Age

In [None]:
# Work on the rows that have an Age; dropna returns a new frame, so `train` is untouched
age = train.dropna(subset=['Age'])
target_variable, feature = age['Survived'], age['Age']

# Pearson correlation between the survival flag and Age
correlation_coefficient, p_value = pearsonr(target_variable, feature)
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")

# Age distribution per survival outcome, with the correlation shown in the title
plt.figure(figsize=(10, 6))
sns.boxplot(data=age, x='Survived', y='Age')
plt.title(f'Boxplots of Age by Survived\nPearson Correlation: {correlation_coefficient:.2f}, P-value: {p_value:.4f}')
plt.xlabel('Survived')
plt.ylabel('Age')
plt.show()

##### SibSp

In [None]:
# Pearson correlation between the survival flag and SibSp
target_variable, feature = train['Survived'], train['SibSp']
correlation_coefficient, p_value = pearsonr(target_variable, feature)

# Report the coefficient and its p-value
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")

##### Parch

In [None]:
# Pearson correlation between the survival flag and Parch
target_variable, feature = train['Survived'], train['Parch']
correlation_coefficient, p_value = pearsonr(target_variable, feature)

# Report the coefficient and its p-value
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")

##### Ticket

Drop Ticket because it has 681 distinct values across 891 samples, i.e. it is nearly unique per passenger.

##### Fare

In [None]:
# Pearson correlation between the survival flag and Fare
target_variable, feature = train['Survived'], train['Fare']
correlation_coefficient, p_value = pearsonr(target_variable, feature)

print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")

# Fare distribution per survival outcome; the y-axis is limited to 0-150,
# so fares above 150 fall outside the visible range
plt.figure(figsize=(10, 6))
sns.boxplot(data=train, x='Survived', y='Fare')
plt.title(f'Boxplots of Fare by Survived\nPearson Correlation: {correlation_coefficient:.2f}, P-value: {p_value:.4f}')
plt.xlabel('Survived')
plt.ylabel('Fare')
plt.ylim(0, 150)
plt.show()

##### Cabin

In [None]:
# Keep only the passengers with a recorded cabin. The filtered frame is bound
# to `cabin` (not `age`), so the frame built for the Age analysis is left intact
# and the series below really exclude the missing cabins.
cabin = train.dropna(subset=['Cabin'])
target_variable = cabin['Survived']
feature = cabin['Cabin']

# Cross-tabulate cabin against the survival outcome
contingency_table = pd.crosstab(feature, target_variable)

# Perform chi-square test
# NOTE(review): if most cabin values occur only once or twice, the expected
# counts are tiny and the chi-square approximation is unreliable - check the
# table before reading much into this p-value.
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Interpret the results
print(f"Chi-square value: {chi2}")
print(f"P-value: {p_value}")

##### Embarked

In [None]:
# Create a contingency table
contingency_table = pd.crosstab(train['Embarked'], train['Survived'])

# Perform chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Interpret the results
print(f"Chi-square value: {chi2}")
print(f"P-value: {p_value}")

# Calculate the percentage of people who survived for each distinct value of 'Embarked'
survival_percentages = contingency_table.divide(contingency_table.sum(axis=1), axis=0) * 100

# Display the result
print("\nSurvival percentages for each Embarked value:")
print(round(survival_percentages, 2))

# 3 - Preprocessing

In [None]:
# Columns excluded from the model input; the same list is reused for the test set
col_to_drop = ['Cabin', 'Ticket', 'SibSp', 'Name', 'PassengerId']

# drop returns a new frame, `train` itself keeps all columns
train_filtered = train.drop(col_to_drop, axis='columns')
train_filtered

In [None]:
# Separate the predictors from the target column
X_train = train_filtered.drop(columns='Survived')
y_train = train_filtered['Survived']

# Feature groups; each group is routed to its own transformer below
numeric_features = ['Age', 'Parch', 'Fare']
categorical_features = ['Sex', 'Embarked']
ordinal_features = ['Pclass']

# Numeric: fill gaps from the 5 nearest neighbours, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

# Categorical: impute the most frequent value BEFORE one-hot encoding.
# Without this step a missing value becomes its own category, and a CV fold
# whose training part contains no missing value would fail when one shows up
# in the validation part (OneHotEncoder's handle_unknown defaults to 'error').
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first'))
])

# Ordinal: encode the class labels as consecutive integer codes
ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder())
])

# Combine transformers using ColumnTransformer; columns not listed in any
# group are dropped (ColumnTransformer's default remainder behaviour)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('ord', ordinal_transformer, ordinal_features)
    ])

# 4 - Training & Evaluation

In [None]:
# Seed passed to the CV splitters, the hyperparameter searches and the seeded classifiers below
random_state = 42

### RandomForest

In [None]:
# Create the final pipeline with preprocessing and RandomForestClassifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=random_state))
])

# Define cross-validation strategy (StratifiedKFold is suitable for classification tasks)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Define the hyperparameters and the values to sample from.
# scikit-learn's sampler treats any non-distribution value as a plain list of
# choices, so a tuple such as (10, 30) would only ever yield 10 or 30 (unlike
# skopt, where a 2-tuple is a range). The integer ranges are therefore spelled
# out explicitly, with both bounds included.
param_dist = {
    'classifier__n_estimators': list(range(10, 31)),
    'classifier__max_depth': list(range(1, 9)),
    'classifier__min_samples_split': list(range(2, 11)),
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
    'classifier__max_features': ['sqrt', 'log2', None]
}

# Successive halving: each round keeps roughly the best 1/factor of the candidates
random_search = HalvingRandomSearchCV(
    pipeline,
    param_distributions=param_dist,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    random_state=random_state,
    factor=4,
)

random_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best Hyperparameters:", random_search.best_params_)
print(f"Best Mean Accuracy across Folds: {random_search.best_score_:.4f}")

In [None]:
# Cross-validation: 10 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Preprocessing followed by a seeded random forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=random_state)),
])

# Exhaustive grid over forest size, tree depth and split threshold
# (criterion and max_features are not searched here and keep the estimator defaults)
param_grid = {
    'classifier__n_estimators': [6, 10, 16, 20, 30],
    'classifier__max_depth': [2, 4, 6, 8, 10],
    'classifier__min_samples_split': [2, 3, 4],
}

# Score every grid point by accuracy on all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean accuracy over the folds
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Best Mean Accuracy across Folds: {grid_search.best_score_:.4f}")

In [None]:
# Cross-validation: 10 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Preprocessing followed by a seeded random forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=random_state)),
])

# A one-point "grid": GridSearchCV is used here only to cross-validate
# this single hand-picked configuration
param_grid = {
    'classifier__n_estimators': [69],
    'classifier__max_depth': [11],
    'classifier__min_samples_split': [4],
    'classifier__criterion': ['gini'],
    'classifier__max_features': [None],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the configuration and its mean accuracy over the folds
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Best Mean Accuracy across Folds: {grid_search.best_score_:.4f}")

In [None]:
# Tabulate the per-candidate cross-validation results of the most recently fitted grid search
pd.DataFrame(grid_search.cv_results_)

In [None]:
# Cross-validation: 10 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Preprocessing followed by a seeded random forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=random_state)),
])

# Search space: 2-tuples are (low, high) bounds, lists are categorical choices
param_space = {
    'classifier__n_estimators': (20, 600),
    'classifier__max_depth': (1, 80),
    'classifier__min_samples_split': (2, 20),
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
}

# Bayesian optimisation: 64 evaluations, proposed 2 at a time, scored by accuracy
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    n_iter=64,
    n_points=2,
    scoring='accuracy',
    cv=cv,
    n_jobs=8,
    pre_dispatch='2*n_jobs',
    random_state=random_state,
    verbose=0,
)
bayes_search.fit(X_train, y_train)

# Report the winning combination and its mean accuracy over the folds
print("Best Hyperparameters:", bayes_search.best_params_)
print(f"Best Mean Accuracy across Folds: {bayes_search.best_score_:.4f}")


In [None]:
# Tabulate the per-candidate cross-validation results of the most recently fitted Bayesian search
pd.DataFrame(bayes_search.cv_results_)

### Logistic Regression 

In [None]:
# Cross-validation: 10 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Preprocessing followed by a seeded LogisticRegressionCV
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegressionCV(random_state=random_state)),
])

# Search space: 2-tuples are (low, high) bounds, lists are categorical choices.
# Only the l2 penalty is searched (other options noted: 'l1', 'elasticnet', None),
# and only two solvers (others noted: 'liblinear', 'newton-cg', 'sag', 'saga').
param_space = {
    'classifier__max_iter': (50, 10000),
    'classifier__penalty': ['l2'],
    'classifier__tol': (1e-6, 50),
    'classifier__solver': ['newton-cholesky', 'lbfgs'],
}

# Bayesian optimisation: 128 evaluations, proposed 2 at a time, scored by accuracy
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    n_iter=128,
    n_points=2,
    scoring='accuracy',
    cv=cv,
    n_jobs=8,
    pre_dispatch='2*n_jobs',
    random_state=random_state,
    verbose=0,
)
bayes_search.fit(X_train, y_train)

# Report the winning combination and its mean accuracy over the folds
print("Best Hyperparameters:", bayes_search.best_params_)
print(f"Best Mean Accuracy across Folds: {bayes_search.best_score_:.4f}")


In [None]:
# Tabulate the per-candidate cross-validation results of the most recently fitted Bayesian search
pd.DataFrame(bayes_search.cv_results_)

### GradientBoostingClassifier

In [None]:
# Cross-validation: 10 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Preprocessing followed by a seeded gradient boosting classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=random_state)),
])

# Search space: 2-tuples are (low, high) bounds, lists are categorical choices
param_space = {
    'classifier__loss': ['log_loss', 'exponential'],
    'classifier__learning_rate': (1e-2, 3),
    'classifier__n_estimators': (10, 100),
    'classifier__subsample': (0.6, 1),
    'classifier__min_samples_leaf': (0.0001, 0.1),
    'classifier__max_depth': (2, 30),
}

# Bayesian optimisation: 128 evaluations, proposed 2 at a time, scored by accuracy
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    n_iter=128,
    n_points=2,
    scoring='accuracy',
    cv=cv,
    n_jobs=8,
    pre_dispatch='2*n_jobs',
    random_state=random_state,
    verbose=0,
)
bayes_search.fit(X_train, y_train)

# Report the winning combination and its mean accuracy over the folds
print("Best Hyperparameters:", bayes_search.best_params_)
print(f"Best Mean Accuracy across Folds: {bayes_search.best_score_:.4f}")


In [None]:
# Tabulate the per-candidate cross-validation results of the most recently fitted Bayesian search
pd.DataFrame(bayes_search.cv_results_)

### XGBoost

In [None]:
# Cross-validation: 10 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Preprocessing followed by a seeded XGBoost classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=random_state, nthread=16)),
])

# Search space: 2-tuples are (low, high) bounds.
# min_split_loss and subsample are not part of this search.
param_space = {
    'classifier__learning_rate': (1e-3, 3),
    'classifier__max_depth': (4, 12),
    'classifier__n_estimators': (6, 24),
    'classifier__lambda': (1, 100),
    'classifier__alpha': (1e-3, 100),
}

# Bayesian optimisation: 64 evaluations, proposed 2 at a time, scored by accuracy
bayes_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space,
    n_iter=64,
    n_points=2,
    scoring='accuracy',
    cv=cv,
    n_jobs=8,
    pre_dispatch='2*n_jobs',
    random_state=random_state,
    verbose=0,
)
bayes_search.fit(X_train, y_train)

# Report the winning combination and its mean accuracy over the folds
print("Best Hyperparameters:", bayes_search.best_params_)
print(f"Best Mean Accuracy across Folds: {bayes_search.best_score_:.4f}")


In [None]:
# Tabulate the per-candidate cross-validation results of the most recently fitted Bayesian search
pd.DataFrame(bayes_search.cv_results_)

### Naive Bayes Classifier

In [None]:
# Cross-validation: 10 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Preprocessing followed by a Naive Bayes step; GaussianNB is only the
# placeholder, the grid below swaps the whole 'classifier' step
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB()),
])

# The "hyperparameter" searched is the estimator itself
param_grid = {
    'classifier': [GaussianNB(), BernoulliNB()],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=8,
)
grid_search.fit(X_train, y_train)

# Report the winning estimator and its mean accuracy over the folds
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Best Mean Accuracy across Folds: {grid_search.best_score_:.4f}")

### KNearestNeighbours

In [None]:
# Define the hyperparameter grid for the k-nearest-neighbours classifier
param_grid = {
    'classifier__n_neighbors': [3, 4, 5, 6],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # Minkowski exponent, 6 geometrically spaced values from 1 to 3
    'classifier__p': np.geomspace(1, 3, 6),
    # Truncating the geometric sequence to int maps its first two values
    # (1.0 and ~1.54) to 1; np.unique removes the repeat so that no
    # configuration is cross-validated twice.
    'classifier__leaf_size': np.unique(np.geomspace(1, 50, 10, dtype=int)),
}

# Create the final pipeline with preprocessing and KNeighborsClassifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

# Define cross-validation strategy (StratifiedKFold is suitable for classification tasks)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

# Create GridSearchCV object
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=8
)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameter values and corresponding mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Best Mean Accuracy across Folds: {grid_search.best_score_:.4f}")

# Test and Submission

In [None]:
# Apply the same column drop to the test set as was applied to the training set
X_test = test.drop(col_to_drop, axis='columns')

# Predict with whichever Bayesian search was fitted last
# (on a top-to-bottom run that is the XGBoost search)
predictions = bayes_search.predict(X_test)

# Write the PassengerId / Survived pairs to the submission file
output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('submission_4.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Apply the same column drop to the test set as was applied to the training set
X_test = test.drop(col_to_drop, axis='columns')

# Predict with whichever grid search was fitted last
# (on a top-to-bottom run that is the k-nearest-neighbours search)
predictions = grid_search.predict(X_test)

# Write the PassengerId / Survived pairs to the submission file
output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('submission_5.csv', index=False)
print("Your submission was successfully saved!")