### Data sets were acquired from https://www.kaggle.com/c/titanic

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# Read both Kaggle CSV files, using the PassengerId column as the row index.
df_train, df_test = (
    pd.read_csv(csv_name, index_col=['PassengerId'])
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the top five rows of the training frame
df_train.head(n=5)

In [None]:
# Fraction of missing values per column (mean of the boolean NA mask)
df_train.isna().mean()

In [None]:
def drop_irrelevant(df):
    """Return ``df`` without the 'Name', 'Ticket' and 'Cabin' columns.

    Columns that are already absent are skipped rather than raising, so the
    function can safely be applied to a frame that was reduced before (e.g.
    when the calling cell is re-run). The input frame is not modified; a new
    DataFrame is returned.
    """
    return df.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')
# Apply the column drop to both splits; the reduced test split is stored as X_test.
df_train, X_test = (drop_irrelevant(split) for split in (df_train, df_test))

In [None]:
# Separate the training frame into the feature matrix X and the target y
X = df_train.drop(columns=['Survived']).copy()
y = df_train['Survived'].copy()

In [None]:
# Retrieve num_cols and cat_cols.
# select_dtypes is the public pandas API for picking columns by dtype (it
# replaces the private X._get_numeric_data()). 'bool' is listed so boolean
# columns, if any, are still routed to the numeric group -- TODO confirm this
# matches the previous grouping on your pandas version.
num_cols = list(X.select_dtypes(include=['number', 'bool']).columns)
# Walk X.columns in order instead of taking a set difference: set iteration
# order is not defined, which would let the categorical (and therefore the
# encoded) feature order change between kernel runs.
cat_cols = [col for col in X.columns if col not in num_cols]

In [None]:
# Fraction of missing values per remaining column (mean of the boolean NA mask)
df_train.isna().mean()

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode with the first level of each feature dropped; unknown categories
# are configured to raise.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='error', drop='first'),
)

# Numeric branch: IterativeImputer with its default settings, then scaling.
num_pipe = make_pipeline(IterativeImputer(), StandardScaler())

# Send each column group through its own branch (categorical output first).
preprocess_pipeline = make_column_transformer(
    (cat_pipe, cat_cols),
    (num_pipe, num_cols),
)

In [None]:
# Fit the preprocessing on the training features only, then push both the
# training and the test features through the fitted transformer.
preprocessor = preprocess_pipeline.fit(X)
X, X_test = preprocessor.transform(X), preprocessor.transform(X_test)

In [None]:
# Horizontal box plots of the columns of df_train, drawn with the "Set2" palette
ax = sb.boxplot(
    palette="Set2",
    orient="h",
    data=df_train,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# Random forest used by Boruta as its importance estimator: runs on all
# cores, uses balanced class weights and shallow trees (max_depth=5).
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=1)

# find all relevant features; the target is handed over as a plain array
feat_selector.fit(X, np.asarray(y))

# check selected features.
# The support mask refers to the columns of the *transformed* matrix, so the
# names have to come from the fitted preprocessor. df_train.columns describes
# the raw frame (it still contains 'Survived' and predates one-hot encoding)
# and does not line up with the mask.
feature_names = np.asarray(preprocessor.get_feature_names_out())
print(f'selected features: {list(feature_names[feat_selector.support_])}')

# check ranking of features
print(f'check ranking of features {feat_selector.ranking_}')

# call transform() on X to filter it down to selected features
X = feat_selector.transform(X)
X_test = feat_selector.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows as a validation set, stratified on the target.
# random_state is fixed so the split is identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=1
)

In [None]:
from tune_sklearn import TuneSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

# Search space handed to TuneSearchCV. Every entry is a single-value list,
# so all trials evaluate the same configuration.
# - 'objective' is 'binary:logistic' because the target is a binary class
#   label ('reg:linear' is a regression objective).
# - 'verbosity' replaces the deprecated 'silent' flag.
parameters = {
    'colsample_bytree': [1.0],
    'gamma': [0.5],
    'learning_rate': [0.2],
    'max_depth': [5],
    'min_child_weight': [10],
    'n_estimators': [700],
    'objective': ['binary:logistic'],
    'reg_alpha': [1.3],
    'reg_lambda': [1.2],
    'verbosity': [0],
    'subsample': [0.9],
}

# The estimator is built with defaults; its hyper-parameters come from the
# search space above. Training data, boosting rounds, nfold and verbose_eval
# are arguments of xgboost's cv/train functions, not of the XGBClassifier
# constructor, so they are no longer passed there. The 10-fold stratified
# scheme is given to the search through `cv` instead.
optuna_tune_search = TuneSearchCV(
    XGBClassifier(),
    param_distributions=parameters,
    n_trials=5,
    early_stopping=True,
    max_iters=10,
    search_optimization="optuna",
    cv=StratifiedKFold(10),
)

# NOTE(review): the search is fitted on the full X / y, as before; the
# X_train / X_val hold-out split is not used by this cell.
optuna_tune_search.fit(X, y)

In [None]:
# Predict on the test features, cast the labels to int, and wrap them in a
# one-column frame indexed like df_test.
y_pred = pd.DataFrame(
    {'Survived': optuna_tune_search.predict(X_test).astype(int)},
    index=df_test.index,
)
# Write the predictions to disk
y_pred.to_csv('Predictions.csv')