# Solution

A simple solution using scikit-learn pipelines, cross-validation, and XGBoost.

In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load both CSV files the same way, using the first column as the row index.
input_data, output_data = (
    pd.read_csv(csv_name, index_col=0) for csv_name in ("train.csv", "test.csv")
)

input_data.shape

(891, 11)

### Data cleaning and train/validation splits

In [3]:
from sklearn.model_selection import train_test_split

features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

X = input_data[features]
y = input_data["Survived"]

# Split the columns into "numeric" and "everything else" instead of keying on
# the `object` dtype. A non-numeric column that is not stored as `object`
# (category, bool, a dedicated string dtype, ...) would otherwise be treated
# as numeric by any downstream preprocessing that relies on `num_columns`.
num_columns = X.select_dtypes(include="number").columns
cat_columns = X.select_dtypes(exclude="number").columns

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=69)
X_test = output_data[features]

X_train

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
508,1,male,,0,0,26.5500,S
202,3,male,,8,2,69.5500,S
181,3,female,,8,2,69.5500,S
79,2,male,0.83,0,2,29.0000,S
73,2,male,21.00,0,0,73.5000,S
...,...,...,...,...,...,...,...
440,2,male,31.00,0,0,10.5000,S
603,1,male,,0,0,42.4000,S
620,2,male,26.00,0,0,10.5000,S
204,3,male,45.50,0,0,7.2250,C


### Pipeline creation

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Numeric features: fill missing values with the column median, then standardise.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fitting
# are ignored rather than raising an error.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Apply each sub-pipeline to its own group of columns.
preprocessor = ColumnTransformer([
    ("numerical", num_transformer, num_columns),
    ("categorical", cat_transformer, cat_columns),
])

In [5]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier: 10 estimators, maximum tree depth of 7.
xgb_model = XGBClassifier(n_estimators=10, max_depth=7)

# Chain preprocessing and the classifier so they are fitted and applied together.
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", xgb_model),
])

model_pipeline

In [6]:
# Fit on the training split, then report the score on the held-out validation split.
model_pipeline.fit(X_train, y_train).score(X_valid, y_valid)

0.8268156424581006

### Generating Results

In [7]:
# Predict on the test features and label each prediction with the
# corresponding row index of the test set.
survived_column = pd.DataFrame(
    {"Survived": model_pipeline.predict(X_test)},
    index=X_test.index,
)

survived_column.to_csv("my_submission.csv")

### Hyperparameter selection and cross-validation

In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the pipeline's "model" step:
# 5..29 estimators and tree depths 3..10.
parameter_grid = {
    "model__n_estimators": range(5, 30),
    "model__max_depth": range(3, 11),
}

# Exhaustive search over the grid, scored with 4-fold cross-validation.
grid_search = GridSearchCV(model_pipeline, parameter_grid, cv=4)

In [9]:
# Run the grid search on the training split only; the validation split is not used here.
result = grid_search.fit(X_train, y_train)

In [10]:
# Mean cross-validated score of the best parameter combination found by the search.
result.best_score_

0.8328651685393258

# EDA

Initial analysis shows that 77% of the Cabin data is missing. However valuable it might have been, the simplest option here is to drop the Cabin column. The Age column can still be used, as only around 20% of its values are missing.

The Name and Ticket columns will also be dropped when modelling. While further analysis could be done on these columns (for example, whether passengers share a ticket), it is outside the scope of this simple solution, and on their own these columns do not provide value to a potential model.

In [11]:
# Show the raw training frame, including the columns that were not used as model features.
input_data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [12]:
# Share of missing values per column: the mean of a boolean mask is the
# fraction of True entries.
input_data.isna().mean()

Survived    0.000000
Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.198653
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.000000
Cabin       0.771044
Embarked    0.002245
dtype: float64

In [13]:
# Summary statistics for the Ticket column.
input_data["Ticket"].describe()

count        891
unique       681
top       347082
freq           7
Name: Ticket, dtype: object