In [50]:
# imports
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Show up to 500 rows when a DataFrame or Series is rendered in this notebook.
pd.options.display.max_rows = 500

In [51]:
# Load the passenger records from the CSV file (path is relative to the notebook).
input_df = pd.read_csv('dataset/titanic.csv')

# Quick look at the schema and size, followed by a preview of the first rows.
column_summary = f'columns: {input_df.columns}\n'
shape_summary = f'shape: {input_df.shape}'
print(column_summary)
print(shape_summary)
input_df.head()

columns: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We will predict whether a person survived based on their sex and age.
To keep things as simple as possible, we will use only 3 columns: Sex, Age and Survived (the label).

In [52]:
# Keep the two feature columns plus the label column; everything else is ignored.
selected_columns = ['Sex', 'Age', 'Survived']
df = input_df[selected_columns]
df.head()

Unnamed: 0,Sex,Age,Survived
0,male,22.0,0
1,female,38.0,1
2,female,26.0,1
3,female,35.0,1
4,male,35.0,0


Let's check for NaN values

In [53]:
# Number of missing values in each column.
df.isna().sum()

Sex           0
Age         177
Survived      0
dtype: int64

We can see that there are 177 NaN values in the Age column, which is around 20% of the 891 rows.
Let's drop those rows so that we keep only clean, complete data without missing values.

In [54]:
# Discard every row that has at least one missing value, then confirm that
# no missing values remain in any column.
df = df.dropna(axis=0, how='any')
df.isna().sum()

Sex         0
Age         0
Survived    0
dtype: int64

Before applying kNN we need to handle the categorical Sex column - let's encode its values as integer category codes (for a two-valued column this is equivalent to a single 0/1 indicator)

In [55]:
# Encode the text 'Sex' column as integer category codes so kNN can compute
# distances on it. The categorical view is built once and reused for both the
# code -> label mapping and the codes themselves.
sex_categorical = df['Sex'].astype('category')
sex_cat_mapping = dict(enumerate(sex_categorical.cat.categories))

# assign() returns a new frame instead of writing a column into `df`, which was
# derived from an earlier selection; this avoids pandas' SettingWithCopyWarning.
df = df.assign(Sex=sex_categorical.cat.codes)

print(f'sex category mapping: {sex_cat_mapping}')
df.head()

sex category mapping: {0: 'female', 1: 'male'}


Unnamed: 0,Sex,Age,Survived
0,1,22.0,0
1,0,38.0,1
2,0,26.0,1
3,0,35.0,1
4,1,35.0,0


Now we have our dataset prepared for applying some machine learning :) In this case we will use kNN

Let's split our data into train and test datasets

In [56]:
# Rows whose original index label is below this value form the training set.
split_idx = 500

# The split is made on index labels, not positions (the index has gaps once
# rows with missing values were dropped). Label slices with .loc include both
# endpoints, so `df.loc[:split_idx]` and `df.loc[split_idx:]` would both
# contain the row labelled `split_idx`, leaking a test row into training.
# Complementary boolean masks guarantee that the two sets are disjoint.
is_train_row = df.index < split_idx
train_df = df[is_train_row]
test_df = df[~is_train_row]

Split our data into input dataframe and label series

In [57]:
# Name of the label column; every other column is used as a feature.
target_colname = 'Survived'

# Labels as a Series, features as a DataFrame without the label column.
y = train_df[target_colname]
X = train_df.drop(target_colname, axis='columns')

Load train data into kNN

In [58]:
# Number of nearest neighbours that vote on each prediction.
k = 3

# NOTE(review): X holds 0/1 sex codes next to Age in years with no scaling, so
# distances are presumably driven mostly by Age - consider standardising.
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

Prepare inputs for test dataset

In [59]:
# Same feature/label separation as for the training data, applied to test_df.
y_test = test_df[target_colname]
X_test = test_df.drop(target_colname, axis='columns')

Make predictions for test set

In [60]:
# Predict a label for every row of the test features with the fitted classifier.
y_pred = clf.predict(X_test)

Compare the ground-truth labels (y_test) with the predictions (y_pred) to assess prediction quality

In [62]:
# Per-class precision, recall and F1 of the predictions against the true labels.
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.71      0.82      0.76       184
           1       0.68      0.52      0.59       132

    accuracy                           0.70       316
   macro avg       0.69      0.67      0.67       316
weighted avg       0.69      0.70      0.69       316

