In [208]:
import pandas as pd

In [209]:
# pandas display settings: let long frames/series print up to 500 rows
# before the output is truncated
pd.options.display.max_rows = 500

In [210]:
# load the raw Titanic passenger table from disk
input_df = pd.read_csv('dataset/titanic.csv')

# show the column labels, then the (rows, columns) shape, one per line
print(input_df.columns, input_df.shape, sep='\n')

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
(891, 12)


In [211]:
# number of missing (NaN) entries in each column
input_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [212]:
# Cabin and Embarked also contain NaNs, but those columns are not used for now, so they can be ignored.
# Age has 177 NaNs out of 891 rows (~20%) - drop those rows -> done in the preprocess function

In [220]:
# take subset of columns and preprocess them
def preprocess(input_df):
    """Select the modelling columns, encode ``Sex`` as integers and drop incomplete rows.

    Keeps only the ``Sex``, ``Age`` and ``Survived`` columns. ``Sex`` is
    replaced by its pandas category code and the code -> label mapping is
    printed. Rows with a missing value in any of the three columns are
    removed. ``input_df`` itself is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing at least the columns ``Sex``, ``Age`` and ``Survived``.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``Sex`` (integer category codes), ``Age`` and
        ``Survived``; the original index labels of the kept rows are preserved.
    """
    columns = ['Sex', 'Age', 'Survived']
    # Explicit copy so the column assignment below never writes into (or warns
    # about) a slice of the caller's frame.
    df = input_df.loc[:, columns].copy()

    # Build the categorical once; the printed mapping and the stored codes are
    # then guaranteed to come from the same category order.
    sex = df['Sex'].astype('category')
    sex_cat_mapping = dict(enumerate(sex.cat.categories))
    print(f'sex cat mapping: {sex_cat_mapping}')

    # cat.codes encodes a missing Sex as -1, which dropna() cannot see, so
    # filter on the original missing-value mask before dropping the other
    # incomplete rows.
    sex_known = sex.notna().values
    df['Sex'] = sex.cat.codes
    df = df[sex_known].dropna()
    return df

In [221]:
# build the model-ready frame and preview its first five rows
df = preprocess(input_df)
df.head(n=5)

sex cat mapping: {0: 'female', 1: 'male'}


Unnamed: 0,Sex,Age,Survived
0,1,22.0,0
1,0,38.0,1
2,0,26.0,1
3,0,35.0,1
4,1,35.0,0


In [222]:
# split into train and test set
# The split is on index *labels* (original row numbers), not row positions.
# `.loc[a:b]` slices include both end labels, so slicing with `:split_idx`
# and `split_idx:` would put a row labelled `split_idx` into both sets.
# Disjoint masks guarantee every row lands in exactly one of them.
split_idx = 500

train_df = df.loc[df.index < split_idx]
test_df = df.loc[df.index >= split_idx]

In [223]:
from sklearn.neighbors import KNeighborsClassifier

# name of the label column; every other column is used as a feature
target_colname = 'Survived'
y = train_df[target_colname]
X = train_df.drop(target_colname, axis=1)

# k-nearest-neighbours classifier with k=3; fit() returns the fitted
# estimator itself, so `clf` below is the trained model and is displayed
clf = KNeighborsClassifier(n_neighbors=3).fit(X, y)
clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [224]:
# held-out labels and features, split the same way as the training data
y_test = test_df[target_colname]
X_test = test_df.drop(target_colname, axis=1)

In [225]:
# accuracy on the test set = fraction of predictions equal to the true label.
# Uses Series.mean() because no visible cell imports or defines a bare
# `mean`, so the previous call fails with NameError on a fresh kernel.
(clf.predict(X_test) == y_test).mean()

0.6962025316455697

In [231]:
# Try every odd k from 1 to 29 and print k followed by its test accuracy.
for num_neighbors in range(1, 30, 2):
    print(num_neighbors)

    clf = KNeighborsClassifier(num_neighbors)
    clf.fit(X, y)

    y_pred = clf.predict(X_test)
    # Series.mean() instead of a bare `mean`: no visible cell imports or
    # defines that name, so the loop would raise NameError on a fresh kernel.
    print((y_pred == y_test).mean())

1
0.6613924050632911
3
0.6962025316455697
5
0.7278481012658228
7
0.7215189873417721
9
0.7183544303797469
11
0.7088607594936709
13
0.7278481012658228
15
0.7151898734177216
17
0.7088607594936709
19
0.6930379746835443
21
0.6835443037974683
23
0.6835443037974683
25
0.6455696202531646
27
0.6424050632911392
29
0.6265822784810127


In [None]:
# todo: https://towardsdatascience.com/machine-learning-basics-with-the-k-nearest-neighbors-algorithm-6a6e71d01761