In [50]:
import seaborn as sns
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [44]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,


In [47]:
#Inspect the data set. There are 5 columns which contain ‘easy to work with’ qualitative or quantitative variables. Which are they?
#sex, age, sibsp, parch, pclass

#Create dummy variables for the qualitative variable and add them.
dummies = pd.get_dummies(df['Sex'])
df = pd.concat([df, dummies], axis=1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,female,male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,,0,1


In [58]:
df_subset = df[['SibSp','Parch', 'female', 'Age', 'Pclass', 'Survived']]
df_subset.head()

Unnamed: 0,SibSp,Parch,female,Age,Pclass,Survived
0,1,0,0,22.0,3,0
1,1,0,1,38.0,1,1
2,0,0,1,26.0,3,1
3,1,0,1,35.0,1,1
4,0,0,0,35.0,3,0


In [60]:
#getting rid of the empty rows
df_subset = df_subset.dropna()
df_subset.head()

Unnamed: 0,SibSp,Parch,female,Age,Pclass,Survived
0,1,0,0,22.0,3,0
1,1,0,1,38.0,1,1
2,0,0,1,26.0,3,1
3,1,0,1,35.0,1,1
4,0,0,0,35.0,3,0


In [61]:
#Split the data into a training and data set.

X = df_subset[['SibSp','Parch', 'female', 'Age', 'Pclass']] #create the X matrix
X = normalize(X) #normalize the matrix to put everything on the same scale
y = df_subset['Survived'] #create the y-variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) #split the data, store it into different variables

In [62]:
# Train a kNN-algorithm on the data, with k = 3

knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(X_train, y_train) #this fits the k-nearest neigbor model with the train data
knn.score(X_test, y_test) #calculate the fit on the test data

0.8

In [63]:
y_test_pred = knn.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_test_pred) #creates a "confusion matrix"
cm

array([[111,  23],
       [ 20,  61]])

In [64]:
#In order to read it easily , let's make a dataframe out of it, and add labels to it.
conf_matrix = pd.DataFrame(cm, index=['NotSurvived', 'Survived'], columns = ['NotSurvived_p', 'Survived_p']) 
conf_matrix

Unnamed: 0,NotSurvived_p,Survived_p
NotSurvived,111,23
Survived,20,61


In [4]:
# Calculate accuracy, precision and recall for survival on the test set
recall = 61 / (20 + 61)
recall

0.7530864197530864

In [3]:
#how much of the predicted survived did survice (columns)
precision = 61 / (23 + 61)
precision

0.7261904761904762

In [71]:
accuracy = (61 + 111) / (111 + 23 + 20 + 61)
accuracy

0.8