# K Nearest Neighbors

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

## Get the Data 

In [None]:
# Titanic passenger data; original source: https://www.kaggle.com/c/titanic/data
# NOTE(review): the CSV is read from a Google Drive direct-download link,
# presumably a copy of the Kaggle training file -- confirm.

url = 'https://drive.google.com/uc?id=1zyYJMa75UXLjmODPRnl4_UrLJ0JKmEOC'
titanic_data = pd.read_csv(url)

In [None]:
# Preview the first rows of the raw data.
titanic_data.head()

In [None]:
# Column names, dtypes and non-null counts.
titanic_data.info()

## Data Preprocessing

In [None]:
# Number of missing values in each column.
titanic_data.isnull().sum()

In [None]:
# Remove the columns that are not used as model features.
titanic_data = titanic_data.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])

In [None]:
# Preview the remaining columns after the drop.
titanic_data.head()

In [None]:
# Impute missing ages: every NaN in 'Age' is replaced by the mean of the
# observed ages (Series.mean skips NaN by default).
age_mean = titanic_data['Age'].mean()
titanic_data['Age'] = titanic_data['Age'].mask(titanic_data['Age'].isna(), age_mean)

In [None]:
# Distinct values found in 'Embarked' (missing entries, if any, show up as nan).
set(titanic_data['Embarked'])

In [None]:
# Summary statistics for 'Embarked'.
titanic_data['Embarked'].describe()

In [None]:
# Frequency of each 'Embarked' value (value_counts leaves out NaN by default).
titanic_data['Embarked'].value_counts()

In [None]:
# Replace missing 'Embarked' entries with 'S'.
# NOTE(review): 'S' is presumably the most frequent value reported by
# value_counts() -- confirm against the data.
titanic_data['Embarked'] = titanic_data['Embarked'].mask(titanic_data['Embarked'].isna(), 'S')

In [None]:
# Replace the 'Sex' strings with integer codes. LabelEncoder numbers the
# classes in sorted order (presumably 'female' -> 0, 'male' -> 1 -- confirm).
titanic_data = titanic_data.assign(Sex=LabelEncoder().fit_transform(titanic_data['Sex']))

In [None]:
# Replace the 'Embarked' strings with integer codes (classes numbered in
# sorted order by LabelEncoder).
# NOTE(review): integer codes impose an order on a nominal feature, which
# affects the distances KNN computes; one-hot encoding may be preferable.
titanic_data = titanic_data.assign(Embarked=LabelEncoder().fit_transform(titanic_data['Embarked']))

In [None]:
# Preview the data after imputation and encoding.
titanic_data.head()

In [None]:
# Check dtypes and non-null counts after preprocessing.
titanic_data.info()

## Split Data into Train and Test Sets

In [None]:
# Separate the feature columns from the target column.
X = titanic_data.drop(['Survived'], axis = 1)
y = titanic_data['Survived']

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

## Standardize the Variables

In [None]:
# Scaler that standardizes each feature; KNN is distance-based, so features
# on larger scales would otherwise dominate the distance.
scaler = StandardScaler()

In [None]:
# Fit on the training split only, so test-set statistics do not leak into the scaling.
scaler.fit(X_train)

In [None]:
# Apply the scaling learned from the training split to both splits.
scaled_X_train = scaler.transform(X_train) 
scaled_X_test = scaler.transform(X_test) 

In [None]:
# Inspect the scaled training features.
scaled_X_train

## KNN model

In [None]:
# KNN classifier that uses the 5 nearest training points for each prediction.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit on the scaled training features and their labels.
knn.fit(scaled_X_train, y_train)

In [None]:
# Predict labels for the scaled test features.
pred = knn.predict(scaled_X_test)

In [None]:
### Compare actual and predicted labels

In [None]:
# True labels of the test split.
y_test

In [None]:
# Labels predicted for the test split.
pred

In [None]:
# Compare predictions with the true labels element-wise and report the
# fraction of test samples that were misclassified.
pred2 = np.array(pred, dtype=np.int64)
y_test2 = np.array(y_test, dtype=np.int64)

# Non-zero wherever the prediction and the true label disagree.
diff = np.abs(pred2 - y_test2)
count_errors = np.sum(diff != 0)

# Error rate; as the last expression of the cell it is what the notebook displays.
count_errors / diff.size

## Evaluation

In [None]:
# Confusion matrix: rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_test,pred)
cm

# Layout for the binary 0/1 labels:
# [[TN  FP]
#  [FN  TP]]
# Example reading from one recorded run; the exact counts depend on the
# train/test split and may differ when the notebook is re-run:
# 118 passengers correctly predicted as not survived (0).
# 9 passengers predicted as survived (1) but actually didn’t survive.
# 20 passengers predicted as not survived (0) but actually survived.
# 32 passengers correctly predicted as survived (1).

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test,pred))

In [None]:
# Fraction of test samples predicted correctly.
accuracy_score(y_test,pred)

## Choosing a K Value

In [None]:
# Estimate the error rate of each candidate K with 5-fold cross-validation on
# the training data only. Scoring the candidates on the test split would let
# the test data influence the choice of K, making the final test metrics
# optimistically biased.
from sklearn.model_selection import cross_val_score

error_rate = []

for i in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=i)
    # cross_val_score clones and fits the estimator once per fold; the default
    # score for a classifier is accuracy, so error = 1 - mean accuracy.
    # NOTE(review): the scaler was fitted on the full training split, so the
    # folds share scaling statistics; wrap scaler + KNN in a Pipeline if a
    # stricter estimate is needed.
    fold_accuracy = cross_val_score(knn, scaled_X_train, y_train, cv=5)
    error_rate.append(1 - fold_accuracy.mean())

In [None]:
# Plot the error rate measured for each candidate K (1 through 29).
plt.figure(figsize=(10, 6))
plt.plot(range(1, 30), error_rate, marker='o', markersize=10)
plt.gca().set(title='Error Rate vs. K Value', xlabel='K', ylabel='Error Rate')
plt.grid()

In [None]:
# Refit with K=10 and report the confusion matrix and per-class metrics on
# the test split.
# NOTE(review): K=10 was presumably read off the error-rate plot -- confirm.
knn = KNeighborsClassifier(n_neighbors=10).fit(scaled_X_train, y_train)
pred = knn.predict(scaled_X_test)

print(confusion_matrix(y_test, pred), end='\n\n')
print(classification_report(y_test, pred))