# K-Nearest Neighbors

## Setup

In [2]:
import pandas as pd
from sklearn import tree

In [3]:
# Load the files
existing_customers = pd.read_excel('data/existing-customers.xlsx')
potential_customers = pd.read_excel('data/potential-customers.xlsx')

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


In [4]:
# Score metric used to rank the classifiers
def ROI(precision, amount):
    """Return the return-on-investment score for a set of positive predictions.

    The score is ``amount * (88 * precision - 25.5 * (1 - precision))``.

    Parameters
    ----------
    precision : float
        Fraction of the positive predictions that are correct.
    amount : int or float
        Number of positive predictions the score is scaled by.

    Returns
    -------
    float
        The weighted per-prediction value multiplied by ``amount``.

    Notes
    -----
    NOTE(review): 88 and 25.5 are presumably the average gain per correctly
    targeted customer and the average loss per wrongly targeted one -- confirm
    against the assignment brief.
    """
    hit_share = precision
    miss_share = 1 - precision
    value_per_prediction = 88 * hit_share - 25.5 * miss_share
    return amount * value_per_prediction

## Premise
Since this assignment is a classification problem, KNN can be used to classify the data. Note, however, that the dimensionality of the data matters a great deal for KNN. While it might at first seem that the dimensionality of the data is too high, it is worth considering that there are a lot of samples (so the ratio of features to samples is not that high) and that some clever feature engineering might be able to reduce the dimensionality of the data.

## Feature Engineering

In [5]:
from sklearn.model_selection import train_test_split

def preprocessing_and_feature_selection(
    train_ratio = 0.70,
    validation_ratio = 0.15,
    test_ratio = 0.15,
    random_state = None,
):
    """Select features from ``existing_customers``, encode them and split the data.

    Parameters
    ----------
    train_ratio : float
        Share of the rows kept for training; the first split holds out
        ``1 - train_ratio`` of the rows.
    validation_ratio : float
        Share of the rows intended for validation.
    test_ratio : float
        Share of the rows intended for testing. The held-out rows are divided
        using ``test_ratio / (test_ratio + validation_ratio)``.
    random_state : int, RandomState instance or None
        Seed forwarded to both ``train_test_split`` calls. The default ``None``
        keeps the original unseeded behaviour; pass an integer to make the
        splits (and every score derived from them) reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_val, x_test, y_train, y_val, y_test)``. The ``y_*``
        values are the encoded label frames and are not flattened here.
    """
    # Do the feature selection
    feature_columns = [
        "age", "education", "education-num", "marital-status", "relationship",
        "race", "sex", "capital-gain", "capital-loss", "hours-per-week",
    ]
    data_x = existing_customers[feature_columns]
    data_y = existing_customers[["class"]]

    # Deal with the NaN entries
    # - By ignoring the variables that contain the NaN entries (they are simply
    #   not part of the selection above).

    # One-hot encode the categorical columns so that they become numeric.
    data_x = pd.get_dummies(data_x)
    # drop_first removes the first class level; assuming `class` is binary this
    # leaves a single indicator column -- TODO confirm.
    data_y = pd.get_dummies(data_y, drop_first=True)

    # Split the data into training and a held-out remainder ...
    x_train, x_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=1 - train_ratio, random_state=random_state
    )
    # ... and divide the remainder into validation and test.
    x_val, x_test, y_val, y_test = train_test_split(
        x_test,
        y_test,
        test_size=test_ratio/(test_ratio + validation_ratio),
        random_state=random_state,
    )

    return x_train, x_val, x_test, y_train, y_val, y_test

# Build the train/validation/test splits with the default 70/15/15 ratios.
x_train, x_val, x_test, y_train, y_val, y_test = preprocessing_and_feature_selection()


In [20]:
# numpy is imported in this cell because no earlier cell imports it; without
# this line a fresh "Restart & Run All" fails here with a NameError.
import numpy as np

# Flatten the label containers to 1-D numpy arrays so that KNN is happy.
y_train = np.ravel(y_train)
y_val = np.ravel(y_val)
y_test = np.ravel(y_test)

In [27]:
# Do a baseline test using the data we have: pick k on the validation split,
# then score the selected classifier once on the test split.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score


best_clf = None
# Start below every possible score so that a classifier is always selected,
# even when no value of k yields a positive ROI.
best_roi = float("-inf")
# Loop over different values of k and see which scores the best
for k in range(1, 20):
    # Create the classifier
    clf = KNeighborsClassifier(n_neighbors=k)

    # Train the classifier
    clf.fit(x_train, y_train)

    # Validate the classifier
    y_pred = clf.predict(x_val)
    precision = precision_score(y_val, y_pred)
    roi = ROI(precision, y_pred.sum())
    # The reported metric is precision (it was previously mislabelled as recall).
    print(f"k={k} Precision: {precision} ROI: {roi}")

    if roi > best_roi:
        best_roi = roi
        best_clf = clf

# Test the classifier
y_pred = best_clf.predict(x_test)
# `precision` is left holding the test-set precision; later cells reuse it.
precision = precision_score(y_test, y_pred)
# Recompute the ROI on the test predictions instead of reporting the value
# left over from the last loop iteration.
roi = ROI(precision, y_pred.sum())
print(f"k={best_clf.n_neighbors} Precision: {precision} ROI: {roi}")

k=1 Recall: 0.6232758620689656 ROI: 53068.64353448277
k=2 Recall: 0.46551724137931033 ROI: 19736.741379310344
k=3 Recall: 0.6293103448275862 ROI: 50427.543103448275
k=4 Recall: 0.5327586206896552 ROI: 29373.206896551725
k=5 Recall: 0.6362068965517241 ROI: 50306.112931034484
k=6 Recall: 0.5698275862068966 ROI: 34826.95818965517
k=7 Recall: 0.6405172413793103 ROI: 51399.39181034482
k=8 Recall: 0.5741379310344827 ROI: 36134.50086206896
k=9 Recall: 0.6267241379310344 ROI: 47595.41681034482
k=10 Recall: 0.5801724137931035 ROI: 36274.262500000004
k=11 Recall: 0.6310344827586207 ROI: 47644.45344827587
k=12 Recall: 0.5801724137931035 ROI: 36112.864224137935
k=13 Recall: 0.6198275862068966 ROI: 45029.8327586207
k=14 Recall: 0.5844827586206897 ROI: 37694.206034482755
k=15 Recall: 0.6206896551724138 ROI: 45802.293103448275
k=16 Recall: 0.5836206896551724 ROI: 37318.70862068966
k=17 Recall: 0.6094827586206897 ROI: 43763.645689655175
k=18 Recall: 0.5732758620689655 ROI: 35807.963362068964
k=19 Reca

### Prediction

In [28]:
# Select and encode the same features that the classifier was trained on.
deploy_x = potential_customers[["age", "education", "education-num", "marital-status", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week"]]
deploy_x = pd.get_dummies(deploy_x)
# get_dummies only creates columns for categories present in this file, so
# align with the training columns: categories missing here become all-zero
# columns, unseen ones are dropped, and the column order matches x_train.
deploy_x = deploy_x.reindex(columns=x_train.columns, fill_value=0)

y_pred = best_clf.predict(deploy_x)
# Number of potential customers predicted as positive.
amount = y_pred.sum()

# The ROI is projected with the precision measured on the test split.
print(f"Amount= {amount} ROI={ROI(precision, amount)}")

Amount= 3879 ROI=169042.05970149254


## Conclusions
The obtained ROI is lower than that of the Decision Tree approach.

## Checking how KNN performs if we just select the features used by the decision tree.

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np

def preprocessing_and_aggressive_feature_selection(
    train_ratio = 0.70,
    validation_ratio = 0.15,
    test_ratio = 0.15,
    random_state = None,
):
    """Encode a three-feature subset of ``existing_customers`` and split it.

    Only ``education-num``, ``marital-status`` and ``capital-gain`` are kept
    as features.

    Parameters
    ----------
    train_ratio : float
        Share of the rows kept for training; the first split holds out
        ``1 - train_ratio`` of the rows.
    validation_ratio : float
        Share of the rows intended for validation.
    test_ratio : float
        Share of the rows intended for testing. The held-out rows are divided
        using ``test_ratio / (test_ratio + validation_ratio)``.
    random_state : int, RandomState instance or None
        Seed forwarded to both ``train_test_split`` calls. The default ``None``
        keeps the original unseeded behaviour; pass an integer to make the
        splits (and every score derived from them) reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_val, x_test, y_train, y_val, y_test)`` where the ``y_*``
        values have been flattened with ``np.ravel``.
    """
    # Do the feature selection
    data_x = existing_customers[["education-num", "marital-status", "capital-gain"]]
    data_y = existing_customers[["class"]]

    # Deal with the NaN entries
    # - By ignoring the variables that contain the NaN entries (they are simply
    #   not part of the selection above).

    # One-hot encode the categorical columns so that they become numeric.
    data_x = pd.get_dummies(data_x)
    # drop_first removes the first class level; assuming `class` is binary this
    # leaves a single indicator column -- TODO confirm.
    data_y = pd.get_dummies(data_y, drop_first=True)

    # Split the data into training and a held-out remainder ...
    x_train, x_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=1 - train_ratio, random_state=random_state
    )
    # ... and divide the remainder into validation and test.
    x_val, x_test, y_val, y_test = train_test_split(
        x_test,
        y_test,
        test_size=test_ratio/(test_ratio + validation_ratio),
        random_state=random_state,
    )

    # Convert the labels to flat numpy arrays so that KNN is happy.
    y_train = np.ravel(y_train)
    y_val = np.ravel(y_val)
    y_test = np.ravel(y_test)
    return x_train, x_val, x_test, y_train, y_val, y_test

# Rebuild the splits with the reduced feature set (default 70/15/15 ratios).
# This rebinds x_train, x_val, x_test, y_train, y_val and y_test, replacing
# the splits created for the earlier experiment.
x_train, x_val, x_test, y_train, y_val, y_test = preprocessing_and_aggressive_feature_selection()


In [7]:
# Find the best k value on the validation split, then score the selected
# classifier once on the test split.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score


best_clf = None
# Start below every possible score so that a classifier is always selected,
# even when no value of k yields a positive ROI.
best_roi = float("-inf")
# Loop over different values of k and see which scores the best
for k in range(1, 20):
    # Create the classifier
    clf = KNeighborsClassifier(n_neighbors=k)

    # Train the classifier
    clf.fit(x_train, y_train)

    # Validate the classifier
    y_pred = clf.predict(x_val)
    precision = precision_score(y_val, y_pred)
    roi = ROI(precision, y_pred.sum())
    # The reported metric is precision (it was previously mislabelled as recall).
    print(f"k={k} Precision: {precision} ROI: {roi}")

    if roi > best_roi:
        best_roi = roi
        best_clf = clf

# Test the classifier
y_pred = best_clf.predict(x_test)
# `precision` is left holding the test-set precision; later cells reuse it.
precision = precision_score(y_test, y_pred)
# Recompute the ROI on the test predictions instead of reporting the value
# left over from the last loop iteration.
roi = ROI(precision, y_pred.sum())
print(f"k={best_clf.n_neighbors} Precision: {precision} ROI: {roi}")

k=1 Recall: 0.4534412955465587 ROI: 22745.854251012148
k=2 Recall: 0.4064777327935223 ROI: 13619.246963562753
k=3 Recall: 0.4785425101214575 ROI: 24809.348987854253
k=4 Recall: 0.4744939271255061 ROI: 23081.01943319838
k=5 Recall: 0.5740890688259109 ROI: 38746.94979757085
k=6 Recall: 0.5676113360323887 ROI: 36860.92064777328
k=7 Recall: 0.5676113360323887 ROI: 36860.92064777328
k=8 Recall: 0.5522267206477732 ROI: 32939.47125506072
k=9 Recall: 0.5854251012145749 ROI: 39840.21376518219
k=10 Recall: 0.5522267206477732 ROI: 32939.47125506072
k=11 Recall: 0.5530364372469636 ROI: 33095.43643724697
k=12 Recall: 0.5506072874493927 ROI: 32739.625506072873
k=13 Recall: 0.5506072874493927 ROI: 32739.625506072873
k=14 Recall: 0.5506072874493927 ROI: 32739.625506072873
k=15 Recall: 0.5538461538461539 ROI: 33475.93846153846
k=16 Recall: 0.5538461538461539 ROI: 33475.93846153846
k=17 Recall: 0.5554655870445344 ROI: 33940.99109311741
k=18 Recall: 0.5522267206477732 ROI: 33125.359919028335
k=19 Recall:

### Prediction

In [9]:
# Select and encode the same reduced feature set that the classifier was trained on.
deploy_x = potential_customers[["education-num", "marital-status", "capital-gain"]]
deploy_x = pd.get_dummies(deploy_x)
# get_dummies only creates columns for categories present in this file, so
# align with the training columns: categories missing here become all-zero
# columns, unseen ones are dropped, and the column order matches x_train.
deploy_x = deploy_x.reindex(columns=x_train.columns, fill_value=0)

y_pred = best_clf.predict(deploy_x)
# Number of potential customers predicted as positive.
amount = y_pred.sum()

# The ROI is projected with the precision measured on the test split.
print(f"Amount= {amount} ROI={ROI(precision, amount)}")
# Previous best: Amount= 3879 ROI=169042.05970149254

Amount= 2956 ROI=110208.72428694903


### Conclusion
It is not better than the previous approach, interestingly enough.