In [21]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

## Day 31 Lecture 2 Assignment

In this assignment, we will learn about the weighting and scaling with the K nearest neighbor algorithm. We will use the acute nephritis dataset loaded below and analyze the model generated for this dataset.

In [22]:
%matplotlib inline

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

import warnings
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings

import ssl

ssl._create_default_https_context = ssl._create_unverified_context

<IPython.core.display.Javascript object>

In [23]:
# columns:
# Temperature of patient { 35C-42C }
# Occurrence of nausea { yes, no }
# Lumbar pain { yes, no }
# Urine pushing (continuous need for urination) { yes, no }
# Micturition pains { yes, no }
# Burning of urethra, itch, swelling of urethra outlet { yes, no }
# decision: Nephritis of renal pelvis origin { yes, no }

cols = [
    "temp",
    "nausea",
    "lumbar_pain",
    "urine_pushing",
    "micturition_pains",
    "burning",
    "nephritis",
]
nephritis = pd.read_csv(
    "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/acute.csv",
    names=cols,
)

<IPython.core.display.Javascript object>

Recall that we need to check for missing data and create dummy variables from the non-numeric columns. Perform both steps below:

In [24]:
for col in nephritis.columns:
    if col != "temp":
        nephritis[col] = (nephritis[col] == "yes").astype(int)

<IPython.core.display.Javascript object>

In [25]:
nephritis

Unnamed: 0,temp,nausea,lumbar_pain,urine_pushing,micturition_pains,burning,nephritis
35,5,0,1,0,0,0,0
35,9,0,0,1,1,1,0
35,9,0,1,0,0,0,0
36,0,0,0,1,1,1,0
36,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
41,4,0,1,1,0,1,1
41,5,0,0,0,0,0,0
41,5,1,1,0,1,0,1
41,5,0,1,1,0,1,1


<IPython.core.display.Javascript object>

Create a train test split with 20% of the data in the test subsample.

In [26]:
X = nephritis.drop(columns=["nephritis"])
y = nephritis["nephritis"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

<IPython.core.display.Javascript object>

Scale only the independent variables using the minmax scaler.

In [27]:
def print_vif(x):
    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(x.values, i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [28]:
print_vif(X)

VIF results
-------------------------------
const                9.130360
temp                 1.069727
nausea               4.079909
lumbar_pain          2.725381
urine_pushing        1.921461
micturition_pains    3.175521
burning              2.012064
dtype: float64
-------------------------------



<IPython.core.display.Javascript object>

In [29]:
transformer = ColumnTransformer(
    [("scale_temp", StandardScaler(), ["temp"])], remainder="passthrough"
)

transformer.fit(X_train)

X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

X_train = pd.DataFrame(X_train, columns=X.columns, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=X.columns, index=y_test.index)

<IPython.core.display.Javascript object>

Create a KNN model for our scaled data with k=5 and report the accuracy score.

In [30]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

<IPython.core.display.Javascript object>

In [31]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(train_score, "\n")
print(test_score)

1.0 

1.0


<IPython.core.display.Javascript object>

In [32]:
# from tqdm.notebook import tqdm

# ks = range(1, 76)

# train_scores = []
# test_scores = []
# for k in tqdm(ks):
#     model = KNeighborsClassifier(n_neighbors=k)
#     model.fit(X_train, y_train)

#     train_score = model.score(X_train, y_train)
#     test_score = model.score(X_test, y_test)

#     train_scores.append(train_score)
#     test_scores.append(test_score)


# plt.plot(ks, train_scores, label="train")
# plt.plot(ks, test_scores, label="test")
# plt.title("Score for Each k")
# plt.xlabel("k")
# plt.ylabel("score")
# plt.legend()
# plt.show()

<IPython.core.display.Javascript object>

When generating a KNN model, we can use the weighted model by setting `weights='distance'`. We can also write our own custom weights function.

Write a custom weight function that assigns the weight of 1/sqrt(distance) and use this function in your model. Report the accuracy score.

Hint: Use the `_get_weights` function in scikit learn as a resource. The code is <a href="https://github.com/scikit-learn/scikit-learn/blob/fdbaa58acbead5a254f2e6d597dc1ab3b947f4c6/sklearn/neighbors/base.py#L63" title="_get_weights">here</a>.

In [33]:
def custom_weight(dist):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return 1 / np.sqrt(dist)


model = KNeighborsClassifier(n_neighbors=10, weights=custom_weight)
model.fit(X_train, y_train)

model = KNeighborsClassifier(n_neighbors=5, weights=custom_weight)
model.fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"train_score: {train_score}")
print(f"test_score: {test_score}")

train_score: 1.0
test_score: 1.0


<IPython.core.display.Javascript object>