In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the model inputs from the ml-25m data directory (paths are relative to the
# notebook's working directory):
#   x - feature table (presumably PCA-reduced features, judging by the file name - TODO confirm)
#   y - target table (presumably a single rating-label column - TODO confirm)
x = pd.read_csv('ml-25m/pca_final.csv')
y = pd.read_csv('ml-25m/rating_only.csv')

In [None]:
#split data into training and testing (80 % / 20 %, fixed seed for reproducibility)
# The target is flattened to 1-D: a one-column DataFrame yields an (n, 1) array, and
# scikit-learn classifiers warn (DataConversionWarning) when fitted on a column vector.
# NOTE(review): assumes rating_only.csv holds exactly one label column - confirm.
x_train, x_test, y_train, y_test = train_test_split(
    x.values, y.values.ravel(), test_size=0.2, random_state=42
)

In [None]:
#create KNN model
# Uses 5 neighbours; every other hyper-parameter is left at the scikit-learn default.
knn = KNeighborsClassifier(n_neighbors=5)


In [None]:
#fit the model on the training split only; the test split is kept for evaluation
knn.fit(x_train, y_train)


In [None]:
#evaluate the model: predict a label for every row of the held-out test split
y_pred = knn.predict(x_test)

In [None]:
# Per-class precision / recall / F1 on the test split, rendered as plain text.
report_text = classification_report(y_test, y_pred)
print(report_text)


In [None]:
#confusion matrix
# Computed once and drawn once: the matrix was previously evaluated twice and rendered with
# both plt.matshow and sns.heatmap on the same axes, the heatmap painting over the matshow image.
cm = confusion_matrix(y_test, y_pred)
# Tick labels for the three classes, in the order the matrix rows/columns are expected to follow.
# NOTE(review): assumes the sorted class labels correspond to bad / good / ok - confirm.
class_names = ['bad', 'good', 'ok']

cm_fig, cm_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_fig.savefig('confusion_matrix.png')

In [None]:
# Distinct class labels found in y (sorted) and the number of samples carrying each label.
unique, counts = np.unique(y, return_counts=True)


In [None]:
# Display the distinct class labels.
unique

In [None]:
# Display the per-class sample counts (same order as `unique`) to check class balance.
counts

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

In [None]:
y = label_binarize(y, classes=unique)
n_classes = len(unique)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# One-vs-rest wrapper around a 5-neighbour KNN: one binary classifier per indicator column.
clf = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5))
clf.fit(x_train, y_train)
# Per-class scores for the test rows; column i is matched with indicator column i of the targets.
y_score = clf.predict_proba(x_test)

In [None]:
# Per-class ROC curves: column i of the binarised test targets against column i of the scores.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

roc_fig, roc_ax = plt.subplots(figsize=(10, 10))
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic')
# Plot of a ROC curve for each class
for i in range(n_classes):
    roc_ax.plot(fpr[i], tpr[i], label='ROC curve for class %s (area = %0.2f)' % (unique[i], roc_auc[i]))

roc_ax.legend(loc="lower right")
# Save before showing: once plt.show() has displayed the figure, a following plt.savefig()
# can end up writing a new, empty figure instead of the plot.
roc_fig.savefig('roc_curveKNN.png')
plt.show()


In [None]:
def run_knn(x, y, k, metric, class_names=('bad', 'good', 'ok'), output_dir='knn'):
    """Train and evaluate a k-nearest-neighbours classifier for one (k, metric) setting.

    The data is split 80/20 with a fixed seed, a KNN classifier is fitted on the
    training part only, and three artefacts are produced for the test part:
    a text classification report, a confusion-matrix heatmap and a one-vs-rest
    ROC plot with one curve per class.

    Parameters
    ----------
    x : array-like or DataFrame
        Feature matrix, one row per sample.
    y : array-like or single-column DataFrame
        Class label of every sample (labels, not a one-hot / indicator matrix).
    k : int
        Number of neighbours.
    metric : str
        Distance metric name passed to ``KNeighborsClassifier``.
    class_names : sequence of str or None, optional
        Tick labels for the confusion matrix, in sorted-label order. ``None``
        uses the labels found in ``y``.
    output_dir : str or path-like, optional
        Directory that receives the report and the two PNG files; created if missing.

    Returns
    -------
    None
        Results are printed, displayed and written to ``output_dir``.

    Raises
    ------
    ValueError
        If ``y`` is not a 1-D label vector (or a single column).
    """
    # Normalise the target to a 1-D label vector. A single-column frame/array is flattened
    # (a column vector makes the classifier emit a DataConversionWarning); anything wider
    # cannot be scored by confusion_matrix, so it is rejected up front with a clear message.
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()
    if y.ndim != 1:
        raise ValueError(
            'run_knn expects y to hold one class label per sample; got an array of shape %s '
            '(was y replaced by its label_binarize() output?)' % (y.shape,)
        )

    # Portable output location instead of hard-coded Windows separators in string literals.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    tag = '%s%s' % (metric, k)

    classes = np.unique(y)
    tick_labels = list(class_names) if class_names is not None else list(classes)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Fit on the training split only: fitting on the full data would let the model see the
    # test rows and inflate every metric reported below.
    knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)

    # --- classification report (console + file) ---
    report = classification_report(y_test, y_pred)
    print(report)
    with open(out_dir / ('reportKNN%s.txt' % tag), 'w') as f:
        print(report, file=f)

    # --- confusion matrix ---
    # labels=classes keeps the matrix aligned with the tick labels even if a class is
    # missing from the test split.
    cm = confusion_matrix(y_test, y_pred, labels=classes)
    cm_fig, cm_ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels, ax=cm_ax)
    cm_ax.set_title('Confusion Matrix for KNN metric: ' + str(metric) + ' and K: ' + str(k))
    cm_fig.savefig(out_dir / ('confusion_matrixKNN%s.png' % tag))
    plt.show()
    plt.close(cm_fig)  # the function is called in loops; do not accumulate open figures

    # --- one-vs-rest ROC curves ---
    # Binarise the already-split labels so the ROC analysis uses exactly the same
    # train/test rows as the metrics above.
    y_train_bin = label_binarize(y_train, classes=classes)
    y_test_bin = label_binarize(y_test, classes=classes)
    n_classes = len(classes)

    clf = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=k, metric=metric))
    y_score = clf.fit(x_train, y_train_bin).predict_proba(x_test)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    roc_fig, roc_ax = plt.subplots(figsize=(10, 10))
    roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    roc_ax.set_xlim(0, 1)
    roc_ax.set_ylim(0, 1.05)
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver operating characteristic for KNN metric: ' + str(metric) + ' and K: ' + str(k))
    # Plot of a ROC curve for each class
    for i in range(n_classes):
        roc_ax.plot(fpr[i], tpr[i], label='ROC curve for class %s (area = %0.2f)' % (classes[i], roc_auc[i]))

    roc_ax.legend(loc="lower right")
    roc_fig.savefig(out_dir / ('roc_curveKNN%s.png' % tag))
    plt.show()
    plt.close(roc_fig)

    
    

In [None]:
# Sweep the neighbourhood size with the Euclidean metric.
for i in (5, 10, 20):
    run_knn(x, y, k=i, metric='euclidean')

In [None]:
# Sweep the neighbourhood size with the Manhattan (city-block) metric.
for i in (5, 10, 20):
    run_knn(x, y, k=i, metric='manhattan')

In [None]:
# Sweep the neighbourhood size with the Minkowski metric.
# NOTE(review): run_knn does not set the power parameter p, and with scikit-learn's default
# p=2 the Minkowski distance equals the Euclidean one - these runs are expected to
# reproduce the 'euclidean' results. Confirm whether a different p was intended.
for i in (5, 10, 20):
    run_knn(x, y, k=i, metric='minkowski')