In [1]:
# Import libraries
import math
import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Read the CSV and shuffle its rows in one step; the fixed random_state keeps
# the row order identical between runs.
df = shuffle(pd.read_csv("data baru 50.csv"), random_state=1)
df.head()

Unnamed: 0,no,playlist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
547,548,5,0.906,0.798,1,0.858692,1,0.0716,0.0431,0.000847,0.0474,0.741,0.434564,1
754,755,7,0.302,0.0134,1,0.306938,1,0.0418,0.991,0.925,0.129,0.433,0.395555,1
1283,1284,12,0.523,0.909,1,0.896943,1,0.0441,0.000459,0.00185,0.381,0.669,0.554767,1
1128,1129,11,0.803,0.47,1,0.79269,1,0.345,0.505,0.0,0.0887,0.793,0.407079,1
1063,1064,10,0.733,0.744,1,0.941361,1,0.0309,0.161,0.0,0.0389,0.667,0.434416,1


In [2]:
# Feature frame: every column except 'no' and 'playlist'
# ('playlist' is used as the classification target).
data = df.drop(['no', 'playlist'], axis=1)
data.head()

KeyError: "['genre'] not found in axis"

In [3]:
# Classification target: the 'playlist' column of the shuffled frame.
target = df['playlist']
target.head()

547      5
754      7
1283    12
1128    11
1063    10
Name: playlist, dtype: int64

In [4]:
# Shared scaler instance; the cross-validation cell fits it on each training
# fold and applies the fitted scaling to the matching test fold.
min_max_scaler = MinMaxScaler()

In [5]:
def euclideanDistance(instance1, instance2, length):
    """Euclidean distance between two instances over their first `length` coordinates.

    Args:
        instance1: indexable sequence of numbers.
        instance2: indexable sequence of numbers.
        length: how many leading coordinates to include.

    Returns:
        float: square root of the summed squared coordinate differences.
    """
    squared_total = 0
    for idx in range(length):
        delta = instance1[idx] - instance2[idx]
        squared_total += delta ** 2
    return math.sqrt(squared_total)

def getWeights(trainingSet, testInstance, trainingTarget, k, validities):
    """Return label and vote weight for the k training points nearest to testInstance.

    Each training point's weight is ``validities[x] * 1 / (distance + 0.5)``,
    where distance is the Euclidean distance to ``testInstance``.

    Args:
        trainingSet: indexable collection of feature vectors (features only).
        testInstance: feature vector to classify (features only).
        trainingTarget: label of each training point, aligned with trainingSet.
        k: number of nearest neighbours to keep.
        validities: validity of each training point, aligned with trainingSet.

    Returns:
        list of ``[label, weight]`` pairs ordered by increasing distance;
        at most k entries.
    """
    # Labels arrive separately in trainingTarget, so the instances hold only
    # features and every coordinate must take part in the distance. (The old
    # `len(testInstance) - 1` silently dropped the last feature.)
    length = len(testInstance)
    distances = []
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        # The +0.5 keeps the weight finite when the distance is zero.
        weight = validities[x] * (1 / (dist + 0.5))
        distances.append((trainingTarget[x], dist, weight))
    distances.sort(key=operator.itemgetter(1))
    # Slicing (rather than indexing 0..k-1) tolerates k > len(trainingSet).
    return [[entry[0], entry[2]] for entry in distances[:k]]

def getNeighbors(trainingSet, testInstance, trainingTarget, k):
    """Return the labels of the k training points nearest to testInstance.

    Args:
        trainingSet: indexable collection of feature vectors (features only).
        testInstance: feature vector to compare against (features only).
        trainingTarget: label of each training point, aligned with trainingSet.
        k: number of nearest neighbours to return.

    Returns:
        list of labels ordered by increasing Euclidean distance; at most k
        entries.
    """
    # Labels arrive separately in trainingTarget, so the instances hold only
    # features and every coordinate must take part in the distance. (The old
    # `len(testInstance) - 1` silently dropped the last feature.)
    length = len(testInstance)
    distances = []
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingTarget[x], dist))
    distances.sort(key=operator.itemgetter(1))
    # Slicing (rather than indexing 0..k-1) tolerates k > len(trainingSet).
    return [entry[0] for entry in distances[:k]]

def getResponse(weights):
    """Pick the label with the largest total vote weight.

    Args:
        weights: sequence of ``[label, weight]`` pairs, as produced by
            getWeights.

    Returns:
        The label whose summed weight is largest. On a tie the label that
        appears first in `weights` wins. For an empty `weights` the historical
        placeholder 0 is returned.
    """
    if len(weights) == 0:
        # No neighbours to vote; keep the value earlier versions returned.
        return 0
    weightVotes = {}
    for entry in weights:
        response = entry[0]
        weightVotes[response] = weightVotes.get(response, 0) + entry[1]
    # Take the maximum over the actual votes. The previous "value > 0" scan
    # returned the placeholder 0 (not a neighbour's label) whenever every vote
    # total was zero, e.g. when all neighbours have validity 0. max() returns
    # the first maximal key in insertion order, so ties resolve as before.
    return max(weightVotes, key=weightVotes.get)

def getAccuracy(testSet, predictions):
    """Percentage of test rows whose last element equals the matching prediction.

    Args:
        testSet: indexable collection of rows; the last element of each row is
            compared against the prediction at the same position.
        predictions: predicted values, aligned with testSet.

    Returns:
        float: share of matching rows, scaled to 0-100.
    """
    total = len(testSet)
    hits = sum(1 for idx in range(total) if testSet[idx][-1] == predictions[idx])
    return (hits / float(total)) * 100.0

def getValidities(trainingSet, trainingTarget, h):
    """Compute the validity of every training point.

    The validity of point i is the fraction of its h nearest neighbours (taken
    from the other training points) that carry the same label as point i.

    Args:
        trainingSet: array-like of feature vectors.
        trainingTarget: label of each training point, aligned with trainingSet.
        h: number of neighbours examined per training point.

    Returns:
        list of floats, one validity per training point, in trainingSet order.
    """
    validities = []
    for i in range(len(trainingSet)):
        # Leave point i out of BOTH the features and the labels. Removing it
        # from the features alone shifts every later row by one, pairing each
        # neighbour with the label of the preceding training point.
        other_points = np.delete(trainingSet, i, 0)
        other_targets = np.delete(trainingTarget, i, 0)
        neighbors = getNeighbors(other_points, trainingSet[i], other_targets, h)
        same_label = sum(1 for label in neighbors if label == trainingTarget[i])
        validities.append(same_label / h)
    return validities

def cm_analysis(y_true, y_pred, labels, ymap=None, figsize=(10,10)):
    """
    Draw a confusion matrix as an annotated heatmap.

    Diagonal cells show the row percentage plus "count/row total"; other
    non-empty cells show the row percentage plus the count. The function only
    draws: it creates a new figure and leaves it as the current matplotlib
    figure, so the caller can save it (e.g. with plt.savefig) afterwards.
    args:
      y_true:    true label of the data, with shape (nsamples,)
      y_pred:    prediction of the data, with shape (nsamples,)
      labels:    array, the order of class labels in the confusion matrix,
                 with shape (nclass,). use `clf.classes_` if using
                 scikit-learn models. If None, the sorted labels found in
                 y_true and y_pred are used.
      ymap:      dict: any -> string, length == nclass.
                 if not None, map the labels & ys to more understandable strings.
                 Caution: original y_true, y_pred and labels must align.
      figsize:   the size of the figure plotted.
    returns: None
    """
    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        if labels is not None:
            labels = [ymap[yi] for yi in labels]
    if labels is None:
        # Without explicit labels the DataFrame below would fall back to
        # positional 0..n-1 ticks; derive the real class labels instead.
        labels = np.unique(np.concatenate((np.asarray(y_true), np.asarray(y_pred))))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    # A class with no true samples has a zero row sum; report 0% for that row
    # instead of dividing by zero.
    cm_perc = np.divide(
        cm,
        cm_sum.astype(float),
        out=np.zeros(cm.shape, dtype=float),
        where=cm_sum != 0,
    ) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # Index the scalar: cm_sum keeps a trailing axis of length 1,
                # and '%d' on a 1-element array is deprecated in NumPy.
                s = cm_sum[i, 0]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(cm, annot=annot, fmt='', ax=ax)

In [6]:
# MKNN process: 10-fold cross-validation of the Modified k-NN classifier on
# min-max scaled features reduced to two PCA components.
X = data.values
y = target.values
kf = KFold(n_splits=10)
pca = PCA(n_components=2)
k = 3  # neighbours that vote on each test sample (getWeights)
h = 3  # neighbours used for each training sample's validity (getValidities)
fold = 0
accuracy_total = 0
# np.sort returns the sorted labels. ndarray.sort() sorts in place and returns
# None, which would pass labels=None to cm_analysis.
class_labels = np.sort(target.unique())

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training fold only, then apply it to the test fold.
    X_train_scale = min_max_scaler.fit_transform(X_train)
    X_test_scale = min_max_scaler.transform(X_test)

    # Fit PCA on the training fold only and project the test fold with the
    # SAME components. Refitting on the test fold would place the test points
    # on a different basis than the training points they are compared with.
    X_train_pca = pca.fit_transform(X_train_scale)
    X_test_pca = pca.transform(X_test_scale)

    # Per-fold table written to CSV: row position, PCA components, label.
    index = np.hstack((train_index, test_index))
    index = np.reshape(index, (len(index), 1))
    X_pca = np.vstack((X_train_pca, X_test_pca))
    Y_pca = np.hstack((y_train, y_test))
    Y_pca = np.reshape(Y_pca, (len(Y_pca), 1))
    data_pca = np.hstack((index, X_pca, Y_pca))

    predictions=[]

    fold += 1
    np.savetxt('Klasifikasi Data/KNN MKNN dengan PCA/Data PCA MKNN Fold ' + str(fold) + '.csv', data_pca, delimiter=",")
    print('KFold ' + str(fold))
    print('======================================')

    validities = getValidities(X_train_pca, y_train, h)

    for i in range(len(X_test_pca)):
        weights = getWeights(X_train_pca, X_test_pca[i], y_train, k, validities)
        result = getResponse(weights)
        predictions.append(result)

        print('> ' + str(i + 1) + ' predicted = ' + repr(result) + ', actual = ' + repr(y_test[i]))

    accuracy = accuracy_score(y_test, predictions)
    accuracy_total += accuracy
    cm_analysis(y_test, predictions, class_labels, ymap=None, figsize=(10,10))
    # cm_analysis leaves its figure as the current one, so savefig writes it.
    plt.savefig('Klasifikasi Data/Gambar Confussion Matrix dengan PCA/Confussion Matrix MKNN Fold ' + str(fold) + '.jpg')

    print('---------------------------------------')
    print('KFold ' + str(fold) + ', Accuracy ' + str(accuracy))
    print('Klasifikasi Data/KNN MKNN dengan PCA/Data PCA MKNN Fold ' + str(fold) + '.csv')
    print('\n')

print('======================================')
print('Average Accuracy = ' + str(accuracy_total/fold))
print('======================================')
print('\n')

NameError: name 'data' is not defined