In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

# --- Load the train / validation splits --------------------------------------
X_train = pd.read_csv("dataset/train_features.csv")
y_train = pd.read_csv("dataset/train_labels.csv")
X_test = pd.read_csv("dataset/valid_features.csv")
y_test = pd.read_csv("dataset/valid_labels.csv")

# --- Encode the genre names as integer class ids -----------------------------
# Nested mapping in the form DataFrame.replace expects: {column: {old: new}}.
cleanup_nums = {"genre":
                {"classic pop and rock": 1,
                 "dance and electronica": 2,
                 "folk": 3,
                 "jazz and blues": 4,
                 "metal": 5,
                 "pop": 6,
                 "punk": 7,
                 "soul and reggae": 8
                }}

# Reassign instead of using inplace=True so the cell reads as a plain
# input -> output transformation.
y_train = y_train.replace(cleanup_nums)
y_test = y_test.replace(cleanup_nums)

# --- Drop the columns that are not used as model features --------------------
# One shared list guarantees train and validation lose exactly the same columns.
DROPPED_COLUMNS = ["trackID", "loudness", "title", "tempo", "time_signature",
                   "key", "mode", "duration", "tags"]

# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and raises TypeError on pandas 2.x.
X_train = X_train.drop(columns=DROPPED_COLUMNS)
X_test = X_test.drop(columns=DROPPED_COLUMNS)

# --- Scale every remaining feature with min-max scaling ----------------------
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the validation data leak into preprocessing.
# NOTE(review): assumes every remaining column is numeric - confirm.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Unnamed: 0,trackID,title,tags,loudness,tempo,time_signature,key,mode,duration,vect_1,...,vect_139,vect_140,vect_141,vect_142,vect_143,vect_144,vect_145,vect_146,vect_147,vect_148
0,8424,The Calamity [Explicit],"the, you, to, a, not, in, of, that, are, we, w...",-2.824,131.969,4,0,1,298.13506,52.050795,...,0.001097,0.001161,0.001225,0.001251,0.001238,0.001202,0.001130,0.001040,0.104525,0.335818
1,7923,Try To Find Me,"i, the, you, to, and, a, me, it, not, in, my, ...",-11.249,136.510,4,4,1,248.03220,38.488583,...,0.000117,0.000134,0.000147,0.000161,0.000157,0.000126,0.000121,0.000134,0.124080,1.553742
2,2314,Poor Old Dirt Farmer,"the, to, and, a, in, is, on, all, no, have, bu...",-6.932,72.878,4,7,1,231.00036,45.424130,...,0.000962,0.001203,0.001214,0.001152,0.001025,0.000933,0.000838,0.000867,0.311885,7.835358
3,810,Texas Flood,"i, the, you, to, and, a, me, it, not, in, my, ...",-12.551,75.381,3,6,1,390.29506,42.546616,...,0.000167,0.000183,0.000198,0.000205,0.000203,0.000197,0.000183,0.000164,0.115680,0.395675
4,439,Chopsticks Medley,"i, to, and, me, love, but, with, oh, they, she...",-14.566,134.331,4,5,0,286.11873,36.161980,...,0.000084,0.000089,0.000094,0.000100,0.000098,0.000089,0.000083,0.000078,0.186119,1.472794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7673,6419,Stop That Train,"i, the, to, and, a, it, not, my, that, do, am,...",-10.798,146.624,1,7,1,236.48608,40.258501,...,0.001282,0.001359,0.001422,0.001447,0.001430,0.001438,0.001437,0.001474,0.184104,2.765411
7674,6811,In A Close Encounter,"i, the, to, a, in, my, of, so, with, they, see...",-3.759,153.760,4,6,1,171.78077,51.767393,...,0.000588,0.000628,0.000633,0.000659,0.000643,0.000619,0.000562,0.000526,0.075420,0.216842
7675,1474,Our Lady Of Sorrow (LP Version),"i, the, you, to, and, a, not, in, is, of, that...",-11.883,98.596,4,10,1,255.58159,40.285352,...,0.000163,0.000159,0.000164,0.000173,0.000172,0.000164,0.000154,0.000163,0.169215,1.786019
7676,8343,Section X,"i, the, you, to, and, a, me, in, my, is, of, y...",-3.518,101.639,4,4,0,547.94404,52.670728,...,0.000924,0.000991,0.000968,0.000987,0.000976,0.000925,0.000884,0.000850,0.181243,2.548783


In [2]:
# Make sure the training labels are plain integers before fitting.
y_train["genre"] = y_train["genre"].astype('int')

# Fit a 10-nearest-neighbours classifier on the training split and predict
# the genre of every validation sample (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train["genre"])
preds = knn.predict(X_test)

# Let's see how our model performed.
# sep="\n" reproduces the line break that separate print() calls would emit.
print("Classificaion report: \n",
      classification_report(y_test["genre"], preds),
      sep="\n")
print("Confusion matrix: \n",
      confusion_matrix(y_test["genre"], preds),
      sep="\n")

print("\n\n\n")

def evaluate(pred, true):
    """Score a set of predictions against the ground-truth labels.

    Args:
        pred: Predicted labels.
        true: Ground-truth labels, in the same order as ``pred``.

    Returns:
        A 3-tuple ``(confusion_matrix, accuracy, prf)`` holding the results of
        ``metrics.confusion_matrix``, ``metrics.accuracy_score`` and
        ``metrics.precision_recall_fscore_support`` respectively, each called
        as ``f(true, pred)``.
    """
    return (
        metrics.confusion_matrix(true, pred),                 # confusion matrix
        metrics.accuracy_score(true, pred),                   # accuracy
        metrics.precision_recall_fscore_support(true, pred),  # precision, recall, F1
    )

# Score the validation predictions with the sklearn-based helper.
CM, Acc, precf1 = evaluate(preds, y_test["genre"])

# Hand-computed accuracy, printed next to sklearn's value as a sanity check.
# Predictions and labels are paired by position: `preds` follows the row order
# of the validation data, whereas a lookup such as y_test["genre"][i] goes by
# index *label* and is only right while y_test keeps its default 0..n-1 index.
correct = sum(int(p == t) for p, t in zip(preds, y_test["genre"]))

# precf1 is indexed as [0] precision, [1] recall, [2] F1 (one value per class).
print("Confusion Matrix:\n{}\naccuracy: {}\naccuracy by sklearn.metric: {}\nprecision: {}\nrecall: {}\nF1: {}\n".format(
                                                CM,
                                                correct / len(y_test),
                                                Acc,
                                                precf1[0],
                                                precf1[1],
                                                precf1[2]))

Classificaion report: 

              precision    recall  f1-score   support

           1       0.23      0.58      0.33        55
           2       0.71      0.11      0.19        45
           3       0.22      0.41      0.28        64
           4       0.33      0.11      0.17        44
           5       0.93      0.76      0.83        66
           6       0.40      0.14      0.20        74
           7       0.51      0.43      0.47        44
           8       0.69      0.66      0.67        58

    accuracy                           0.41       450
   macro avg       0.50      0.40      0.39       450
weighted avg       0.50      0.41      0.40       450

Confusion matrix: 

[[32  0 16  4  0  0  0  3]
 [15  5  9  0  1  6  2  7]
 [33  0 26  1  0  2  1  1]
 [11  0 21  5  0  0  3  4]
 [ 2  1  0  0 50  1 12  0]
 [26  0 33  3  0 10  0  2]
 [16  0  5  0  3  1 19  0]
 [ 3  1  9  2  0  5  0 38]]




Confusion Matrix:
[[32  0 16  4  0  0  0  3]
 [15  5  9  0  1  6  2  7]
 [33  0 26  