In [4]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

# Load the pre-split feature and label tables (paths are relative to the
# notebook's working directory).
X_train = pd.read_csv("dataset/train_features.csv")
y_train = pd.read_csv("dataset/train_labels.csv")
X_test = pd.read_csv("dataset/valid_features.csv")
y_test = pd.read_csv("dataset/valid_labels.csv")

# Integer code assigned to each genre label. The nested {"column": {...}}
# layout is kept so cleanup_nums can still be passed to DataFrame.replace.
cleanup_nums = {"genre":
                {"classic pop and rock": 1,
                 "dance and electronica": 2,
                 "folk":3,
                 "jazz and blues":4,
                 "metal":5,
                 "pop":6,
                 "punk":7,
                 "soul and reggae":8
                }}

# Encode the labels explicitly instead of replace(..., inplace=True):
# replacing strings with ints leaves the final dtype up to pandas' implicit
# object-downcasting (deprecated since pandas 2.2). map + astype gives both
# label frames a real integer "genre" column, and a genre missing from the
# mapping becomes NaN and makes astype raise rather than slipping through
# as a string.
genre_codes = cleanup_nums["genre"]
y_train = y_train.assign(genre=y_train["genre"].map(genre_codes).astype("int"))
y_test = y_test.assign(genre=y_test["genre"].map(genre_codes).astype("int"))

# Learn the TF-IDF vocabulary and weights from the training "tags" column only,
# then apply the same fitted vectorizer to the held-out "tags" column.
# NOTE: X_train / X_test are rebound here from the raw DataFrames to the
# vectorized feature matrices that the model consumes.
cv = TfidfVectorizer(min_df=1, stop_words='english')
X_train = cv.fit_transform(X_train["tags"])
X_test = cv.transform(X_test["tags"])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [2]:
# Make sure the training target is an integer column before fitting.
y_train["genre"] = y_train["genre"].astype('int')

# Fit a 3-nearest-neighbour classifier on the training features and predict
# a genre code for every held-out row.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train["genre"])
preds = knn.predict(X_test)

# Let's see how our model performed: print each sklearn summary under its heading.
true_genres = y_test["genre"]
for heading, make_report in (
    ("Classificaion report: \n", classification_report),
    ("Confusion matrix: \n", confusion_matrix),
):
    print(heading)
    print(make_report(true_genres, preds))

# Blank lines separating this output from whatever is printed next.
print("\n\n\n")

def evaluate(pred, true):
    """Score predicted labels against the true labels with sklearn.metrics.

    Args:
        pred: Predicted labels.
        true: Ground-truth labels, in the same order as ``pred``.

    Returns:
        A 3-tuple ``(confusion_matrix, accuracy, prf)`` where ``prf`` is the
        raw result of ``metrics.precision_recall_fscore_support`` (callers
        index it as 0 = precision, 1 = recall, 2 = F1).
    """
    # Every sklearn metric takes the ground truth first, predictions second.
    return (
        metrics.confusion_matrix(true, pred),
        metrics.accuracy_score(true, pred),
        metrics.precision_recall_fscore_support(true, pred),
    )

# Score the predictions with the helper, then cross-check sklearn's accuracy
# with a hand-computed one.
CM, Acc, precf1 = evaluate(preds, y_test["genre"])

# Count matches by position. Indexing y_test["genre"][i] is a label lookup,
# which only lines up with preds[i] while y_test keeps a default 0..n-1 index;
# comparing against the underlying array is positional regardless of the index
# and avoids a Python-level loop.
correct = int((preds == y_test["genre"].to_numpy()).sum())

print("Confusion Matrix:\n{}\naccuracy: {}\naccuracy by sklearn.metric: {}\nprecision: {}\nrecall: {}\nF1: {}\n".format(
                                                CM,
                                                correct / len(y_test),
                                                Acc,
                                                precf1[0],
                                                precf1[1],
                                                precf1[2]))

Classificaion report: 

              precision    recall  f1-score   support

           1       0.19      0.60      0.29        55
           2       0.08      0.04      0.06        45
           3       0.38      0.36      0.37        64
           4       0.20      0.02      0.04        44
           5       0.61      0.17      0.26        66
           6       0.92      0.92      0.92        74
           7       0.68      0.59      0.63        44
           8       0.43      0.41      0.42        58

    accuracy                           0.42       450
   macro avg       0.44      0.39      0.37       450
weighted avg       0.47      0.42      0.40       450

Confusion matrix: 

[[33  6  3  0  0  0  1 12]
 [22  2 10  2  1  0  2  6]
 [27  3 23  0  3  1  0  7]
 [28  3  7  1  2  0  0  3]
 [25  8  8  2 11  3  5  4]
 [ 1  0  1  0  0 68  4  0]
 [15  1  0  0  0  2 26  0]
 [23  2  8  0  1  0  0 24]]




Confusion Matrix:
[[33  6  3  0  0  0  1 12]
 [22  2 10  2  1  0  2  6]
 [27  3 23  