In [203]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import json

In [204]:
with open('../../components/classification/ind_to_name.json', 'r') as f:
    ind_to_name = json.load(f)
with open('../../components/classification/categories.json', 'r') as f:
    categories = json.load(f)

In [205]:
X = np.load('../../datasets/classification/numpy/X.npy')
y = np.load('../../datasets/classification/numpy/y.npy')

In [206]:
values, counts = np.unique(y, return_counts=True)
mapping = { values[i]:counts[i] for i in range(len(values))}

### train test split

In [207]:
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(X, y, np.arange(len(y)), test_size=0.25, random_state=42)

In [208]:
train_mask = np.zeros(len(y), dtype=bool)
test_mask = np.zeros(len(y), dtype=bool)

train_mask[train_indices] = True
test_mask[test_indices] = True

### knn

In [209]:
knn = KNeighborsClassifier()

In [210]:
knn.fit(X_train, y_train)

In [211]:
y_pred = knn.predict(X_test)

In [212]:
accuracy_score(y_test, y_pred)

0.6340956340956341

In [213]:
knn.predict_proba(X_test)

array([[0. , 0. , 0.8, 0. , 0. , 0.2],
       [0.8, 0. , 0.2, 0. , 0. , 0. ],
       [0. , 0.8, 0. , 0. , 0. , 0.2],
       ...,
       [0. , 0. , 0. , 0.2, 0.6, 0.2],
       [0. , 0. , 1. , 0. , 0. , 0. ],
       [0. , 0. , 0.8, 0. , 0. , 0.2]])

### gcn

In [214]:
from torch_geometric.nn import GCNConv
from torch_geometric.utils import to_dense_adj
import torch
import torch.nn.functional as F
from torch.nn import Linear

In [215]:
data = torch.load('../../datasets/classification/pt/data.pt')

In [216]:
data.y = torch.from_numpy(y)

In [217]:
data.num_features = data.x.shape[1]

In [218]:
data.train_mask = train_mask
data.test_mask = test_mask

In [219]:
data.num_classes = len(torch.unique(data.y))

In [220]:
data

Data(edge_index=[2, 20882], num_nodes=1922, x=[1922, 15], y=[1922], num_features=15, train_mask=[1922], test_mask=[1922], num_classes=6)

In [221]:
data.x = data.x.to(torch.float32)

In [222]:
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_features, num_classes):
        super().__init__() ## initialize the base class (i.e. torch nn module)
        self.conv1 = GCNConv(num_features, hidden_features) # convolutional layer - performs message passing, embeddings from 15 features to 8
        self.conv2 = GCNConv(hidden_features, num_classes)  # convolutional layer - performs message passing, embeddings from 8 features to 6

    def forward(self, data):
        x, edge_index = data.x, data.edge_index # get the feature mat and adjacency mat
        h = self.conv1(x, edge_index)           # compute the first convolutional layer; 15 -> 8
        h = F.relu(h)                           # applies max(0, x) for each element in the matrix, essentially replaces all negative values by 0
        h = self.conv2(h, edge_index)           # transforms from 8 -> 6
        z = h.argmax(dim=1)                     # find the index w/ highest classifcation
        
        return h, z

In [223]:
def train(model, data):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # update the model parameters
    model.train()                                              # set it to training mode
    logits, _ = model(data)                                    # compute the final embeddings
    
    loss = F.cross_entropy(logits[data.train_mask], data.y[data.train_mask]) # compute the loss on the training mask
    optimizer.zero_grad()                                      # clear gradient
    loss.backward()                                            # back propagation
    optimizer.step()                                           # updates the model's parameters
    
    return loss.item()

In [224]:
def get_degree(data, i):
    row_sums = torch.sum(to_dense_adj(data.edge_index), dim=1)
    return row_sums.flatten()[i].item()

In [225]:
def test(model, data, verbose=False):
    model.eval()
    _, pred = model(data)

    if verbose:
        incorrect_indices = (pred != data.y).nonzero(as_tuple=True)[0]
        test_mask_indices = incorrect_indices[data.test_mask[incorrect_indices]]
        print("Incorrect Predictions Indices:", test_mask_indices.tolist())
        for i in test_mask_indices:
            print(f"Name: {ind_to_name[str(i.item())]}\n\tPredicted: {categories[str(pred[i].item())]}\n\tTrue: {categories[str(data.y[i].item())]}\n\tDegree: {get_degree(data, i.item())}")
    
    acc = (pred[data.test_mask] == data.y[data.test_mask]).sum().item() / data.test_mask.sum().item()
    return acc

In [226]:
model = GCN(data.num_features, 8, data.num_classes)

In [227]:
data.y.dtype

torch.int64

In [228]:
del model
model = GCN(data.num_features, 8, data.num_classes)
for epoch in range(0, 10000):
    loss = train(model, data)
    if epoch % 100 == 0:
        acc = test(model, data)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Accuracy: {acc:.4f}')

Epoch: 000, Loss: 1.9438, Accuracy: 0.1663
Epoch: 100, Loss: 1.7114, Accuracy: 0.2557
Epoch: 200, Loss: 1.6459, Accuracy: 0.4595
Epoch: 300, Loss: 1.5585, Accuracy: 0.5613
Epoch: 400, Loss: 1.4593, Accuracy: 0.5967
Epoch: 500, Loss: 1.3551, Accuracy: 0.6383
Epoch: 600, Loss: 1.2517, Accuracy: 0.6590
Epoch: 700, Loss: 1.1531, Accuracy: 0.6881
Epoch: 800, Loss: 1.0608, Accuracy: 0.7152
Epoch: 900, Loss: 0.9772, Accuracy: 0.7339
Epoch: 1000, Loss: 0.9027, Accuracy: 0.7505
Epoch: 1100, Loss: 0.8371, Accuracy: 0.7672
Epoch: 1200, Loss: 0.7790, Accuracy: 0.7796
Epoch: 1300, Loss: 0.7270, Accuracy: 0.7817
Epoch: 1400, Loss: 0.6811, Accuracy: 0.7963
Epoch: 1500, Loss: 0.6402, Accuracy: 0.8025
Epoch: 1600, Loss: 0.6034, Accuracy: 0.8170
Epoch: 1700, Loss: 0.5712, Accuracy: 0.8274
Epoch: 1800, Loss: 0.5439, Accuracy: 0.8358
Epoch: 1900, Loss: 0.5201, Accuracy: 0.8358
Epoch: 2000, Loss: 0.4997, Accuracy: 0.8420
Epoch: 2100, Loss: 0.4827, Accuracy: 0.8441
Epoch: 2200, Loss: 0.4679, Accuracy: 0.848

In [231]:
test(model, data, True)

Incorrect Predictions Indices: [184, 192, 210, 212, 231, 237, 250, 251, 254, 331, 366, 398, 415, 494, 495, 514, 518, 529, 538, 544, 585, 602, 607, 610, 677, 700, 741, 855, 886, 965, 1052, 1133, 1196, 1208, 1259, 1283, 1293, 1356, 1417, 1440, 1456, 1458, 1501, 1511, 1540, 1559, 1618, 1619, 1625, 1658, 1664, 1730, 1796, 1862]
Name: Teyana Taylor
	Predicted: hip hop
	True: pop
	Degree: 18.0
Name: Allie X
	Predicted: uk pop
	True: pop
	Degree: 5.0
Name: Whitney Houston
	Predicted: hip hop
	True: pop
	Degree: 7.0
Name: DJ Jazzy Jeff & The Fresh Prince
	Predicted: filmi
	True: hip hop
	Degree: 1.0
Name: Matt Simons
	Predicted: pop
	True: uk pop
	Degree: 1.0
Name: Zak Abel
	Predicted: pop
	True: uk pop
	Degree: 8.0
Name: Robinson
	Predicted: electro house
	True: uk pop
	Degree: 4.0
Name: Tobtok
	Predicted: uk pop
	True: electro house
	Degree: 18.0
Name: M-22
	Predicted: pop
	True: uk pop
	Degree: 17.0
Name: Rich The Kid
	Predicted: pop
	True: hip hop
	Degree: 44.0
Name: ILLENIUM
	Predicted: e

0.8877338877338877

In [230]:
row_sums = torch.sum(to_dense_adj(data.edge_index), dim=1)
max_sum_index = torch.argmax(row_sums)
print(ind_to_name[str(max_sum_index.item())], row_sums.flatten()[max_sum_index.item()].item())

Steve Aoki 131.0
