In [None]:
from libs.feature_extraction.vectorizers import vectorizers
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [None]:
vectorizers

In [None]:
embedding_paths = json.load(open('embedding_paths.json', 'r'))

In [None]:
vectors = {
    vectorizer_name: vectorizer.load_vectors(embedding_paths[vectorizer_name])
    for vectorizer_name, vectorizer in vectorizers.items()
}    

In [None]:
d2v = vectors["doc2vec"]
w2v = vectors["word2vec-mean"]
bert = vectors["bert"]

In [None]:
stacked = np.hstack([d2v, w2v, bert])

In [None]:
stacked.shape

In [None]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = y.float()
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [None]:
scaler = MinMaxScaler()
X = stacked
X = scaler.fit_transform(X)

res = pd.read_csv("./evaluation/speed/cosine_bleu_C10_.csv")
labels = res["fastest"].values
labels.shape

le = LabelEncoder()
y = le.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = torch.from_numpy(y_train).long()
y_test = torch.from_numpy(y_test).long()
y_train = nn.functional.one_hot(y_train, num_classes=5)
y_test = nn.functional.one_hot(y_test, num_classes=5)


# Create PyTorch dataloaders for the training and testing sets
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [None]:
# Define the neural network architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(1368, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 5)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.softmax(self.fc4(x))
        return x

net = Net().to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.0002, momentum=0.9)

# Train the neural network
for epoch in range(20):
    running_loss = 0.0
    for data in train_dataloader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Print the average loss for this epoch
    print('Epoch %d, training loss: %.3f' %
          (epoch + 1, running_loss / len(train_dataloader)))

    # Evaluate the neural network on the test set
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_dataloader:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            _, predicted = torch.max(outputs.data, 1)
            _, labels = torch.max(labels, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        print('Epoch %d, test accuracy: %f%%' % (epoch + 1, 100 * correct / total))

In [74]:
y_all = nn.functional.one_hot(torch.from_numpy(y).long(), num_classes=5)
all_ds = CustomDataset(X, y_all)
all_dataloader = DataLoader(all_ds, batch_size=32, shuffle=False)

In [75]:
correct = 0
total = 0
with torch.no_grad():
    for data in all_dataloader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        _, labels = torch.max(labels, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Total accuracy: %f%%' % (100 * correct / total))

Epoch 20, test accuracy: 27.597444%
