In [7]:
import numpy as np
import pandas as pd


import torch

from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm
import ast

In [45]:
# Location of the gzipped CSV holding one embedding string and one category
# label (`cat0`) per row.
folder = '/dlabdata1/lugeon/'
name = 'websites_40000_5cat_emb.gz'

# header=0 consumes the file's own header row and `names` replaces it;
# the first column of the file becomes the row index.
data = pd.read_csv(
    folder + name,
    names=['emb', 'cat0'],
    index_col=0,
    header=0,
)

In [46]:
# Keep only the rows that have an embedding. The explicit .copy() makes `data`
# an independent frame, so assigning columns to it afterwards does not raise
# SettingWithCopyWarning.
data = data[data['emb'].notnull()].copy()

In [47]:
# Parse every embedding from its string form (a Python list literal) into a
# NumPy array. Mapping over the single column avoids materialising each row as
# a Series, which is what DataFrame.apply(axis=1) does just to read one field.
data['emb'] = data['emb'].map(lambda text: np.array(ast.literal_eval(text)))

In [48]:
# Preview the first rows; `emb` now holds parsed arrays instead of strings.
data.head()

Unnamed: 0,emb,cat0
0,"[-0.0009275737561677632, -0.02362455401504249,...",Arts
1,"[0.04585096571180555, -0.018588595920138888, 0...",Arts
2,"[-0.009650490500710228, -0.02681940252130682, ...",Arts
4,"[0.04253146113181601, 0.036278354878328284, 0....",Arts
5,"[-0.07145182291666667, 0.0058917999267578125, ...",Arts


In [49]:
# (rows, columns) after dropping the rows that had no embedding.
data.shape

(150192, 2)

In [51]:
# Number of rows per category label, to check how balanced the classes are.
data.cat0.value_counts()

Science      31772
Computers    31235
Kids         30741
Arts         29484
Sports       26960
Name: cat0, dtype: int64

In [52]:
def categorize(s):
    """Map a category name to its integer class id.

    The ids follow the position of the name in
    ('Kids', 'Science', 'Arts', 'Computers', 'Sports'), i.e. 0 to 4.
    Any other value yields None.
    """
    ordered_labels = ('Kids', 'Science', 'Arts', 'Computers', 'Sports')
    for class_id, label in enumerate(ordered_labels):
        if s == label:
            return class_id
    return None

In [53]:
# Integer class id for every row, derived from the category name in `cat0`.
# Series.map works on that one column directly instead of building a Series
# for each row the way DataFrame.apply(axis=1) does.
data['cat_no'] = data['cat0'].map(categorize)

In [54]:
# Join all per-row embedding arrays end to end into a single array
# (it is reshaped back into one row per sample in a later cell).
embeddings = np.concatenate(list(data['emb']))

In [55]:
embedding_dim = 300
# Build the tensor, fold it into rows of `embedding_dim` values each, and
# convert it with .float().
train_input = torch.tensor(embeddings).reshape(-1, embedding_dim).float()

In [56]:
# Sanity check: expect (number of samples, embedding_dim).
train_input.shape

torch.Size([150192, 300])

In [57]:
# Class ids as a long (int64) tensor, used as the targets for the loss.
cat_no = data['cat_no'].values
train_target = torch.tensor(cat_no).to(torch.long)

In [58]:
# Sanity check: one target per sample, so this should match train_input.shape[0].
train_target.shape

torch.Size([150192])

In [59]:
# Fixed seed so the train/test split is identical on every run.
split_seed = 0
# Number of samples assigned to the training split; the rest form the test split.
n_train = 140_000

n_samples = train_input.shape[0]
# Fail early instead of silently producing an empty train or test split.
assert 0 < n_train < n_samples, "n_train must leave samples on both sides of the split"

# Random ordering of all sample positions. A dedicated RandomState keeps the
# split independent of the global NumPy RNG; the name `perm` (rather than `id`)
# avoids shadowing the builtin id().
perm = np.random.RandomState(split_seed).permutation(n_samples)

tr_id = perm[:n_train]
te_id = perm[n_train:]

train_input_ = train_input[tr_id]
test_input_ = train_input[te_id]

train_target_ = train_target[tr_id]
test_target_ = train_target[te_id]

In [60]:
# Class balance of the training split (tr_id holds positional row indices).
data.iloc[tr_id].cat0.value_counts()

Science      29651
Computers    29084
Kids         28660
Arts         27486
Sports       25119
Name: cat0, dtype: int64

In [61]:
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
import time


In [65]:
class Classifier(nn.Module):
    """Fully connected classifier with two hidden layers.

    Layer order in the forward pass:
    Linear -> Dropout -> ReLU -> Linear -> Dropout -> ReLU -> Linear.
    The output is the raw result of the last linear layer; no softmax is
    applied here.

    Args:
        input_dim: number of features in each input row.
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        num_classes: number of output scores per row.
        dropout: drop probability of the dropout layer shared by both
            hidden stages.

    The defaults build the 300 -> 100 -> 20 -> 5 network with dropout 0.1, so
    ``Classifier()`` behaves exactly as before the sizes were parameterised.
    """

    def __init__(self, input_dim=300, hidden_dim1=100, hidden_dim2=20,
                 num_classes=5, dropout=0.1):
        super().__init__()
        # Attribute names and creation order are kept stable so that weight
        # initialisation and state_dict keys do not change.
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, num_classes)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return the output of the last linear layer for a batch ``x``.

        ``x`` must have ``input_dim`` features in its last dimension.
        """
        x = self.drop(self.fc1(x))
        x = self.drop(self.fc2(F.relu(x)))
        return self.fc3(F.relu(x))

In [63]:
def accuracy(output, target):
    """Return the fraction of rows whose highest-scoring column equals the target.

    ``output`` holds one row of per-class scores per sample and ``target``
    holds the expected class index of each sample.
    """
    total = output.shape[0]
    # The predicted class is the index of the largest score along dimension 1.
    predicted = output.argmax(dim=1)
    hits = (predicted == target).sum().item()
    return hits / total


In [None]:
epochs = 200
batch_size = 64

model = Classifier()

# Loss
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), 1e-3)

# Training the model
model.train()

for e in range(epochs):

    # One optimizer step per mini-batch. The loop variables are named
    # `batch_input` / `batch_target` so the builtin input() is not shadowed.
    for batch_input, batch_target in zip(train_input_.split(batch_size),
                                         train_target_.split(batch_size)):
        output = model(batch_input)
        loss = criterion(output, batch_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Per-epoch evaluation: switch to eval mode (dropout off) and disable
    # autograd so the full-dataset forward passes do not record a graph.
    model.eval()
    with torch.no_grad():
        tr_output = model(train_input_)
        te_output = model(test_input_)
        tr_loss = criterion(tr_output, train_target_)
        tr_acc = accuracy(tr_output, train_target_)
        te_acc = accuracy(te_output, test_target_)
    # Back to training mode for the next epoch.
    model.train()

    print("Epoch {} | Train loss : {:.3f} | Train accuracy : {:.3f} | Test accuracy : {:.3f}"
          .format(e, tr_loss, tr_acc, te_acc))

Epoch 0 | Train loss : 1.007 | Train accuracy : 0.612 | Test accuracy : 0.612
Epoch 1 | Train loss : 0.957 | Train accuracy : 0.628 | Test accuracy : 0.627
Epoch 2 | Train loss : 0.932 | Train accuracy : 0.636 | Test accuracy : 0.631
Epoch 3 | Train loss : 0.916 | Train accuracy : 0.640 | Test accuracy : 0.636
Epoch 4 | Train loss : 0.901 | Train accuracy : 0.643 | Test accuracy : 0.637
Epoch 5 | Train loss : 0.897 | Train accuracy : 0.644 | Test accuracy : 0.638
Epoch 6 | Train loss : 0.891 | Train accuracy : 0.646 | Test accuracy : 0.642
Epoch 7 | Train loss : 0.884 | Train accuracy : 0.649 | Test accuracy : 0.642
Epoch 8 | Train loss : 0.884 | Train accuracy : 0.648 | Test accuracy : 0.643
Epoch 9 | Train loss : 0.868 | Train accuracy : 0.651 | Test accuracy : 0.644
Epoch 10 | Train loss : 0.873 | Train accuracy : 0.650 | Test accuracy : 0.643
Epoch 11 | Train loss : 0.867 | Train accuracy : 0.653 | Test accuracy : 0.645
Epoch 12 | Train loss : 0.859 | Train accuracy : 0.656 | Test 