In [1]:
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
import time

from sklearn.metrics.pairwise import cosine_similarity
import ast

In [2]:
# Location of the gzipped CSV holding one row per website:
# its embedding (as text), a length column and its top-level category.
folder = '/dlabdata1/lugeon/'
name = 'websites_1000_5cat_emb.gz'

# header=0 marks the first line of the file as a header row, which is then
# replaced by the explicit column names passed in `names`.
data = pd.read_csv(
    f'{folder}{name}',
    names=['emb', 'len', 'cat0'],
    index_col=0,
    header=0,
)

In [3]:
# Preview the first rows to check how the columns were parsed.
data.head()

Unnamed: 0,emb,len,cat0
0,,0,Arts
4,"[0.06388844242802373, 0.022353509973596645, 0....",814,Arts
5,"[0.04059564366060145, 0.030232969452353084, 0....",340,Arts
6,"[-0.05609130859375, 0.06304931640625, -0.08801...",5,Arts
7,"[-0.012737358085363795, 0.01617739857106969, 0...",584,Arts


In [4]:
# Drop rows that have no embedding. `.copy()` makes the result an independent
# frame, so the column assignments done in later cells write into it directly
# rather than into a slice of the original frame (which raises pandas'
# SettingWithCopyWarning).
data = data[data.emb.notnull()].copy()
# Optional stricter filter, currently disabled: keep only rows with len >= 100.
#data = data[data.len >= 100]

In [5]:
# Parse every embedding from its textual list literal into a NumPy array.
# Only the `emb` column is needed, so map over that Series instead of applying
# a function row by row over the whole frame: Series.map always returns a
# Series and avoids building a temporary Series object for each row.
data['emb'] = data['emb'].map(lambda raw: np.array(ast.literal_eval(raw)))

In [6]:
# (rows, columns) of the frame after dropping rows without an embedding.
data.shape

(2653, 3)

In [7]:
# Number of samples per category, to check class balance.
data.cat0.value_counts()

Computers    626
Arts         531
Science      519
Kids         503
Sports       474
Name: cat0, dtype: int64

In [8]:
def categorize(s):
    """Map a category name to its integer class index.

    The index of a label is its position in the fixed ordering
    Kids, Science, Arts, Computers, Sports (0 to 4).

    Args:
        s: category name to encode.

    Returns:
        The integer class index, or None when `s` matches none of the
        known category names.
    """
    ordered_labels = ('Kids', 'Science', 'Arts', 'Computers', 'Sports')
    for class_index, label in enumerate(ordered_labels):
        if s == label:
            return class_index
    return None

In [9]:
# Encode each category name as its integer class index.
# The encoding depends on the `cat0` column only, so map over that Series
# rather than applying a function row by row over the whole frame.
data['cat_no'] = data['cat0'].map(categorize)

In [10]:
# Join all per-row embedding arrays end to end into one array;
# it is reshaped to one row per sample when the input tensor is built.
embeddings = np.concatenate(data['emb'].tolist())

In [11]:
# Number of components in each embedding vector.
embedding_dim = 300
# Arrange the concatenated values as one row of `embedding_dim` values per
# sample and cast to float32, the dtype of the model's parameters.
train_input = torch.tensor(embeddings).reshape(-1, embedding_dim).float()

In [12]:
# Sanity check: expected shape is (number of samples, embedding_dim).
train_input.shape

torch.Size([2653, 300])

In [13]:
# Integer class indices as a NumPy array, then as an int64 tensor
# (the target dtype used with nn.CrossEntropyLoss for class indices).
cat_no = data['cat_no'].values
train_target = torch.tensor(cat_no).to(torch.long)

In [14]:
# Sanity check: one label per sample.
train_target.shape

torch.Size([2653])

In [15]:
# Random train/test split over row positions.
#
# A dedicated, seeded RandomState is used so that the split (and therefore
# every metric reported below) is the same after "Restart & Run All", without
# modifying NumPy's global random state.
split_seed = 0
n_samples = train_input.shape[0]

training_set_size = 2_200

# An empty test split would make the accuracy computation divide by zero,
# so fail early with a clear message if the dataset is too small.
assert 0 < training_set_size < n_samples, (
    "training_set_size must be positive and leave at least one test sample"
)

# Named `shuffled_ids` rather than `id` so the builtin `id()` is not shadowed.
shuffled_ids = np.random.RandomState(split_seed).permutation(n_samples)

tr_id = shuffled_ids[:training_set_size]
te_id = shuffled_ids[training_set_size:]

train_input_ = train_input[tr_id]
test_input_ = train_input[te_id]

train_target_ = train_target[tr_id]
test_target_ = train_target[te_id]

In [16]:
# Class distribution of the rows selected for the training split.
data.iloc[tr_id].cat0.value_counts()

Computers    510
Arts         443
Science      425
Kids         424
Sports       398
Name: cat0, dtype: int64

In [17]:
# Class distribution of the rows selected for the test split.
data.iloc[te_id].cat0.value_counts()

Computers    116
Science       94
Arts          88
Kids          79
Sports        76
Name: cat0, dtype: int64

In [18]:
class Classifier(nn.Module):
    """Fully connected network mapping an embedding vector to class scores.

    Layer sizes: input_dim -> hidden_dim -> hidden_dim -> bottleneck_dim
    -> n_classes, with a ReLU between consecutive linear layers. The output
    is the raw result of the last linear layer (no softmax is applied).

    Args:
        input_dim: size of the input embedding (default 300).
        hidden_dim: output size of the first two linear layers (default 300).
        bottleneck_dim: output size of the third linear layer (default 100).
        n_classes: number of output classes (default 5).
        dropout: drop probability of the `drop` layer (default 0.1).
    """

    def __init__(self, input_dim=300, hidden_dim=300, bottleneck_dim=100,
                 n_classes=5, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, bottleneck_dim)
        self.fc4 = nn.Linear(bottleneck_dim, n_classes)
        # NOTE: this dropout layer is constructed but never applied in
        # forward(); the attribute is kept so code that accesses `model.drop`
        # keeps working.
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return class scores for `x`, whose last dimension is `input_dim`.

        The result has the same leading dimensions as `x` and a last
        dimension of size `n_classes`.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

In [19]:
def accuracy(output, target):
    """Fraction of samples whose highest-scoring class equals the target.

    Args:
        output: tensor with one row of per-class scores per sample; the
            predicted class is the argmax along dimension 1.
        target: tensor holding the true class index of each sample.

    Returns:
        Number of correct predictions divided by the number of rows in
        `output`, as a Python float.
    """
    total = output.shape[0]

    # The predicted class is the index of the largest score in each row.
    predicted = output.argmax(dim=1)

    hits = predicted.eq(target).sum().item()
    return hits / total


In [20]:
# Training hyper-parameters.
epochs = 400
batch_size = 64

# Seed PyTorch before building the model so the initial weights, and hence
# the whole training curve, are reproducible after a kernel restart.
torch.manual_seed(0)
model = Classifier()

# Loss
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training the model
model.train()

for e in range(epochs):

    # Mini-batch pass over the training split. Batches are consecutive
    # slices of the split, visited in the same order every epoch.
    # The loop variables are not called `input`/`target` because `input`
    # would shadow the Python builtin for the rest of the notebook.
    for batch_input, batch_target in zip(train_input_.split(batch_size),
                                         train_target_.split(batch_size)):

        output = model(batch_input)
        loss = criterion(output, batch_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Per-epoch metrics on the full train and test splits. These forward
    # passes are for reporting only, so they run under no_grad() to avoid
    # building an autograd graph over the whole dataset at every epoch.
    model.eval()
    with torch.no_grad():
        tr_output = model(train_input_)
        te_output = model(test_input_)
        tr_loss = criterion(tr_output, train_target_)
    tr_acc = accuracy(tr_output, train_target_)
    te_acc = accuracy(te_output, test_target_)
    model.train()

    print("Epoch {}".format(e) +\
          " | Train loss : {:.3f}".format(tr_loss) +\
          " | Train accuracy : {:.3f}".format(tr_acc) +\
          " | Test accuracy : {:.3f}".format(te_acc))

Epoch 0 | Train loss : 1.603 | Train accuracy : 0.345 | Test accuracy : 0.380
Epoch 1 | Train loss : 1.594 | Train accuracy : 0.299 | Test accuracy : 0.344
Epoch 2 | Train loss : 1.571 | Train accuracy : 0.374 | Test accuracy : 0.400
Epoch 3 | Train loss : 1.519 | Train accuracy : 0.389 | Test accuracy : 0.417
Epoch 4 | Train loss : 1.436 | Train accuracy : 0.466 | Test accuracy : 0.457
Epoch 5 | Train loss : 1.360 | Train accuracy : 0.474 | Test accuracy : 0.448
Epoch 6 | Train loss : 1.301 | Train accuracy : 0.505 | Test accuracy : 0.472
Epoch 7 | Train loss : 1.248 | Train accuracy : 0.528 | Test accuracy : 0.506
Epoch 8 | Train loss : 1.201 | Train accuracy : 0.545 | Test accuracy : 0.521
Epoch 9 | Train loss : 1.161 | Train accuracy : 0.568 | Test accuracy : 0.530
Epoch 10 | Train loss : 1.127 | Train accuracy : 0.586 | Test accuracy : 0.543
Epoch 11 | Train loss : 1.098 | Train accuracy : 0.599 | Test accuracy : 0.550
Epoch 12 | Train loss : 1.073 | Train accuracy : 0.615 | Test 