In [6]:
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
import time

from sklearn.metrics.pairwise import cosine_similarity
import ast

In [7]:
# Directory and file name of the embeddings CSV (the .gz suffix lets pandas
# infer the compression on its own).
folder = '/dlabdata1/lugeon/'
name = 'websites_40000_5cat_emb.gz'

# header=0 together with `names` replaces the file's own header row with the
# explicit column names; the first column of the file becomes the index.
csv_path = folder + name
data = pd.read_csv(csv_path, names=['emb', 'len', 'cat0'], header=0, index_col=0)

In [8]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,emb,len,cat0
0,"[-0.0009275737561677632, -0.02362455401504249,...",138,Arts
1,"[0.04585096571180555, -0.018588595920138888, 0...",23,Arts
4,"[0.04253146113181601, 0.036278354878328284, 0....",683,Arts
5,"[-0.07145182291666667, 0.0058917999267578125, ...",3,Arts
6,"[0.055974324544270836, 0.022883097330729168, -...",21,Arts


In [9]:
# Keep only the rows that have an embedding. The explicit .copy() makes `data`
# an independent frame rather than a slice of the loaded one, so columns can
# be added or overwritten on it without pandas raising SettingWithCopyWarning.
data = data[data.emb.notnull()].copy()
# Optional stricter filter (disabled): keep only rows whose `len` is >= 100.
#data = data[data.len >= 100]

In [10]:
# Each `emb` cell holds the textual form of a list of floats; parse it with
# ast.literal_eval (safe: literals only) and turn it into a NumPy vector.
# Mapping over the single column avoids materialising a full row Series per
# record, which is what DataFrame.apply(axis=1) does.
data['emb'] = data['emb'].map(lambda text: np.array(ast.literal_eval(text)))

In [11]:
# (rows, columns) remaining after dropping rows without an embedding.
data.shape

(103134, 3)

In [12]:
# Class balance check: number of rows per `cat0` label.
data.cat0.value_counts()

Computers    24464
Science      20931
Arts         20469
Kids         18884
Sports       18386
Name: cat0, dtype: int64

In [13]:
def categorize(s):
    """Translate a category name into its integer class index.

    The index is the position of the name in the fixed ordering
    Kids, Science, Arts, Computers, Sports (0 through 4).

    Args:
        s: category name to look up.

    Returns:
        The class index as an int, or None when `s` matches none of the
        five known names.
    """
    known_labels = ('Kids', 'Science', 'Arts', 'Computers', 'Sports')
    for class_index, label in enumerate(known_labels):
        if s == label:
            return class_index
    return None

In [14]:
# Integer class label for every row. Mapping the `cat0` column directly is
# equivalent to the row-wise DataFrame.apply(axis=1) but does not construct a
# Series object per row.
data['cat_no'] = data['cat0'].map(categorize)

In [15]:
# Lay all per-row embedding vectors end to end in one flat array.
embeddings = np.concatenate(data['emb'].tolist())

In [16]:
# Number of components in each embedding vector.
embedding_dim = 300
# Fold the embeddings into rows of `embedding_dim` values (the row count is
# inferred by -1) and cast to float32.
train_input = torch.tensor(embeddings).reshape(-1, embedding_dim).float()

In [17]:
# Sanity check: expect (number of rows in `data`, embedding_dim).
train_input.shape

torch.Size([103134, 300])

In [18]:
# Class indices as a NumPy array, then as an int64 tensor (the integer dtype
# nn.CrossEntropyLoss accepts for class-index targets).
cat_no = data['cat_no'].to_numpy()
train_target = torch.tensor(cat_no, dtype=torch.long)

In [19]:
# Sanity check: one label per row of train_input.
train_target.shape

torch.Size([103134])

In [20]:
# Random train/test split over the rows of train_input / train_target.
#
# A dedicated, seeded generator is used so that the split is the same on every
# run and neither depends on nor advances NumPy's global random state.
# The permutation is deliberately not stored under the name `id`, which would
# shadow the Python builtin for the rest of the notebook.
split_seed = 0
training_set_size = 90_000

n_samples = train_input.shape[0]
if not 0 < training_set_size < n_samples:
    # Slicing would otherwise silently produce an empty train or test set.
    raise ValueError(
        "training_set_size must be in (0, {}), got {}".format(n_samples, training_set_size)
    )

shuffled_idx = np.random.RandomState(split_seed).permutation(n_samples)

# First `training_set_size` shuffled positions train, the remainder tests.
tr_id = shuffled_idx[:training_set_size]
te_id = shuffled_idx[training_set_size:]

train_input_ = train_input[tr_id]
test_input_ = train_input[te_id]

train_target_ = train_target[tr_id]
test_target_ = train_target[te_id]

In [21]:
# Label distribution of the training rows; tr_id holds row positions, hence iloc.
data.iloc[tr_id].cat0.value_counts()

Computers    21288
Science      18325
Arts         17920
Kids         16490
Sports       15977
Name: cat0, dtype: int64

In [22]:
# Label distribution of the held-out rows; te_id holds row positions, hence iloc.
data.iloc[te_id].cat0.value_counts()

Computers    3176
Science      2606
Arts         2549
Sports       2409
Kids         2394
Name: cat0, dtype: int64

In [23]:
class Classifier(nn.Module):
    """Fully connected network with three ReLU hidden layers and a linear output.

    With the default arguments the layer sizes are 300 -> 300 -> 300 -> 100 -> 5,
    i.e. exactly the sizes that used to be hard-coded, so ``Classifier()``
    builds the same architecture as before.

    Args:
        input_dim: size of each input vector.
        hidden_dim: width of the first two hidden layers.
        bottleneck_dim: width of the third hidden layer.
        n_classes: number of output scores per sample.
        dropout: drop probability of ``self.drop``. The dropout module is
            created (and kept for compatibility) but is not applied in
            ``forward``.
    """

    def __init__(self, input_dim=300, hidden_dim=300, bottleneck_dim=100,
                 n_classes=5, dropout=0.1):
        super().__init__()
        # Layers are created in the same order as before so that a given
        # random seed initialises the same weights.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, bottleneck_dim)
        self.fc4 = nn.Linear(bottleneck_dim, n_classes)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return the unnormalised class scores for a batch ``x``.

        ``x`` must have ``input_dim`` values along its last dimension; the
        result has ``n_classes`` values along its last dimension. No softmax
        is applied.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

In [24]:
def accuracy(output, target):
    """Fraction of samples whose highest-scoring class equals the target.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        target: tensor of class indices, one per row of `output`.

    Returns:
        Number of correct predictions divided by the number of rows in
        `output`, as a Python float.
    """
    # The predicted class is the column holding the largest score in each row.
    predicted = output.argmax(dim=1)
    hits = predicted.eq(target).sum().item()
    return hits / output.shape[0]


In [None]:
# Training hyper-parameters.
epochs = 400
batch_size = 64

model = Classifier()

# Loss: cross-entropy computed from the model's raw scores and the integer
# class indices.
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam with learning rate 1e-4.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training the model
model.train()

for e in range(epochs):

    # One pass over the training set in consecutive, fixed-order mini-batches.
    # (Loop variables are not called `input`, which would shadow the builtin.)
    for batch_input, batch_target in zip(train_input_.split(batch_size),
                                         train_target_.split(batch_size)):
        output = model(batch_input)
        loss = criterion(output, batch_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # End-of-epoch evaluation on the full train and test sets. Gradients are
    # not needed here; torch.no_grad() stops autograd from recording a graph
    # for these two large forward passes.
    model.eval()
    with torch.no_grad():
        tr_output = model(train_input_)
        te_output = model(test_input_)
        tr_loss = criterion(tr_output, train_target_).item()
        tr_acc = accuracy(tr_output, train_target_)
        te_acc = accuracy(te_output, test_target_)
    model.train()

    print("Epoch {}".format(e) +
          " | Train loss : {:.3f}".format(tr_loss) +
          " | Train accuracy : {:.3f}".format(tr_acc) +
          " | Test accuracy : {:.3f}".format(te_acc))

Epoch 0 | Train loss : 0.910 | Train accuracy : 0.666 | Test accuracy : 0.662
Epoch 1 | Train loss : 0.838 | Train accuracy : 0.693 | Test accuracy : 0.687
Epoch 2 | Train loss : 0.798 | Train accuracy : 0.706 | Test accuracy : 0.699
Epoch 3 | Train loss : 0.774 | Train accuracy : 0.714 | Test accuracy : 0.707
Epoch 4 | Train loss : 0.756 | Train accuracy : 0.720 | Test accuracy : 0.711
Epoch 5 | Train loss : 0.742 | Train accuracy : 0.724 | Test accuracy : 0.714
Epoch 6 | Train loss : 0.729 | Train accuracy : 0.728 | Test accuracy : 0.718
Epoch 7 | Train loss : 0.719 | Train accuracy : 0.732 | Test accuracy : 0.722
Epoch 8 | Train loss : 0.710 | Train accuracy : 0.735 | Test accuracy : 0.723
Epoch 9 | Train loss : 0.702 | Train accuracy : 0.737 | Test accuracy : 0.723
Epoch 10 | Train loss : 0.693 | Train accuracy : 0.740 | Test accuracy : 0.725
Epoch 11 | Train loss : 0.686 | Train accuracy : 0.743 | Test accuracy : 0.727
Epoch 12 | Train loss : 0.680 | Train accuracy : 0.746 | Test 