In [23]:
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
import time

from sklearn.metrics.pairwise import cosine_similarity
import ast

In [5]:
# Location of the embeddings dump. The ".gz" suffix lets pandas infer the
# compression on its own.
folder = '/dlabdata1/lugeon/'
name = 'websites_40000_5cat_emb.gz'

# header=0 together with `names` replaces the column names stored in the file;
# the first column of the file becomes the index.
data = pd.read_csv(
    f'{folder}{name}',
    names=['emb', 'len', 'cat0'],
    header=0,
    index_col=0,
)

In [6]:
# Preview the first rows to sanity-check the loaded columns (emb, len, cat0).
data.head()

Unnamed: 0,emb,len,cat0
0,"[-0.01415306854087751, 0.054038308285563406, -...",48215,Arts
1,"[0.04585096571180555, -0.018588595920138888, 0...",23,Arts
2,"[-0.009650490500710228, -0.02681940252130682, ...",24,Arts
3,,0,Arts
4,"[0.04253146113181601, 0.036278354878328284, 0....",683,Arts


In [8]:
# Keep only rows that have an embedding and whose `len` is at least 100.
# Both conditions are evaluated on the same frame and combined into one mask.
has_embedding = data['emb'].notnull()
long_enough = data['len'] >= 100

# .copy() detaches the result from the original frame, so the column
# assignments done in later cells do not trigger SettingWithCopyWarning.
data = data[has_embedding & long_enough].copy()

In [9]:
# Turn each stringified list of floats into a NumPy array.
# Mapping over the single column avoids materialising a full row object per
# record, which is what DataFrame.apply(axis=1) does.
# ast.literal_eval only evaluates Python literals, so it is safe on file content.
data['emb'] = data['emb'].map(lambda text: np.array(ast.literal_eval(text)))

In [11]:
# Number of rows and columns left after filtering.
data.shape

(105995, 3)

In [12]:
# Class balance: number of remaining rows per top-level category.
data['cat0'].value_counts()

Science      22686
Computers    22206
Arts         21176
Kids         20849
Sports       19078
Name: cat0, dtype: int64

In [13]:
def categorize(s):
    """Map a category name to its integer class id.

    The id is the position of the name in the fixed ordering
    Kids, Science, Arts, Computers, Sports (0 to 4).

    Returns None when `s` matches none of the five names.
    """
    ordered_labels = ('Kids', 'Science', 'Arts', 'Computers', 'Sports')
    return next(
        (class_id for class_id, label in enumerate(ordered_labels) if s == label),
        None,
    )

In [14]:
# Integer class id for every row, derived from the category name.
# Series.map applies `categorize` to the one column it needs instead of
# building a row object per record via DataFrame.apply(axis=1).
data['cat_no'] = data['cat0'].map(categorize)

In [15]:
# Join all per-row embedding arrays end to end into one flat array;
# a later cell reshapes it to one row per sample.
embeddings = np.concatenate(data.emb.values)

In [16]:
# Length of each embedding vector. It must match the real vector length,
# otherwise the reshape below would silently misalign the samples.
embedding_dim = 300

# Build a float32 tensor with one row per sample from the flat array.
train_input = torch.tensor(embeddings, dtype=torch.float32).reshape(-1, embedding_dim)

In [17]:
# Sanity check: expect (number of samples, embedding_dim).
train_input.shape

torch.Size([105995, 300])

In [18]:
# Integer class ids as a 64-bit integer tensor, the target type used with
# nn.CrossEntropyLoss for class-index targets.
cat_no = data['cat_no'].values
train_target = torch.tensor(cat_no, dtype=torch.long)

In [19]:
# Sanity check: one target per sample.
train_target.shape

torch.Size([105995])

In [20]:
# Random train/test split over sample positions.
# A dedicated, seeded generator makes the split (and so every metric reported
# below) reproducible across kernel restarts without touching NumPy's global
# random state. The index array is no longer called `id`, which shadowed the builtin.
split_seed = 0
nb_samples = train_input.shape[0]
shuffled_idx = np.random.RandomState(split_seed).permutation(nb_samples)

training_set_size = 90_000
# Fail loudly if filtering ever leaves too few rows for a non-empty test set.
assert training_set_size < nb_samples, "training_set_size leaves no samples for the test set"

tr_id = shuffled_idx[:training_set_size]
te_id = shuffled_idx[training_set_size:]

train_input_ = train_input[tr_id]
test_input_ = train_input[te_id]

train_target_ = train_target[tr_id]
test_target_ = train_target[te_id]

In [21]:
# Class balance of the training split (tr_id holds row positions, hence iloc).
data['cat0'].iloc[tr_id].value_counts()

Science      19159
Computers    18912
Arts         17990
Kids         17691
Sports       16248
Name: cat0, dtype: int64

In [22]:
# Class balance of the held-out split (te_id holds row positions, hence iloc).
data['cat0'].iloc[te_id].value_counts()

Science      3527
Computers    3294
Arts         3186
Kids         3158
Sports       2830
Name: cat0, dtype: int64

In [27]:
class Classifier(nn.Module):
    """Fully connected network mapping an embedding vector to class scores.

    Layout: fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> ReLU -> fc4. The forward pass
    returns the raw output of the last linear layer (no softmax is applied).

    Args:
        input_dim: size of each input vector (default 300).
        hidden_dim: output width of fc1 and fc2 (default 300).
        bottleneck_dim: output width of fc3 (default 100).
        nb_classes: number of scores produced per sample (default 5).
        dropout: probability given to the `drop` module (default 0.1).
            The module is created but is not applied in `forward`.

    Calling `Classifier()` with no arguments builds the same layers as before.
    """

    def __init__(self, input_dim=300, hidden_dim=300, bottleneck_dim=100,
                 nb_classes=5, dropout=0.1):
        super().__init__()
        # Layers are created in a fixed order so that seeded initialisation
        # stays the same for the default configuration.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, bottleneck_dim)
        self.fc4 = nn.Linear(bottleneck_dim, nb_classes)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return one row of class scores for each row of `x`."""
        x = self.fc1(x)
        x = self.fc2(F.relu(x))
        x = self.fc3(F.relu(x))
        return self.fc4(F.relu(x))

In [25]:
def accuracy(output, target):
    """Fraction of rows of `output` whose highest-scoring column equals `target`.

    Args:
        output: 2-D tensor with one row of class scores per sample.
        target: 1-D tensor of class indices, one per sample.

    Returns:
        Number of correct predictions divided by the number of rows, as a float.
    """
    # The predicted class is the column holding the largest score.
    predicted = output.argmax(dim=1)
    hits = int(predicted.eq(target).sum())
    return hits / len(output)


In [29]:
epochs = 400
batch_size = 64

model = Classifier()

# Loss: cross-entropy computed directly on the raw class scores from the model
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam with a learning rate of 1e-4
optimizer = optim.Adam(model.parameters(), 1e-4)

# Training the model
model.train(True)

for e in range(epochs):

    # One pass over the training set in mini-batches. split() yields consecutive
    # slices, so the batch order is the same in every epoch.
    for batch_input, batch_target in zip(train_input_.split(batch_size), train_target_.split(batch_size)):

        output = model(batch_input)
        loss = criterion(output, batch_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # End-of-epoch evaluation on the full train and test sets.
    # no_grad() stops autograd from recording a graph for these large forward
    # passes, which are only used for reporting.
    model.train(False)
    with torch.no_grad():
        tr_output = model(train_input_)
        te_output = model(test_input_)
        tr_loss = criterion(tr_output, train_target_)
    tr_acc = accuracy(tr_output, train_target_)
    te_acc = accuracy(te_output, test_target_)
    model.train(True)
    print("Epoch {}".format(e) +
          " | Train loss : {:.3f}".format(tr_loss) +
          " | Train accuracy : {:.3f}".format(tr_acc) +
          " | Test accuracy : {:.3f}".format(te_acc))

Epoch 0 | Train loss : 0.848 | Train accuracy : 0.689 | Test accuracy : 0.681
Epoch 1 | Train loss : 0.790 | Train accuracy : 0.710 | Test accuracy : 0.703
Epoch 2 | Train loss : 0.760 | Train accuracy : 0.719 | Test accuracy : 0.712
Epoch 3 | Train loss : 0.740 | Train accuracy : 0.728 | Test accuracy : 0.718
Epoch 4 | Train loss : 0.725 | Train accuracy : 0.733 | Test accuracy : 0.723
Epoch 5 | Train loss : 0.711 | Train accuracy : 0.737 | Test accuracy : 0.727
Epoch 6 | Train loss : 0.697 | Train accuracy : 0.743 | Test accuracy : 0.730
Epoch 7 | Train loss : 0.691 | Train accuracy : 0.745 | Test accuracy : 0.731
Epoch 8 | Train loss : 0.684 | Train accuracy : 0.747 | Test accuracy : 0.732
Epoch 9 | Train loss : 0.677 | Train accuracy : 0.751 | Test accuracy : 0.734
Epoch 10 | Train loss : 0.662 | Train accuracy : 0.757 | Test accuracy : 0.739
Epoch 11 | Train loss : 0.664 | Train accuracy : 0.756 | Test accuracy : 0.736
Epoch 12 | Train loss : 0.655 | Train accuracy : 0.759 | Test 

KeyboardInterrupt: 