In [1]:
import numpy as np
import pandas as pd

import requests
import csv
import time
from bs4 import BeautifulSoup

import torch
from transformers import BertTokenizer, BertModel

from progressbar import ProgressBar
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

import ast

In [3]:
# Location of the compressed CSV holding the websites and their embeddings.
folder = '/dlabdata1/lugeon/'
name = 'websites_10_000_5cat_emb'
ext = '.gz'

# The file's own header row (header=0) is replaced by `names`; `emb` is kept
# as a raw string at this stage.
data = pd.read_csv(
    f'{folder}{name}{ext}',
    header=0,
    names=['id', 'url', 'cat0', 'emb'],
    dtype=dict(id=np.int32, url=str, cat0=str, emb=str),
)

In [4]:
# Each `emb` cell is read as a string; parse it with ast.literal_eval (no
# arbitrary code execution) and store the result as a NumPy array.
# Series.map works on the single column directly instead of materialising a
# row object per record as the row-wise DataFrame.apply(axis=1) did.
# NOTE: this cell is not idempotent -- once `emb` holds arrays, re-running it
# fails because literal_eval rejects non-string input.
data['emb'] = data['emb'].map(lambda raw: np.array(ast.literal_eval(raw)))

In [5]:
# Preview the first rows to check the columns after parsing.
data.head()

Unnamed: 0,id,url,cat0,emb
0,133292,http://www.imdb.com/title/tt0044207/,Arts,"[0.19638090891848234, -0.2036611011082476, 0.1..."
1,126451,http://www.missgien.net/misc/films/robin.html,Arts,"[-0.46314648985862733, -0.10842716749757528, 0..."
2,252430,http://www.ppmag.com,Arts,"[-0.0398584817137037, -0.30310366010027273, -0..."
4,125161,http://www.hollywoodjesus.com/powder.htm,Arts,"[0.018674095170960452, -0.03543987311422825, 0..."
6,258127,http://www.kspq.com/,Arts,"[0.009379717521369457, -0.16280843317508698, 0..."


In [6]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(23188, 4)

In [7]:
# Number of rows per `cat0` label, to inspect class balance.
data.cat0.value_counts()

Science      6513
Kids         6204
Arts         5660
Sports       3154
Computers    1657
Name: cat0, dtype: int64

In [8]:
def categorize(s):
    """Map a category name to its integer class index.

    Labels are numbered in the fixed order Kids, Science, Arts, Computers,
    Sports (0 through 4). Any other value matches nothing and yields ``None``.
    """
    labels = ('Kids', 'Science', 'Arts', 'Computers', 'Sports')
    for index, label in enumerate(labels):
        if s == label:
            return index
    return None

In [9]:
# Encode the string label of every row as its integer class index.
# Series.map applies categorize to the `cat0` column directly rather than
# going through a row-wise DataFrame.apply(axis=1).
data['cat_no'] = data['cat0'].map(categorize)

In [10]:
# Join the per-row embedding arrays end to end along axis 0; the result is
# reshaped into one row per sample further down.
embeddings = np.concatenate(data['emb'].tolist())

In [12]:
embedding_dim = 768
# Fold the concatenated values into rows of `embedding_dim` features each and
# cast to float32.
train_input = torch.tensor(embeddings).reshape(-1, embedding_dim).float()

In [13]:
# Sanity check: expect (number of samples, embedding_dim).
train_input.shape

torch.Size([23188, 768])

In [14]:
# Integer class indices as an int64 tensor, one entry per sample.
cat_no = data['cat_no'].values
train_target = torch.tensor(cat_no).to(torch.long)

In [15]:
# Sanity check: one target per sample.
train_target.shape

torch.Size([23188])

In [16]:
# Shuffle the row positions in place; the first 20,000 form the training
# split and the remainder is held out for testing.
# NOTE: `id` shadows the Python builtin of the same name.
id = np.arange(len(train_input))
np.random.shuffle(id)
tr_id, te_id = id[:20_000], id[20_000:]

# Inputs and targets are indexed with the same positions so they stay aligned.
train_input_, test_input_ = train_input[tr_id], train_input[te_id]
train_target_, test_target_ = train_target[tr_id], train_target[te_id]

In [17]:
# Number of rows per `cat0` label among the rows selected for training.
data.iloc[tr_id].cat0.value_counts()

Science      5560
Kids         5404
Arts         4918
Sports       2703
Computers    1415
Name: cat0, dtype: int64

In [18]:
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
import time


In [19]:
class Classifier(nn.Module):
    """Fully connected classifier with two hidden layers.

    Architecture: Linear -> Dropout -> ReLU -> Linear -> Dropout -> ReLU ->
    Linear. The output is the raw per-class scores (no softmax is applied).

    Args:
        input_dim: number of input features per sample (default 768).
        hidden_dim1: width of the first hidden layer (default 200).
        hidden_dim2: width of the second hidden layer (default 20).
        nb_classes: number of output classes (default 5).
        dropout: dropout probability used after both hidden layers
            (default 0.5).

    Calling ``Classifier()`` with no arguments builds exactly the network
    that was previously hard-coded.
    """

    def __init__(self, input_dim=768, hidden_dim1=200, hidden_dim2=20,
                 nb_classes=5, dropout=0.5):
        super().__init__()
        # Attribute names and creation order are kept so state dicts and
        # seeded initialisation stay compatible with the original definition.
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, nb_classes)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return per-class scores for a batch `x` of shape (..., input_dim)."""
        x = self.drop(self.fc1(x))
        x = self.drop(self.fc2(F.relu(x)))
        return self.fc3(F.relu(x))

In [20]:
def accuracy(output, target):
    """Return the fraction of rows of `output` whose arg-max equals `target`.

    output: tensor of per-class scores, one row per sample.
    target: tensor of class indices, one per row of `output`.
    """
    # The predicted class of each row is the index of its largest score.
    predictions = output.argmax(dim=1)
    hits = predictions.eq(target).sum().item()
    return hits / output.size(0)


In [21]:
epochs = 200
batch_size = 64

model = Classifier()

# Loss
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), 1e-3)

# Training the model
model.train()

for e in range(epochs):

    # One pass over the training split in fixed-order mini-batches.
    # (`inputs` rather than `input`, so the builtin is not shadowed.)
    for inputs, target in zip(train_input_.split(batch_size),
                              train_target_.split(batch_size)):
        output = model(inputs)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate with dropout disabled. no_grad() stops autograd from building
    # a graph over the full train/test sets, which is only needed for
    # backward() and would otherwise waste memory every epoch.
    model.eval()
    with torch.no_grad():
        tr_output = model(train_input_)
        te_output = model(test_input_)
        tr_loss = criterion(tr_output, train_target_)
    tr_acc = accuracy(tr_output, train_target_)
    te_acc = accuracy(te_output, test_target_)
    # Back to training mode for the next epoch (and after the final one).
    model.train()

    print("Epoch {}".format(e) +
          " | Train loss : {:.3f}".format(tr_loss) +
          " | Train accuracy : {:.3f}".format(tr_acc) +
          " | Test accuracy : {:.3f}".format(te_acc))

Epoch 0 | Train loss : 1.127 | Train accuracy : 0.583 | Test accuracy : 0.586
Epoch 1 | Train loss : 1.029 | Train accuracy : 0.610 | Test accuracy : 0.611
Epoch 2 | Train loss : 0.985 | Train accuracy : 0.629 | Test accuracy : 0.622
Epoch 3 | Train loss : 0.958 | Train accuracy : 0.635 | Test accuracy : 0.629
Epoch 4 | Train loss : 0.947 | Train accuracy : 0.640 | Test accuracy : 0.626
Epoch 5 | Train loss : 0.935 | Train accuracy : 0.648 | Test accuracy : 0.636
Epoch 6 | Train loss : 0.919 | Train accuracy : 0.650 | Test accuracy : 0.638
Epoch 7 | Train loss : 0.904 | Train accuracy : 0.655 | Test accuracy : 0.637
Epoch 8 | Train loss : 0.896 | Train accuracy : 0.657 | Test accuracy : 0.636
Epoch 9 | Train loss : 0.889 | Train accuracy : 0.658 | Test accuracy : 0.641
Epoch 10 | Train loss : 0.879 | Train accuracy : 0.660 | Test accuracy : 0.640
Epoch 11 | Train loss : 0.868 | Train accuracy : 0.665 | Test accuracy : 0.644
Epoch 12 | Train loss : 0.857 | Train accuracy : 0.671 | Test 