## Demo: training the dual-attention model

In [22]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn
from tqdm import tqdm
from collections import Counter
from tokenizer import Tokenizer
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score
from dual_attn import DualAttnModel

# Load the training pages and discard rows with missing values or exact duplicates.
data = pd.read_csv('listed_web_train.csv')
data = data.dropna().drop_duplicates()
# Deduplicate the '|'-separated tokens of every page. dict.fromkeys keeps the
# first occurrence of each token at its original position, so the output is
# identical on every run; going through set() would reorder the tokens
# according to string hashing, which differs between interpreter sessions.
cleaned_content = [
    '|'.join(dict.fromkeys(text.split('|'))) for text in data.cleaned_content
]
data['cleaned_content'] = cleaned_content
data

### Data preprocessing

In [2]:
# Keep at most `max_page` pages per company, then drop pages that are too short.
hojin_ids = list(set(data.hojin_id))

max_page = 32

# Group the frame once and concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies everything accumulated so far on every
# iteration, and filtering `data` per company rescans the whole frame each time.
pages_by_company = data.groupby('hojin_id')
capped_pages = [
    # iloc[:max_page] is a no-op for companies with max_page rows or fewer.
    pages_by_company.get_group(hojin_id).iloc[:max_page]
    for hojin_id in hojin_ids
]
if capped_pages:
    sample_data = pd.concat(capped_pages, ignore_index=True)
else:
    # No companies at all: keep an empty frame that still has the columns.
    sample_data = data.iloc[:0].copy()

# Number of '|'-separated tokens on each page; pages with 5 tokens or fewer
# are discarded.
num_words = [len(text.split('|')) for text in sample_data.cleaned_content]
sample_data['num_words'] = num_words
sample_data = sample_data[sample_data.num_words > 5]

#### Load pretrained word vectors

In [3]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open('wv_dict_listed.pkl', 'rb') as fp:
    wv_dict = pickle.load(fp)

# Mapping word -> vector; keys and values are read in the same (insertion) order.
vectors = np.array(list(wv_dict.values()))
words = list(wv_dict.keys())
# Prepend an all-zero row so that every word's row index is shifted by one and
# index 0 is left without a word (id 0 is what the tokenization cell counts as
# padding). The row width is taken from the loaded vectors instead of being
# hard-coded, so dictionaries with another embedding size also work.
vectors_all = np.vstack([np.zeros(vectors.shape[1]), vectors])
vectors_all = torch.tensor(vectors_all)

#### Tokenization

In [6]:
# Distinct company ids that survived the filtering, converted to plain ints.
hojin_ids = [int(company) for company in set(sample_data.hojin_id)]

tokenizer = Tokenizer(words, max_len=864, data=sample_data)

# One encoded web portfolio per company (tqdm only draws the progress bar).
# NOTE(review): element [0] is used as the page count and element [1] as the
# token ids below -- confirm against Tokenizer.encode_webportfolio.
web_vectors = [
    tokenizer.encode_webportfolio(company_id=idx, max_page=max_page)
    for idx in tqdm(hojin_ids)
]

num_pages = torch.tensor([encoded[0] for encoded in web_vectors])
seq_ids = torch.tensor([encoded[1] for encoded in web_vectors])
# Sequence length = max_len minus the number of zero ids along the last axis.
seq_lengths = tokenizer.max_len - (seq_ids == 0).sum(dim=-1)

labels = torch.tensor([tokenizer.get_label(company) for company in hojin_ids])
hojin_ids = torch.tensor(hojin_ids)

100%|██████████| 3829/3829 [00:11<00:00, 324.16it/s]


In [7]:
# Number of companies per mini-batch.
batch_size = 8
# Each batch yields its tensors in this order:
# token ids, page counts, sequence lengths, labels, company ids.
dataset = TensorDataset(
    seq_ids,
    num_pages,
    seq_lengths,
    labels,
    hojin_ids,
)
# No shuffle argument is given, so batches follow the dataset order.
train_dataloader = DataLoader(dataset=dataset, batch_size=batch_size)

In [9]:
def evaluate(data_loader, model):
    """Compute and print the classification accuracy of `model` on `data_loader`.

    Args:
        data_loader: Iterable of batches of the form
            ``(seq_ids, num_pages, seq_lengths, labels, hojin_ids)``.
        model: Called as ``model(seq_ids, num_pages, seq_lengths)``. The first
            of its seven outputs is thresholded at 0.5 to obtain predictions.

    Returns:
        The fraction of samples whose thresholded prediction equals the label,
        or 0.0 when the loader yields no samples.

    Note:
        The model is switched to eval mode and left in that mode. Inputs are
        moved to the module-level ``device``.
    """
    model.eval()
    correct = 0
    total = 0
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # Passing the loader itself lets tqdm show the total when it has a length.
        for batch in tqdm(data_loader, ncols=80):
            seq_ids, num_pages, seq_lengths, label_list, _ = batch
            outputs, _, _, _, _, _, _ = model(
                seq_ids.to(device), num_pages.to(device), seq_lengths.to(device)
            )
            # reshape(-1) keeps a 1-d tensor even for a batch of one sample,
            # where squeeze() would collapse it to a 0-d scalar.
            preds = (outputs > 0.5).reshape(-1).cpu()
            targets = label_list.bool().reshape(-1)

            correct += (preds == targets).sum().item()
            # Count the real number of samples: the last batch may be smaller
            # than the loader's batch size.
            total += targets.numel()
    accuracy = correct / total if total else 0.0
    print('Evaluation accuracy:', accuracy)
    return accuracy

#### Model

In [13]:
# Rebuild the embedding matrix as a NumPy array: one all-zero 300-d row at
# index 0, followed by the rows of the pretrained vectors.
words = list(wv_dict.keys())
vectors = np.array(list(wv_dict.values()))
vectors_all = np.vstack([np.zeros(300), vectors])

# Seed the RNG before the model is constructed.
torch.manual_seed(1218)
loss_function = nn.BCELoss()
# Not referenced again in this cell; the constructor call below passes literal 10s.
scale = 10

# vocab_size is the number of words plus one for the extra zero row.
model = DualAttnModel(
    vocab_size=len(words) + 1,
    embed_dim=300,
    hidden_dim=300,
    label_dim=1,
    scale=10,
    page_scale=10,
)
model.load_vector(pretrained_vectors=vectors_all, trainable=True)
# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
optimizer = torch.optim.Adagrad(
    model.parameters(),
    lr=0.02,
    weight_decay=0.0000,
    lr_decay=0.001,
)

embeddings loaded


#### Training

In [14]:
# Train for 5 epochs and keep a snapshot of the weights from the epoch with
# the highest training accuracy.
best_model = None  # state_dict snapshot of the best epoch (None until one is recorded)
best_acc = 0
for epoch in range(5):
    print('Epoch:', epoch)
    print('#'*20)
    total_loss = 0
    count = 0
    model.train()  # evaluate() leaves the model in eval mode
    # Passing the loader itself lets tqdm show the total number of batches.
    for batch in tqdm(train_dataloader, ncols=80):
        # Batch-specific names: unpacking into seq_ids / num_pages /
        # seq_lengths here would overwrite the notebook-level full tensors
        # with the last mini-batch.
        batch_seq_ids, batch_num_pages, batch_seq_lengths, batch_labels, _ = batch
        model.zero_grad()
        preds, _, _, _, _, _, _ = model(
            batch_seq_ids.to(device),
            batch_num_pages.to(device),
            batch_seq_lengths.to(device),
        )
        # reshape(-1) keeps predictions and targets 1-d even for a batch of a
        # single sample, where squeeze() would yield a 0-d tensor that no
        # longer matches the target's shape.
        targets = batch_labels.to(device).float().reshape(-1)
        loss = loss_function(preds.reshape(-1), targets)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by the batch size so the epoch figure is
        # a per-sample average.
        total_loss += loss.item() * len(batch_seq_ids)
        count += len(batch_seq_ids)
    print('total_loss:  ', total_loss/count)
    print('Training Accuracy')
    train_acc = evaluate(train_dataloader, model)
    if train_acc > best_acc:
        best_acc = train_acc
        # Clone the tensors: state_dict() returns references to the live
        # parameters, which later optimizer steps keep modifying.
        best_model = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

0it [00:00, ?it/s]

Epoch: 0
####################


1it [00:03,  3.41s/it]

Sample Loss:0.693


190it [04:17,  1.28s/it]

KeyboardInterrupt: 