In [1]:
import torch
import numpy as np
from transformers import BertTokenizer
from wikipedia2vec import Wikipedia2Vec
import time
import csv
import warnings
import logging

In [2]:
from utils import *
from data_processor import DataProcess
from model import Model, ModelConfig
from trainer import Trainer, TrainerConfig
from test_data_collator import TestDataCollator
from predictor import Predictor

In [3]:
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Root logging setup: "<timestamp> - <message>" lines, INFO level and above.
logging.basicConfig(level=logging.INFO,
                    datefmt="%m/%d/%Y %H:%M:%S",
                    format="%(asctime)s - %(message)s")

# Notebook-wide logger used by the timing and reporting cells further down.
logger = logging.getLogger(__name__)

In [4]:
# Fix the random seed for this run. `set_seed` is not imported by name, so it
# presumably comes from the wildcard `from utils import *` — TODO confirm which
# RNGs (python / numpy / torch) it actually seeds.
set_seed(42)

In [5]:
# use pretrained bert model and tokenizer
# `model_name` is reused when building ModelConfig, so the tokenizer and the
# model are configured from the same checkpoint name.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [6]:
# use pretrained wiki_vector model
# The loaded Wikipedia2Vec model is passed to the test data collator later on
# (and to the commented-out entity-vector build step).
model_file = '/data/suyinpei/wiki_vector.model'
wiki2vec = Wikipedia2Vec.load(model_file)

## Data processing

In [7]:
# --- Hyper-parameters -------------------------------------------------------
ratio = 0.8 # fraction of the data used for training; the remainder is used for validation
batch_size = 32 # batch size
en_pad_size = 12 # max entity number of one data
en_embd_dim = 100 # entity embedding dim

# --- Input files ------------------------------------------------------------
# Every input lives in one directory; change `data_dir` to repoint the whole
# notebook instead of editing each path individually.
data_dir = '/data/suyinpei'
idf_file = data_dir + '/idf_bigram5.txt'
entity_frep_file = data_dir + '/entity_frep.tsv'
data_root = data_dir + "/all_data_1028.tsv" # data: docid, text, entities, label
text_id_root = data_dir + "/text_ids_1028.pt" # data_size * 512
labels_root = data_dir + "/labels_1028.pt" # data_size
entity_id_root = data_dir + "/entity_ids_1028.pt" # data_size * 12
entity_length_root = data_dir + "/entity_length_1028.pt" # data_size
entity_score_root = data_dir + "/entity_score_1028.pt" # data_size * 3
entity_vector_root = data_dir + "/entity_vectors_1028.pt" # en_vocab_size * 100

In [8]:
# Build the data processor from the raw TSV path plus the paths of the five
# tensor files (text ids, labels, entity ids, entity lengths, entity scores).
# entity_vector_root is not passed here; it is given directly to
# load_entity_vector / build_entity_vector.
processor = DataProcess(data_root, text_id_root, labels_root, entity_id_root, entity_length_root, entity_score_root)

In [9]:
# # run this when using new data, build text index and label
# all_input_ids, labels = processor.encode_text(tokenizer)

In [10]:
# get entity vocab for predict
# Both lookup tables are passed to the test data collator later on (and to the
# commented-out entity-vector / entity-id build step).
entity_to_index, index_to_entity = processor.build_entity_vocab()

All Entity number:  7744598
Entity vocab size:  1600870


In [11]:
# Load the IDF table (unk_idf is presumably the fallback value for terms missing
# from it — TODO confirm). This cell must always be run: idf_dict / unk_idf are
# passed to the test data collator further down, not only to the optional
# entity-vector build step.
idf_dict, unk_idf = processor.load_idf(idf_file)

In [12]:
# # run this when using new data: build the entity vectors and entity ids
# build_entity_vector = processor.build_entity_vector(entity_to_index, index_to_entity, wiki2vec, idf_dict, unk_idf, en_embd_dim, entity_vector_root)
# all_entity_ids, all_entity_length = processor.build_entity_id(entity_to_index, index_to_entity, en_pad_size)

In [13]:
# Load the per-entity score dictionary from the entity frequency file. It is
# used below to compute the score mean/std and again when collating test data.
entity_score_dict = processor.load_entity_score_dict(entity_frep_file)

Entity Score vocab size:  667095


In [14]:
# get entity score mean and std
# The first return value is discarded; only the mean and std (printed by the
# call as 1x3 tensors) are kept, and both are later handed to the test data
# collator.
_, entity_score_mean, entity_score_std = processor.build_entity_score(entity_score_dict)

Entity score mean:  tensor([[0.4746, 8.1614, 0.2299]])
Entity score std:  tensor([[  6.5568, 219.4999,   2.1919]])


In [15]:
# The loaded entity embedding matrix is handed to ModelConfig below.
entity_vector = processor.load_entity_vector(entity_vector_root) # get pretrained entity_vector

Entity vector shape:  torch.Size([1600870, 100])


In [16]:
# `ratio` and `batch_size` come from the configuration cell; the call prints
# the number of batches in each dataloader.
train_dataloader, valid_dataloader = processor.load_data(ratio, batch_size) # build train/valid dataloader

Num of train_dataloader:  12715
Num of valid_dataloader:  3179


## Model

In [17]:
# Model configuration: BERT checkpoint name, pretrained entity vectors and the
# entity-branch sizes.
# NOTE(review): en_score_dim=3 presumably matches the three-component entity
# score statistics printed above — confirm against ModelConfig.
mconf = ModelConfig(model_name, entity_vector, en_embd_dim, en_hidden_size1=128, 
                    en_hidden_size2=128, en_score_dim=3, use_en_encoder=True)

In [18]:
# Instantiate the model from the configuration built in the previous cell.
model = Model(mconf)

In [19]:
# Restrict which parameters are trained. In the recorded run the call reports
# ~7.7M of ~269.6M parameters as still requiring grad. Which layers are fixed
# is decided inside Model.fix_layer_grad and is not visible from this notebook.
model.fix_layer_grad()

Model : all params: 269.601572M
Model : need grad params: 7.710796M


## Start Training

In [20]:
# Training configuration.
# The epoch count is defined once and reused for `final_tokens`, so the
# learning-rate decay horizon stays in sync when the number of epochs changes
# (the original repeated it as a literal `1`).
max_epochs = 1
tconf = TrainerConfig(max_epochs=max_epochs, learning_rate=6e-4, lr_decay=True,
                      # NOTE(review): the 32 here looks like batch_size (i.e. 200 warm-up batches) — confirm
                      warmup_tokens=32*200,
                      final_tokens=max_epochs*batch_size*len(train_dataloader),
                      num_workers=1, ckpt_path='../models/local-likely-model.pt')

max_epochs 1
learning_rate 0.0006
lr_decay True
warmup_tokens 6400
final_tokens 406880
num_workers 1
ckpt_path ../models/local-likely-model.pt


In [20]:
# Tie the model, both dataloaders and the training configuration together.
# The constructor prints the device it will use.
trainer = Trainer(model, train_dataloader, valid_dataloader, tconf)

use device: cuda


In [21]:
# start training
# Long-running cell: the single epoch in the recorded run took about 1h23m.
trainer.train()

epoch 1 iter 12714: train loss 0.00613. score 1.00000. lr 6.000000e-05: 100%|██████████| 12715/12715 [1:23:01<00:00,  2.55it/s]


## Test data collection

In [20]:
# Load trained weights into the existing model.
# map_location='cpu' deserialises the checkpoint on the CPU first, so loading
# does not depend on the GPU index the checkpoint was saved from;
# load_state_dict then copies the values into the model's own parameters.
# NOTE(review): this file name differs from the ckpt_path given to
# TrainerConfig ('../models/local-likely-model.pt') — confirm this is the
# intended checkpoint.
model.load_state_dict(torch.load('../models/local-likely-model-iter10.pt', map_location='cpu'))

<All keys matched successfully>

In [21]:
# Test set location and the batch size passed to the test data collator.
test_data_file = "/data/suyinpei/test_data_1k.tsv"
test_batch = 32

In [22]:
# Create the collator for the test TSV file.
test_data_collator = TestDataCollator(test_data_file)

In [23]:
# Build the test dataloader, reusing the tokenizer, entity vocab, wiki2vec
# model, IDF table and the entity-score mean/std prepared in the cells above.
# The call prints how long text and entity encoding took.
test_dataloader = test_data_collator.load_data(test_batch, tokenizer, entity_to_index, index_to_entity, wiki2vec, idf_dict, unk_idf, 
                                    en_pad_size, en_embd_dim, entity_score_dict, entity_score_mean, entity_score_std)

Encode text: Took 10.463449954986572 seconds
Encode entity: Took 2.084402322769165 seconds


## Predict

In [24]:
# Wrap the model and the test dataloader for inference. The constructor prints
# the device it will use.
predictor = Predictor(model, test_dataloader)

use device: cuda


In [25]:
# Time the full prediction pass. perf_counter() is a monotonic, high-resolution
# clock, so the measured interval cannot be distorted by system clock
# adjustments the way a time.time() difference can.
last_time = time.perf_counter()
model_predict = predictor.predict()
logger.info('Took {} seconds'.format(time.perf_counter() - last_time))

Test Progress: 100%|██████████| 32/32 [00:09<00:00,  3.23it/s]
10/30/2020 08:41:56 - Took 9.923027276992798 seconds


In [26]:
# Report how many predictions were produced, then write them one value per line.
logger.info('Predict number: {}'.format(model_predict.shape[0]))
# The context manager guarantees the file is flushed and closed even if a write
# fails. The original never closed the handle, so the tail of the output could
# remain buffered and the descriptor leaked for the life of the kernel.
with open('../data/model-predict.tsv','w') as fout:
    for prob in model_predict:
        fout.write('{}\n'.format(prob.item()))

10/30/2020 08:41:56 - Predict number: 999
