In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
import transformers
from transformers import BertModel, BertTokenizer
from transformers import get_linear_schedule_with_warmup
from torchtext.vocab import Vectors
from wikipedia2vec import Wikipedia2Vec
import collections
from collections import Counter
import csv 
import numpy as np
from tqdm import tqdm
import math
import warnings

In [2]:
warnings.filterwarnings('ignore')
# set random seed 
np.random.seed(0) 
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # confirm device
device

device(type='cuda')

In [3]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [4]:
# use pretrained bert model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [5]:
# use pretrained wiki_vector model
model_file = '/data/suyinpei/wiki_vector.model'
wiki2vec = Wikipedia2Vec.load(model_file)

In [13]:
class DataProcess():
    
    def __init__(self, data_root, text_id_root, labels_root, entity_id_root, entity_length_root, entity_score_root):
        self.data_root = data_root
        self.text_id_root = text_id_root
        self.labels_root = labels_root
        self.entity_id_root = entity_id_root
        self.entity_length_root = entity_length_root
        self.entity_score_root = entity_score_root
    
    def prepare_data(self):
        docid_list = []
        text_list = []
        entity_list = []
        label_list = []
        with open(self.data_root, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for line in reader:
                docid_list.append(line[0])
                text_list.append(line[1])
                entity_list.append(list(set(line[2].split('|'))))
                label_list.append(int(line[3]))
        return text_list, entity_list, label_list

    # Function to get token ids for a list of texts 
    def encode_text(self):
        text_list, _, label_list = self.prepare_data()
        all_input_ids = []    
        num = 0
        for text in text_list:
            num += 1
            if num % 10000 == 0:
                print(num)
            input_ids = tokenizer.encode(
                            text,                      
                            add_special_tokens = True,             
                            truncation=True,
                            padding = 'max_length',     
                            return_tensors = 'pt'       
                       )
            all_input_ids.append(input_ids)    
        all_input_ids = torch.cat(all_input_ids, dim=0)
        labels = torch.tensor(label_list, dtype=torch.float)
        # Save tensor
        torch.save(all_input_ids, self.text_id_root)
        torch.save(labels,self.labels_root)
        print("Saved success!")
        return all_input_ids, labels
    
    # Function to build entity vocab
    def encode_entity(self):
        _, entity_list, _ = self.prepare_data()
        # get all entity
        entity_list_all = [en for entity in entity_list for en in entity]
        print("All Entity number: ", len(entity_list_all))
        # build entity vocab
        entity_vocab = collections.OrderedDict(Counter(entity_list_all))
        entity_list_uniq = [entity for entity in entity_vocab.keys()]
        entity_to_index = {entity : i+2 for i, entity in enumerate(entity_list_uniq)}
        entity_to_index['<unk>'] = 0
        entity_to_index['<pad>'] = 1
        entity_to_index = collections.OrderedDict(sorted(entity_to_index.items(), key=lambda entity_to_index: entity_to_index[1]))
        index_to_entity = [entity for i, entity in enumerate(entity_to_index)]
        print("Entity vocab size: ", len(entity_to_index))
        return entity_to_index, index_to_entity
    
    # Function to build entity vocab with pretrained vector
    def build_entity_vector(self, en_embd_dim, idf_file, entity_vector_root):
        entity_to_index, index_to_entity = self.encode_entity()
        idf_dict, UNK_IDF = self.load_idf(idf_file)
        # build entity vector
        idx_to_vector=[]
        for entity in entity_to_index.keys():
            entity_item = wiki2vec.get_entity(entity)
            if entity_item != None:
                idx_to_vector.append(torch.tensor(self.en_vector_norm(wiki2vec.get_vector(entity_item))).float())
            else:
                words = entity.lower().split()
                word_vectors = []
                weights = []
                for w in words:
                    try:
                        vector = wiki2vec.get_word_vector(w.lower())
                    except KeyError:
                        continue
                    word_vectors.append(vector)
                    idf = idf_dict.get(w, UNK_IDF)
                    weights.append(idf)
                if len(word_vectors) == 0:
                    idx_to_vector.append(torch.zeros(en_embd_dim))
                else:
                    word_vectors = np.array(word_vectors)
                    weights = np.expand_dims(np.array(weights), axis=1)
                    idx_to_vector.append(torch.tensor(self.en_vector_norm(np.sum(word_vectors * weights, axis=0))).float())
        entity_vector = torch.stack(idx_to_vector)
        torch.save(entity_vector, entity_vector_root)
        print("Saved success!")
        return entity_vector
    
    # Function to get token ids for a list of entities
    def build_entity_id(self, en_pad_size):
        # build entity index
        _, entity_list, _ = self.prepare_data()
        entity_to_index, index_to_entity = self.encode_entity()
        entity_score_dict = self.load_entity_score_dict()
        all_entity_ids = []
        all_entity_length = []
        all_entity_score = []
        for entities in entity_list:
            entity_ids = [entity_to_index.get(entity, entity_to_index["<unk>"]) for entity in entities][:en_pad_size]
            for i in range(en_pad_size - len(entity_ids)):
                entity_ids.append(entity_to_index["<pad>"])
            all_entity_ids.append(entity_ids)
            # record entity length
            all_entity_length.append(len(entities))
            # build entity score
            entity_score = []
            score = 1
            for en in entities:
                if en in entity_score_dict:
                    en_score = float(entity_score_dict[en])
                    score *= en_score
            score = math.log(score,10)
            entity_score.append(score)
            if score >= 0:
                entity_score.append(score**2)
                entity_score.append(score**0.5)
            else:
                entity_score.append(-score**2)
                entity_score.append(-(abs(score)**0.5))
            all_entity_score.append(entity_score)
        all_entity_ids = torch.tensor(all_entity_ids)
        all_entity_length = torch.tensor(all_entity_length)
        all_entity_score = torch.tensor(all_entity_score)
        torch.save(all_entity_ids, self.entity_id_root)
        torch.save(all_entity_length, self.entity_length_root)
        torch.save(self.en_score_norm(all_entity_score), self.entity_score_root)
        print("Saved success!")
        return all_entity_ids, all_entity_length, all_entity_score
    
    # load idf file
    def load_idf(self, idf_file):
        ret = {}
        with open(idf_file) as f:
            for line in f:
                phrase, count, idf = line.split('\t')
                idf = float(idf)
                ret[phrase] = idf
        return ret, ret['<UNK>']
    
    def load_entity_score_dict(self):
        entity_score_dict = {}
        with open("entity_frep.tsv") as f:
            for line in f:
                entity, c1, c2, freq = line.split('\t')
                c1 = int(c1)
                c2 = int(c2)
                if c1 == 0 or c2 == 0:
                    c1 += 1
                    c2 += 1
                if c1 + c2 > 10:
                    entity_score_dict[entity] = freq
        print("Entity Score vocab size: ", len(entity_score_dict))
        return entity_score_dict
        
    # normlize entity vector
    def en_vector_norm(self, vector):
        norm = np.linalg.norm(vector)
        return vector / (norm + 1e-9)
    
    def en_score_norm(self,x):
        mean = x.mean(dim=0,keepdim=True)
        std = x.std(dim=0, unbiased=False,keepdim=True)
        x_norm = (x - mean)/std
        return x_norm
    
    # build dataset and dataloader
    def load_data(self, ratio, batch_size):
        all_input_ids = torch.load(self.text_id_root)
        all_entity_ids = torch.load(self.entity_id_root)
        all_entity_length = torch.load(self.entity_length_root)
        all_entity_score = torch.load(self.entity_score_root)
        labels = torch.load(self.labels_root)
        # Split data into train and validation
        dataset = TensorDataset(all_input_ids, all_entity_ids, all_entity_length, all_entity_score, labels)
        train_size = int(ratio * len(dataset))
        valid_size = len(dataset) - train_size
        train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

        # Create train and validation dataloaders
        train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
        valid_dataloader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = False)

        return train_dataloader, valid_dataloader

In [14]:
ratio = 0.8 # ratio of train data to valid data
batch_size = 32 # batch size
en_pad_size = 12 # max entity number of one data
en_embd_dim = 100 # entity embedding dim
idf_file = '/data/suyinpei/idf_bigram5.txt'
data_root = "/data/suyinpei/all_data_1026.tsv" # data: docid, text, entities, label
text_id_root = "data/text_ids_1026.pt" # data_size * 512
labels_root = "data/labels_1026.pt" # data_size
entity_id_root = "data/entity_ids_1026.pt" # data_size * 12
entity_length_root = "data/entity_length_1026.pt" # data_size
entity_score_root = "data/entity_score_1026.pt" # data_size * 3
entity_vector_root = "data/entity_vectors_1026.pt" # en_vocab_size * 100

In [15]:
processor = DataProcess(data_root, text_id_root, labels_root, entity_id_root, entity_length_root, entity_score_root)

In [16]:
# # run this when using new data
# all_input_ids, labels = processor.encode_text()

In [10]:
# get entity vocab for predict
entity_to_index, index_to_entity = processor.encode_entity()

All Entity number:  7684627
Entity vocab size:  1586333


In [11]:
# get entity score dict 
entity_score_dict = processor.load_entity_score_dict()

Entity Score vocab size:  196714


In [17]:
# # run this when use new data
# build_entity_vector = processor.build_entity_vector(en_embd_dim, idf_file, entity_vector_root)
all_entity_ids, all_entity_length, all_entity_score = processor.build_entity_id(en_pad_size)

All Entity number:  7684627
Entity vocab size:  1586333
Entity Score vocab size:  196714
Saved success!


In [18]:
entity_vector = torch.load(entity_vector_root) # get pretrained entity_vector

In [19]:
train_dataloader, valid_dataloader = processor.load_data(ratio, batch_size) # build train/valid dataloader

In [20]:
print("Num of train_dataloader: ", len(train_dataloader))
print("Num of valid_dataloader: ", len(valid_dataloader))

Num of train_dataloader:  12646
Num of valid_dataloader:  3162


In [29]:
class Model(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.bert = BertModel.from_pretrained(config.model_name)
        self.ln = nn.LayerNorm(self.bert.pooler.dense.weight.shape[0], eps=1e-12)
        self.use_en_encoder = config.use_en_encoder
        if self.use_en_encoder: # if use entity infomation
            self.en_encoder = EntityEncoder(config)
            self.dropout = nn.Dropout(config.dropout_prob)
            self.fc = nn.Linear(self.bert.pooler.dense.weight.shape[0]+self.en_encoder.mlp[0].weight.shape[0]+config.en_score_dim, config.output_size)
        else:
            self.dropout = nn.Dropout(config.dropout_prob)
            self.fc = nn.Linear(self.bert.pooler.dense.weight.shape[0], config.output_size)
        
    def configure_optimizers(self, train_config):
#         # use weight decay to optimize
#         param_optimizer = list(model.named_parameters())  
#         no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
#         optimizer_grouped_parameters = [
#                 {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': train_config.weight_decay},
#                 {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
#         optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=train_config.learning_rate, betas=train_config.betas)
        optimizer = torch.optim.AdamW(self.parameters(), lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, input_ids, entity_ids, entity_length, entity_score, labels=None, token_type_ids=None, attention_mask=None):
        _, bert_output = self.bert(input_ids, token_type_ids, attention_mask,)
        bert_output = self.ln(bert_output)
        if self.use_en_encoder: # if use entity infomation
            en_encoder_output = self.en_encoder(entity_ids, entity_length, entity_score)
            x = torch.cat((bert_output,  en_encoder_output),dim=1)
        else:
            x = bert_output
        x = self.dropout(x)
        y_pred = self.fc(x).squeeze(-1)
        if labels is not None:
            loss = F.binary_cross_entropy_with_logits(y_pred, labels)
            return y_pred, loss
        else:
            return y_pred 

In [30]:
class EntityEncoder(nn.Module):
    """ Encode entities to generate single presentation """

    def __init__(self, config):
        super().__init__()
        self.en_embeddings = nn.Embedding.from_pretrained(config.entity_vector,freeze=True)

        self.ln1 = nn.LayerNorm(config.en_embd_dim, eps=1e-12)
        self.dropout = nn.Dropout(config.dropout_prob)
        self.mlp = nn.Sequential(
            nn.Linear(config.en_embd_dim, config.en_hidden_size1),
            nn.GELU(),
            nn.Linear(config.en_hidden_size1, config.en_hidden_size2),
#             nn.Dropout(config.dropout_prob), # maybe useful
        )
        self.ln2 = nn.LayerNorm(config.en_hidden_size1, eps=1e-12)

    def forward(self, entity_ids, entity_length, entity_score):
        embeddings = self.en_embeddings(entity_ids)
        
        x = self.ln1(embeddings)
        x = self.dropout(x)
        
        x = self.mlp(embeddings) # batch_size * entity_num * embd_dim
        x = self.single_pool(x, entity_length) #batch_size * embd_dim
        x = self.ln2(x)
        x = torch.cat((x, entity_score),dim=1)
        
        return x
    
    # do this because of different entity length
    def single_pool(self, x, x_length):
        all_pool_out = []
        for i in range(x.shape[0]):
            if x_length[i] == 0:
                 x_length[i] += 1
            single_data = x[i][:x_length[i]].unsqueeze(0)
            pool_out = F.max_pool2d(single_data, (single_data.shape[1], 1)).squeeze(1)
            all_pool_out.append(pool_out)
        x = torch.cat(all_pool_out,dim=0)
        return x

In [31]:
class Trainer:

    def __init__(self, model, train_loader, test_loader, config):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.config = config

        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def save_checkpoint(self):
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        logger.info("saving %s", self.config.ckpt_path)
        torch.save(raw_model.state_dict(), self.config.ckpt_path)
        
    def binary_accuracy(self, preds, y):
        rounded_preds = torch.round(torch.sigmoid(preds))
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)
        return acc

    def train(self):
        model, config = self.model, self.config
        raw_model = model.module if hasattr(self.model, "module") else model
        optimizer = raw_model.configure_optimizers(config)

        def run_epoch(split):
            is_train = split == 'train'
            model.train(is_train)
            loader = self.train_loader if is_train else self.test_loader
            
            losses = []
            all_y = []
            all_y_pred = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (text_ids, entity_ids, entity_length, entity_score, y) in pbar:
                # place data on the correct device
                text_ids = text_ids.to(self.device)
                entity_ids = entity_ids.to(device)
                entity_length = entity_length.to(device)
                entity_score = entity_score.to(device)
                y = y.to(self.device)
                # forward the model
                with torch.set_grad_enabled(is_train):
                    y_pred, loss = model(text_ids, entity_ids, entity_length, entity_score, labels=y)
                    loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())
                    step_score = self.binary_accuracy(y_pred, y)
                    all_y.extend(y)
                    all_y_pred.extend(y_pred)
                
                if is_train:

                    # backprop and update the parameters
                    model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                    optimizer.step()

                    # decay the learning rate based on our progress
                    if config.lr_decay:
                        self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)
                        if self.tokens < config.warmup_tokens:
                            # linear warmup
                            lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                        else:
                            # cosine learning rate decay
                            progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                        lr = config.learning_rate * lr_mult
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                    else:
                        lr = config.learning_rate

                    # report progress
                    pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. score {step_score:.5f}. lr {lr:e}")

            if not is_train:
                test_loss = float(np.mean(losses))
                all_y = torch.stack(all_y, dim=0)
                all_y_pred = torch.stack(all_y_pred, dim=0)
                test_score = self.binary_accuracy(all_y_pred, all_y)
                logger.info("test loss: %f", test_loss)
                logger.info("test score: %f", test_score)
                return test_loss

        self.tokens = 0 # counter used for learning rate decay
        best_loss = float('inf')
        for epoch in range(config.max_epochs):

            run_epoch('train')
            if self.test_loader is not None:
                test_loss = run_epoch('test')

            # supports early stopping based on the test loss, or just save always if no test set is provided
            good_model = self.test_loader is None or test_loss < best_loss
            if self.config.ckpt_path is not None and good_model:
                best_loss = test_loss
                self.save_checkpoint()

In [32]:
class ModelConfig:
    """ base mdoel config """
    output_size = 1 # local(1) or non-local(0)
    dropout_prob = 0.1
    
    def __init__(self, model_name, entity_vector, en_embd_dim, en_hidden_size1, en_hidden_size2, 
                 en_score_dim, **kwargs):
        self.model_name = model_name
        self.entity_vector = entity_vector
        self.en_embd_dim = en_embd_dim
        self.en_hidden_size1 = en_hidden_size1
        self.en_hidden_size2 = en_hidden_size2
        self.en_score_dim = en_score_dim
        for k, v in kwargs.items():
            setattr(self, k, v)

In [33]:
class TrainerConfig:
    # optimization parameters
    max_epochs = 10
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1 # may useful optimize method
    # learning rate decay params: linear warmup followed by cosine decay to 10% of original
    lr_decay = False # optimize method
    warmup_tokens = 375e6 # use this to train model from a lower learning rate
    final_tokens = 260e9 # all tokens during whole training process
    # checkpoint settings
    ckpt_path = 'local-likely-model.pt' # save model path
    num_workers = 0 # for DataLoader

    def __init__(self, **kwargs):
        for k,v in kwargs.items():
            print(k,v)
            setattr(self, k, v)

In [41]:
mconf = ModelConfig(model_name, entity_vector, en_embd_dim, en_hidden_size1=128, 
                    en_hidden_size2=128, en_score_dim=3, use_en_encoder=True)

In [42]:
model = Model(mconf)

In [43]:
# # print model structure
# model

In [44]:
# do not train bert embedding layer
for par in model.bert.embeddings.parameters(): 
    par.requires_grad = False
# only train last(11th) bert encode layer
for par in model.bert.encoder.layer[:11].parameters(): 
    par.requires_grad = False

In [45]:
# print model all parameters and parameters need training
print('{} : all params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters()) / 1000 / 1000))
print('{} : need grad params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000 / 1000))

Model : all params: 268.147872M
Model : need grad params: 7.710796M


In [46]:
tconf = TrainerConfig(max_epochs=1, learning_rate=6e-4, lr_decay=True, 
                      warmup_tokens=32*200, final_tokens=1*batch_size*len(train_dataloader),
                      num_workers=1)

max_epochs 1
learning_rate 0.0006
lr_decay True
warmup_tokens 6400
final_tokens 404672
num_workers 1


In [47]:
trainer = Trainer(model, train_dataloader, valid_dataloader, tconf)

In [None]:
# start training
trainer.train()

epoch 1 iter 13: train loss 0.65636. score 0.62500. lr 4.200000e-05:   0%|          | 13/12646 [00:05<1:21:21,  2.59it/s]

In [78]:
class Predict:
    
    def __init__(self, model):
        self.model = model.to(device)
    
    def predict(self, text, entities):
        input_ids = tokenizer.encode(
                        text,                      
                        add_special_tokens = True,             
                        truncation=True,
                        padding = 'max_length',     
                        return_tensors = 'pt'       
                   ).to(device)
        
        entity_ids = [entity_to_index.get(entity, entity_to_index["<unk>"]) for entity in entities][:en_pad_size]
        for i in range(en_pad_size - len(entity_ids)):
            entity_ids.append(entity_to_index["<pad>"])
        entity_ids = torch.tensor(entity_ids).unsqueeze(0).to(device)
        entity_length = torch.tensor(len(entities)).unsqueeze(0).to(device)    
        # build entity score
        entity_score = []
        score = 1
        for en in entities:
            if en in entity_score_dict:
                en_score = float(entity_score_dict[en])
                score *= en_score
        score = math.log(10000,10)
        entity_score.append(score)
        if score >= 0:
            entity_score.append(score**2)
            entity_score.append(score**0.5)
        else:
            entity_score.append(-score**2)
            entity_score.append(-(abs(score)**0.5))
        entity_score = torch.tensor(entity_score).unsqueeze(0).to(device)    
        
        self.model.eval()
        pred = torch.sigmoid(self.model(input_ids, entity_ids, entity_length, entity_score)[0])
        return pred.item()
    
    def count_acc(self, text_list, local):
        result = []
        for text in text_list:
            result.append(self.predict(text))
        result = torch.tensor(result, dtype = torch.float)
        if local:
            acc = sum(result > 0.5).item()/len(result)
        else:
            acc = sum(result < 0.5).item()/len(result)
        return result, acc
        

In [79]:
# model.load_state_dict(torch.load("local-likely-model.pt", map_location = device))

In [80]:
predict = Predict(model)

In [41]:
test_text_list = []
test_entity_list = []
with open('data/test_data_1k.tsv') as f:
    reader= csv.reader(f, delimiter='\t')
    for line in reader:
        test_text_list.append(line[1])
        test_entity_list.append(line[2].split('|'))

In [42]:
model_predict = []
for text, entities in zip(test_text_list,test_entity_list):
    prob = predict.predict(text,entities)
    model_predict.append(prob)

In [53]:
print(len(model_predict))
fout = open('model-predict.tsv','w')
for prob in model_predict:
    fout.write('{}\n'.format(prob))

999


In [83]:
index = 31
test_text_list[index]

"Keeneland confirms case of COVID at Thoroughbred training center . A stable worker at Keeneland 's training track on Paris Pike tested positive for the coronavirus , the track confirmed . The woman , who is a hot walker -LRB- someone who walks horses as they cool down -RRB- , works at the Thoroughbred Center , said Vince Gabbert , Keeneland vice president and COO . She tested positive about three weeks ago and has recovered and returned to work . One of her family members who also works in the training center barns also self-quarantined . Gabbert said that another stable employee at Keeneland 's Rice Road barn area self-quarantined after a family member tested positive for COVID-19 . In both cases , the employees work for trainers who are part of the Lexington racetrack 's resident barn population and neither had sent horses out of state to race . A small number of horses have shipped to the handful of tracks that are still conducting live racing and returned uneventfully , Gabbert sa

In [84]:
predict.predict(test_text_list[index], test_entity_list[index])

0.631829559803009