In [1]:
# Sub-directory (with trailing slash) identifying the question being processed.
question = 'Q1/'

# Run configuration shared by the cells of this notebook.
args = dict(
    # Directory the train/val/test CSVs and the keylist file are read from.
    input_file_dir='../snorkel-pseudo-label/' + question,
    # Directory the sentence-ranking CSV is written to.
    output_file_dir=question,
    # Pretrained encoder that is always used.
    modelname1='bert-base-uncased',
    # Second pretrained encoder, only used when "science" is True.
    modelname2='allenai/scibert_scivocab_uncased',
    # Width of the dense layer between the encoder output and the label layer.
    hidden_size=256,
    # Dropout probability applied to the pooled encoder output.
    dropout=0.2,
    # Batch size handed to every DataLoader.
    batch_size=10,
    # Upper bound on the number of training epochs.
    epochs=100,
    # NOTE(review): the optimizer set-up in this notebook hard-codes lr=2e-5
    # and does not read this value.
    lr=0.001,
    # Presumably an RNG seed -- TODO confirm where it is consumed.
    seed=1,
    # Consecutive epochs without validation improvement before stopping.
    early_stop_times=5,
    # True: combine modelname1 and modelname2; False: modelname1 only.
    science=True,
    keylist='keylist.txt',  # for each question's keylist
    sentence_ranking_file='Bert_NIH_task4.csv',  # file name to save sentence_ranking
    # CUDA device index passed to torch.cuda.set_device / model.to.
    device=0,
)

In [7]:
#!pip install tqdm boto3 requests regex sentencepiece sacremoses
#!pip install transformers

import os
import pandas as pd
from tqdm import tqdm
import re
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from torch.optim import Adam
from torch.autograd import Variable

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import label_ranking_loss
from nltk.tokenize import sent_tokenize,word_tokenize
from transformers import BertTokenizer,BertModel, BertConfig, AdamW

    
def inputid (inputsent,tokenizername):
    """Encode every sentence into a list of vocabulary ids.

    Args:
        inputsent: iterable of sentence strings.
        tokenizername: name or path given to ``BertTokenizer.from_pretrained``.

    Returns:
        One list of ids per sentence, in input order. Each sentence is split
        with NLTK ``word_tokenize``, cut to its first 500 word tokens, and
        encoded with special tokens added.
    """
    # NOTE(review): the tokenizer receives a list of NLTK word tokens rather
    # than the raw string -- confirm this is the intended input form for
    # `encode` (it affects how out-of-vocabulary / cased words are mapped).
    bert_tokenizer = BertTokenizer.from_pretrained(tokenizername)
    return [
        bert_tokenizer.encode(word_tokenize(sentence)[0:500], add_special_tokens = True)
        for sentence in tqdm(inputsent)
    ]

def maxwordnum(allsec):
    """Return the length of the longest sequence in ``allsec``.

    Args:
        allsec: iterable of sized sequences (e.g. lists of token ids).

    Returns:
        The largest ``len(...)`` found, as a NumPy integer scalar.

    Raises:
        ValueError: if ``allsec`` is empty.
    """
    lengths = np.array([len(sequence) for sequence in tqdm(allsec)])
    return max(lengths)

def dxseqpadding (seq,maxnu):
    """Right-pad every sequence with zeros up to ``maxnu`` entries.

    Args:
        seq: iterable of 1-D sequences of numbers.
        maxnu: target length; must be >= the length of every sequence,
            otherwise ``np.pad`` rejects the negative pad width.

    Returns:
        List of NumPy arrays, each of length ``maxnu``.
    """
    return [
        np.pad(row, (0, maxnu - len(row)), 'constant', constant_values=0)
        for row in tqdm(seq)
    ]

def attid (inputsent):
    """Build attention masks for zero-padded id sequences.

    Args:
        inputsent: iterable of id sequences in which 0 marks padding.

    Returns:
        One list per sequence holding 1 where the id is > 0 and 0 elsewhere.
    """
    return [
        [int(token_id > 0) for token_id in ids]
        for ids in tqdm(inputsent)
    ]

class _AlignedShuffleSampler(torch.utils.data.Sampler):
    """Shuffling sampler whose order depends only on (size, seed, epoch).

    Two instances created with the same size and seed yield the same
    permutation on their k-th iteration, so loaders built by separate
    ``dataloader`` calls stay row-aligned when they are zipped together.
    """

    def __init__(self, num_samples, seed=0):
        self.num_samples = num_samples
        self.seed = seed
        self.epoch = 0  # number of times iteration has been started

    def __iter__(self):
        # A fresh RandomState per epoch: a new order each epoch, yet identical
        # across sampler instances that are iterated in lockstep.
        rng = np.random.RandomState(self.seed + self.epoch)
        self.epoch += 1
        return iter(rng.permutation(self.num_samples).tolist())

    def __len__(self):
        return self.num_samples


def dataloader (trainval, test,args, seed=0):
    """Wrap the train / validation / test arrays in DataLoaders.

    Args:
        trainval: 6-tuple ``(train_inputs, train_labels, train_masks,
            val_inputs, val_labels, val_masks)``.
        test: 3-tuple ``(test_inputs, test_labels, test_masks)``.
        args: batch size (the name is kept for backward compatibility).
        seed: base seed of the training shuffle; calls that share a seed and
            a training-set size produce identical batch orders.

    Returns:
        ``(train_dataloader, validation_dataloader, test_dataloader)``; every
        batch is ``(inputs, masks, labels)``.

    The training loader is shuffled with a seeded sampler instead of
    ``shuffle=True``. The loops in this notebook zip the loader of one
    tokenizer with the loader of the other, and independent shuffles would
    pair different sentences in the same batch. Validation and test loaders
    keep the input order, so they are aligned too.
    """
    def _dataset(inputs, labels, masks):
        # Tensor order inside the dataset is (inputs, masks, labels).
        inputs = torch.tensor(inputs)
        labels = torch.tensor(labels)
        masks = torch.tensor(masks)
        return TensorDataset(inputs, masks, labels)

    train_data = _dataset(trainval[0], trainval[1], trainval[2])
    validation_data = _dataset(trainval[3], trainval[4], trainval[5])
    test_data = _dataset(test[0], test[1], test[2])

    train_dataloader = DataLoader(
        train_data,
        batch_size=args,
        sampler=_AlignedShuffleSampler(len(train_data), seed),
    )
    validation_dataloader = DataLoader(validation_data, batch_size=args, shuffle=False)
    test_dataloader = DataLoader(test_data, batch_size=args, shuffle=False)

    return (train_dataloader,validation_dataloader,test_dataloader)
    
def _bertrnn_loaders(args, id_splits, labels, maxnum):
    """Pad the (train, val, test) id lists to ``maxnum`` and build their loaders."""
    train_ids, val_ids, test_ids = [dxseqpadding(ids, maxnum) for ids in id_splits]
    train_att, val_att, test_att = attid(train_ids), attid(val_ids), attid(test_ids)
    trainlabel, vallabel, testlabel = labels
    trainval = (train_ids, trainlabel, train_att, val_ids, vallabel, val_att)
    test = (test_ids, testlabel, test_att)
    return dataloader(trainval, test, int(args['batch_size']))


def bertrnn_process (args,trainsent,valsent,testsent,trainlabel,vallabel,testlabel):
    """Tokenize, pad and batch the three data splits.

    Args:
        args: config dict; reads ``modelname1``, ``modelname2``, ``science``
            and ``batch_size``.
        trainsent, valsent, testsent: iterables of sentence strings.
        trainlabel, vallabel, testlabel: label arrays matching the sentences.

    Returns:
        ``(norloder, sciloder)``. ``norloder`` is the (train, val, test)
        loader triple built with ``modelname1``. ``sciloder`` is the same for
        ``modelname2`` when ``args['science']`` is true, otherwise ``[]``.

    All splits of all tokenizers are padded to the longest sequence found in
    any of them. Using the longest test sequence only would make ``np.pad``
    fail whenever a training or validation sentence is longer.
    """
    sentence_splits = (trainsent, valsent, testsent)
    labels = (trainlabel, vallabel, testlabel)

    modelnames = [args['modelname1']]
    if args['science']==True:
        modelnames.append(args['modelname2'])

    # encoded[m][s] is the list of id sequences of split s under tokenizer m.
    encoded = [
        [inputid(sentences, modelname) for sentences in sentence_splits]
        for modelname in modelnames
    ]
    maxnum = max(
        len(ids)
        for id_splits in encoded
        for split in id_splits
        for ids in split
    )

    loaders = [_bertrnn_loaders(args, id_splits, labels, maxnum) for id_splits in encoded]
    norloder = loaders[0]
    sciloder = loaders[1] if len(loaders) > 1 else []
    return norloder,sciloder


input_directory=args['input_file_dir']

# The keylist file holds the label column names as one comma-separated line.
# As before, the last line wins; blank lines (e.g. a trailing newline pair)
# are skipped so they cannot overwrite the real key list with [''].
with open(input_directory+args['keylist'], "r") as f:
    alist =f.read().splitlines()
nonblank_lines = [line for line in alist if line.strip()]
if not nonblank_lines:
    raise ValueError("no label keys found in " + input_directory+args['keylist'])
keylist=nonblank_lines[-1].split(',')


# Sentences come from the `sent` column of each split.
trainsent=pd.read_csv(input_directory+'trainsent.csv').sent.values
valsent=pd.read_csv(input_directory+'valsent.csv').sent.values
testsent=pd.read_csv(input_directory+'testsent.csv').sent.values
# Train/val labels: one column per key in keylist.
trainlabel=pd.read_csv(input_directory+'trainlabel.csv')[keylist].values
vallabel=pd.read_csv(input_directory+'vallabel.csv')[keylist].values
# The test "label" is the `newpid` column, carried through to the output CSV.
testlabel=pd.read_csv(input_directory+'testlabel.csv').newpid.values

num_labels=len(trainlabel[0])


# Per-label positive weight = (largest positive count) / (this label's count),
# so the most frequent label gets weight 1 and rarer labels get more.
startw=np.count_nonzero(trainlabel== 1, axis=0)
defualt=np.max(startw)
squarer = lambda t: defualt/t
# Clamp counts to 1: a label with no positive example would otherwise divide
# by zero and yield an infinite weight.
trainweight=np.array([squarer(xi) for xi in np.maximum(startw, 1)])



norloder,sciloder=bertrnn_process(args,trainsent,valsent,testsent,trainlabel,vallabel,testlabel)



class bert_rnn(nn.Module):
    """One or two BERT encoders followed by a two-layer classification head.

    The second output of each encoder (``outputs[1]``) is used as the sentence
    representation. With ``args['science']`` true, the outputs of ``modelname1``
    and ``modelname2`` are concatenated. The result goes through dropout, a
    ReLU-activated linear layer of ``args['hidden_size']`` units and a final
    linear layer with ``num_labels`` outputs (raw logits).

    Reads the module-level ``num_labels``.
    """

    def __init__(self, args):
        """Load the pretrained encoder(s) named in ``args`` and build the head."""
        super(bert_rnn, self).__init__()
        self.args = args
        self.emb1=BertModel.from_pretrained(self.args['modelname1'],num_labels = num_labels,output_attentions = False,output_hidden_states = False)
        self.emb1_size=self.emb1.config.hidden_size
        if self.args['science']==True:
            self.emb2=BertModel.from_pretrained(self.args['modelname2'],num_labels = num_labels,output_attentions = False,output_hidden_states = False)
            self.emb2_size=self.emb2.config.hidden_size
            self.emb_size=self.emb1_size+self.emb2_size
        else:
            self.emb_size=self.emb1_size
        self.lin1 = nn.Linear(self.emb_size, self.args['hidden_size'])
        self.dropout = nn.Dropout(args['dropout'])
        self.lin2 = nn.Linear(self.args['hidden_size'], num_labels)

    def forward(self,data1,mask1,data2=None,mask2=None):
        """Return label logits for a batch.

        Args:
            data1, mask1: token ids and attention mask for ``modelname1``.
            data2, mask2: token ids and attention mask for ``modelname2``;
                only read when ``args['science']`` is true.

        Returns:
            Tensor of logits with ``num_labels`` entries per example.
        """
        # The attention mask is passed in both modes so padded positions are
        # ignored; the single-encoder path used to omit it.
        pooler_output = self.emb1(data1,attention_mask=mask1)[1]
        if self.args['science']==True:
            pooler_output2 = self.emb2(data2,attention_mask=mask2)[1]
            pooler_output = torch.cat((pooler_output, pooler_output2), 1)

        hidden = nn.functional.relu(self.lin1(self.dropout(pooler_output)))
        out = self.lin2(hidden)
        return out


def train_(model,loss,optimizer,dataloaders1,epoch,dataloaders2=None):
    """Run one training epoch.

    Args:
        model: network called as ``model(data1, mask1, data2, mask2)``.
        loss: criterion called as ``loss(logits, float_targets)``.
        optimizer: optimizer stepping ``model``'s parameters.
        dataloaders1: loader yielding ``(ids, mask, target)`` batches.
        epoch: current epoch number.
        dataloaders2: second loader, zipped with ``dataloaders1`` when the
            module-level ``args['science']`` is true; targets are taken from
            ``dataloaders1``.

    Returns:
        ``(allloss, allbatch)``: the loss of every batch as a Python float,
        and ``batch_idx * epoch`` for every batch.
    """
    model.train()
    # Follow the model's device instead of assuming the current CUDA device.
    device = next(model.parameters()).device
    use_science = args['science']==True
    allloss=[]
    allbatch=[]
    print('Train')
    batches = zip(dataloaders1, dataloaders2) if use_science else dataloaders1
    for batch_idx, batch in tqdm(enumerate(batches)):
        if use_science:
            (data1, mask1, target1), (data2, mask2, _) = batch
            data2 = data2.to(device)
            mask2 = mask2.to(device)
        else:
            data1, mask1, target1 = batch
            data2 = mask2 = None
        data1 = data1.to(device)
        mask1 = mask1.to(device)
        target1 = target1.to(device)

        optimizer.zero_grad()
        out = model(data1,mask1,data2,mask2)
        lossall = loss(out,target1.float())
        # No-op for a scalar loss; reduces a per-element loss to a scalar.
        lossall = torch.sum(lossall)
        lossall.backward()
        optimizer.step()
        # Store a plain float: keeping the tensor would retain its graph.
        allloss.append(lossall.item())
        # NOTE(review): batch_idx*epoch is not a running step count (it
        # repeats across epochs) -- confirm what this axis is meant to be.
        allbatch.append(batch_idx*epoch)
    return allloss, allbatch
def val_(model,dataloaders1,dataloaders2=None):
    """Score the validation set.

    Args:
        model: network called as ``model(data1, mask1, data2, mask2)``.
        dataloaders1: loader yielding ``(ids, mask, target)`` batches.
        dataloaders2: second loader, zipped with ``dataloaders1`` when the
            module-level ``args['science']`` is true.

    Returns:
        ``(allout, alltarget)``: one list per batch. ``allout`` holds the
        per-example sigmoid scores and ``alltarget`` the per-example integer
        targets from ``dataloaders1``.
    """
    model.eval()
    # Follow the model's device instead of assuming the current CUDA device.
    device = next(model.parameters()).device
    use_science = args['science']==True
    allout=[]
    alltarget=[]
    print('Validation')
    batches = zip(dataloaders1, dataloaders2) if use_science else dataloaders1
    for batch_idx, batch in tqdm(enumerate(batches)):
        if use_science:
            (data1, mask1, target1), (data2, mask2, _) = batch
            data2 = data2.to(device)
            mask2 = mask2.to(device)
        else:
            data1, mask1, target1 = batch
            data2 = mask2 = None
        data1 = data1.to(device)
        mask1 = mask1.to(device)

        with torch.no_grad():
            # The single-encoder path used to call a misspelled `igmoid` and
            # pass the targets as model input; both paths now share this call.
            out = torch.sigmoid(model(data1,mask1,data2,mask2))
        out=out.cpu().detach().numpy()
        target=target1.to('cpu').numpy().astype(int)

        allout.append(list(out))
        alltarget.append(list(target))
    return allout,alltarget
def test_(model,dataloaders1,dataloaders2=None):
    """Score the test set.

    Args:
        model: network called as ``model(data1, mask1, data2, mask2)``.
        dataloaders1: loader yielding ``(ids, mask, target)`` batches.
        dataloaders2: second loader, zipped with ``dataloaders1`` when the
            module-level ``args['science']`` is true.

    Returns:
        ``(allout, alltarget)``: one NumPy array per batch. ``allout`` holds
        the sigmoid scores and ``alltarget`` the third batch element of
        ``dataloaders1`` cast to int.
    """
    model.eval()
    # Follow the model's device instead of assuming the current CUDA device.
    device = next(model.parameters()).device
    use_science = args['science']==True
    allout=[]
    alltarget=[]
    print('test')
    batches = zip(dataloaders1, dataloaders2) if use_science else dataloaders1
    for batch_idx, batch in tqdm(enumerate(batches)):
        if use_science:
            (data1, mask1, target1), (data2, mask2, _) = batch
            data2 = data2.to(device)
            mask2 = mask2.to(device)
        else:
            data1, mask1, target1 = batch
            data2 = mask2 = None
        data1 = data1.to(device)
        mask1 = mask1.to(device)

        with torch.no_grad():
            # torch.sigmoid replaces the deprecated nn.functional.sigmoid.
            out = torch.sigmoid(model(data1,mask1,data2,mask2))
        out=out.cpu().detach().numpy()
        target=target1.to('cpu').numpy().astype(int)

        allout.append(out)
        alltarget.append(target)
    return allout,alltarget
def result(result):
    """Flatten one nesting level: concatenate the items of every batch.

    Args:
        result: iterable of iterables (e.g. per-batch lists or arrays).

    Returns:
        A single list holding the inner items in their original order.
    """
    return [item for batch in result for item in batch]


# Seed the RNGs before the model is built so the head initialisation and
# dropout are reproducible from args['seed'].
torch.manual_seed(args['seed'])
np.random.seed(args['seed'])

torch.cuda.set_device(args["device"])
model= bert_rnn(args)
model=model.to(args["device"])
params=model.parameters()
optimizer = AdamW(params,lr = 2e-5, eps = 1e-8 )
# as_tensor accepts both the NumPy array and an already-converted tensor, so
# re-running this cell does not fail or warn.
trainweight=torch.as_tensor(trainweight).float().cuda()
loss = nn.BCEWithLogitsLoss(pos_weight=trainweight)

# The second set of loaders only exists in science mode (it is [] otherwise).
if args['science']==True:
    sci_train, sci_val, sci_test = sciloder
else:
    sci_train = sci_val = sci_test = None

alloss=[]
allbatch=[]
# Best validation label-ranking loss so far (lower is better). Starting at
# +inf means the first epoch always writes a prediction file.
dev_lrl=float('inf')
vallrl=[]
testpred=[]
testpid=[]
current_early_stop_times=0
for epoch in range(1, args['epochs'] + 1):
    epochloss, epochbatch=train_(model,loss,optimizer,norloder[0],epoch,dataloaders2=sci_train)
    alloss.append(epochloss)
    allbatch.append(epochbatch)
    allout,alltarget=val_(model,norloder[1],sci_val)
    allout1=result(allout)
    alltarget1=result(alltarget)
    epochlrl=label_ranking_loss(alltarget1,allout1)
    vallrl.append(epochlrl)
    if epochlrl < dev_lrl:
        # Validation improved: score the test set and overwrite the CSV.
        print("- new best lrl{}".format(epochlrl))
        allout,alltarget=test_(model,norloder[2],sci_test)
        allout=result(allout)
        alltarget=result(alltarget)
        allpd=pd.DataFrame(allout,columns=keylist)
        allpd['newpid']=alltarget
        # Create the directory the file is actually written to.
        if args['output_file_dir']:
            os.makedirs(args['output_file_dir'], exist_ok=True)
        allpd.to_csv(args['output_file_dir']+args['sentence_ranking_file'])
        dev_lrl = epochlrl
        current_early_stop_times = 0
    else:
        current_early_stop_times += 1
        print(current_early_stop_times)
    if current_early_stop_times >= args['early_stop_times'] :
        # Report the stop only when it happened, with the real patience count.
        print ("- early stopping at epoch {}: {} epochs without improvement".format(epoch, current_early_stop_times))
        break
else:
    print ("- finished all {} epochs without early stopping".format(args['epochs']))


I0413 18:10:06.574221 140255500728064 tokenization_utils.py:420] Model name 'allenai/scibert_scivocab_uncased' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1, bert-base-dutch-cased). Assuming 'allenai/scibert_scivocab_uncased' is a path, a model identifier, or url to a directory containing tokenizer files.
I0413 18:10:07.288724 140255500728064 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/vocab.txt from cache at

I0413 18:10:27.750638 140255500728064 modeling_utils.py:507] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin from cache at /root/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
I0413 18:10:30.207480 140255500728064 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/config.json from cache at /root/.cache/torch/transformers/199e28e62d2210c23d63625bd9eecc20cf72a156b29e2a540d4933af4f50bda1.79c4dd84b76a6991002b44cd58102c732c37aba834ad6401ddd6a89bd0ed809b
I0413 18:10:30.209053 140255500728064 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 8,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "earl

Train


66it [00:17,  3.79it/s]
2it [00:00, 13.03it/s]

Validation


17it [00:01, 13.50it/s]
2it [00:00, 13.06it/s]

- new best lrl0.1541125541125541
test


2574it [03:16, 13.10it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.82it/s]
2it [00:00, 13.06it/s]

Validation


17it [00:01, 13.45it/s]
2it [00:00, 11.79it/s]

- new best lrl0.14718614718614717
test


2574it [03:18, 12.99it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.82it/s]
2it [00:00, 13.17it/s]

Validation


17it [00:01, 13.41it/s]
0it [00:00, ?it/s]

1
Train


66it [00:17,  3.81it/s]
2it [00:00, 13.07it/s]

Validation


17it [00:01, 13.46it/s]
2it [00:00, 13.00it/s]

- new best lrl0.12294372294372291
test


2574it [03:16, 13.09it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.83it/s]
2it [00:00, 13.19it/s]

Validation


17it [00:01, 13.47it/s]
2it [00:00, 13.10it/s]

- new best lrl0.07012987012987011
test


2574it [03:16, 13.12it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.82it/s]
2it [00:00, 13.19it/s]

Validation


17it [00:01, 13.50it/s]
0it [00:00, ?it/s]

1
Train


66it [00:17,  3.82it/s]
2it [00:00, 13.20it/s]

Validation


17it [00:01, 13.50it/s]
2it [00:00, 13.06it/s]

- new best lrl0.06926406926406925
test


2574it [03:16, 13.10it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.81it/s]
2it [00:00, 13.18it/s]

Validation


17it [00:01, 13.46it/s]
2it [00:00, 13.09it/s]

- new best lrl0.06753246753246753
test


2574it [03:16, 13.08it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.82it/s]
2it [00:00, 13.19it/s]

Validation


17it [00:01, 13.49it/s]
2it [00:00, 13.06it/s]

- new best lrl0.06666666666666667
test


2574it [03:16, 13.10it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.82it/s]
2it [00:00, 13.20it/s]

Validation


17it [00:01, 13.49it/s]
2it [00:00, 13.09it/s]

- new best lrl0.062337662337662345
test


2574it [03:16, 13.11it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.82it/s]
2it [00:00, 13.21it/s]

Validation


17it [00:01, 13.47it/s]
0it [00:00, ?it/s]

1
Train


66it [00:17,  3.85it/s]
2it [00:00, 13.20it/s]

Validation


17it [00:01, 13.52it/s]
2it [00:00, 13.07it/s]

- new best lrl0.06233766233766233
test


2574it [03:15, 13.14it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.85it/s]
2it [00:00, 13.20it/s]

Validation


17it [00:01, 13.50it/s]
0it [00:00, ?it/s]

1
Train


66it [00:17,  3.84it/s]
2it [00:00, 13.22it/s]

Validation


17it [00:01, 13.51it/s]
0it [00:00, ?it/s]

2
Train


66it [00:17,  3.85it/s]
2it [00:00, 13.22it/s]

Validation


17it [00:01, 13.51it/s]
2it [00:00, 13.08it/s]

- new best lrl0.04675324675324675
test


2574it [03:16, 13.12it/s]
0it [00:00, ?it/s]

Train


66it [00:17,  3.85it/s]
2it [00:00, 13.22it/s]

Validation


17it [00:01, 13.54it/s]
0it [00:00, ?it/s]

1
Train


66it [00:17,  3.86it/s]
2it [00:00, 13.21it/s]

Validation


17it [00:01, 13.48it/s]
0it [00:00, ?it/s]

2
Train


66it [00:17,  3.85it/s]
2it [00:00, 13.22it/s]

Validation


17it [00:01, 13.49it/s]
0it [00:00, ?it/s]

3
Train


66it [00:17,  3.85it/s]
2it [00:00, 13.21it/s]

Validation


17it [00:01, 13.49it/s]
0it [00:00, ?it/s]

4
Train


66it [00:17,  3.85it/s]
2it [00:00, 13.17it/s]

Validation


17it [00:01, 13.48it/s]


5
- early stopping 20 epochs without improvement
