In [1]:
%cd /home/mtech1/19CS60R28/susmit/DocRed_hongwang600/DocRed
import sys
import math
import nltk
import numpy as np
import os
import json
import copy
import random
import time
import datetime

# import seaborn as sns
from nltk.tokenize import WordPunctTokenizer
from pytorch_transformers import *
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F
from tqdm import tqdm
from models.bert import Bert
%cd ..

%matplotlib inline
%config InlineBackend.figure_format='retina'
torch.backends.cudnn.deterministic = True

RANDOM_SEED = 42

/home/mtech1/19CS60R28/susmit/DocRed_hongwang600/DocRed
/home/mtech1/19CS60R28/susmit/DocRed_hongwang600


### Configuration

In [2]:
in_path='data'
out_path='prepro_data'
os.environ['CUDA_VISIBLE_DEVICES']="0,1"
PRE_TRAINED_MODEL_NAME='bert-base-uncased'
model_name='bert'
BATCH_SIZE=4
EPOCH = 10

n_gpu = torch.cuda.device_count()
if not os.path.exists(out_path):
    os.mkdir(out_path)

MAX_LEN=512
SEP='[SEP]'
MASK = '[MASK]'
CLS = "[CLS]"
bert = Bert(BertModel, PRE_TRAINED_MODEL_NAME)

# Data

In [3]:
train_annotated_file_name = os.path.join(in_path, 'train_annotated.json')
dev_file_name = os.path.join(in_path, 'dev.json')
test_file_name = os.path.join(in_path, 'test.json')

In [4]:

rel2id = json.load(open(os.path.join(out_path, 'rel2id.json'), "r"))
id2rel = {v:u for u,v in rel2id.items()}
json.dump(id2rel, open(os.path.join(out_path, 'id2rel.json'), "w"))

In [5]:
def logging(*msg, print_=True, log_=True):
    for i in range(0,len(msg)):
        if(i==len(msg)-1):
            end='\n'
        else:
            end=' '
        if print_:
            print(msg[i],end=end)
        if log_:
            with open(os.path.join(os.path.join("log", model_name+'.txt')), 'a+') as f_log:
                f_log.write(str(msg[i])+end)
                f_log.close()

In [6]:


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [7]:
def set_random_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_random_seeds(RANDOM_SEED)

In [8]:
def load_ckp(checkpoint_fpath, model, optimizer):
    try:
        checkpoint = torch.load(checkpoint_fpath)
    except OSError as e:
        return -1,-1,-1.0,0,model,optimizer 
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['best_epoch_idx'], checkpoint['best_epoch_seed'], checkpoint['best_dev_acc'], checkpoint['epoch'], model, optimizer


# Preprocessing

In [9]:
#to do add union of evidences=> done
# for sent pad use 2

def preprocess(data_file_name, max_length = 512, is_training = True, suffix=''):
    ori_data=json.load(open(data_file_name))[0:100]
    max_sent_count=0#maximum number of sentences in a doc across the dataset
    list_sent_ids=[]
    list_attention=[]#this stores attention of docs
    list_sent_mask=[]#this will be used in the batch multliplication for getting the embeddings of each sentence
    # (len(list_sent_ids),max_sent_count,max_length)
    
    labels=[]
    i=0
    for doc in ori_data:
        i=i+1
        sys.stdout.write("\r%d/%d docs"%(i,len(ori_data)))
        # this dict is used to take care of multiple relations with same head and tail
        head_tail_index={}
        max_sent_count=max(max_sent_count,len(doc['sents']))
        for label in doc['labels']:
            idx_list=[]
            head=doc['vertexSet'][label['h']]
            tail=doc['vertexSet'][label['t']]
            if (label['h'],label['t']) in head_tail_index:
                labels[head_tail_index[(label['h'],label['t'])]]+=label['evidence']
                continue
            else:
                head_tail_index[(label['h'],label['t'])]=len(list_sent_ids)
            for entity in head:
                if (entity['sent_id'],entity['pos'][0],'[unused0]') not in idx_list:
                    idx_list.append((entity['sent_id'],entity['pos'][0],'[unused0]'))
                if (entity['sent_id'],entity['pos'][1]+1,'[unused1]') not in idx_list:
                    idx_list.append((entity['sent_id'],entity['pos'][1]+1,'[unused1]'))
            for entity in tail:
                if (entity['sent_id'],entity['pos'][0],'[unused2]') not in idx_list:
                    idx_list.append((entity['sent_id'],entity['pos'][0],'[unused2]'))
                if (entity['sent_id'],entity['pos'][1]+1,'[unused3]') not in idx_list:
                    idx_list.append((entity['sent_id'],entity['pos'][1]+1,'[unused3]'))
            idx_list.sort(key=lambda tup:(tup[0],tup[1]),reverse=True)
            temp_doc=copy.deepcopy(doc)
            for loc in idx_list:
                temp_doc['sents'][loc[0]].insert(loc[1],loc[2])

            sent_combine=[]
            for sent in temp_doc['sents']:
                sent_combine=sent_combine+sent
            sent_ids,sent_attention_mask,sent_start_ids=bert.subword_tokenize_to_ids(sent_combine)
            list_sent_ids.append(sent_ids[0])
            list_attention.append(sent_attention_mask[0])
            labels.append(label['evidence'])
            
            
            sent_mask=[]
            l=1# we start from index 1 because we skip CLS token
            for sent in temp_doc['sents']:
                sent_mask.append([0]*max_length)
                j=l
#                 print(sent)
#                 print("\n")
                while(j<min(max_length-2,l+len(sent))):
                    sent_mask[-1][j]=1
                    j+=1
                l+=len(sent)
                if(l>=max_length-2):
                    break
            list_sent_mask.append(sent_mask)
            
    logging('')
    evi_labels = np.zeros((len(labels),max_sent_count),dtype = np.int64)
    for i in range(len(labels)):
        evi_labels[i][labels[i]]=1 #if evidence present then 1
    print("max_sent_cout",max_sent_count)
    for i in range(len(list_sent_mask)):
        # the label for pad sentence is 2
        evi_labels[i][len(list_sent_mask[i]):max_sent_count]=2
        # to pad sentences with arrays of 1s
        list_sent_mask[i]=list_sent_mask[i]+[[1]*max_length]*(max_sent_count-len(list_sent_mask[i]))
    list_sent_ids=np.asarray(list_sent_ids,dtype=np.int64)
    list_attention=np.asarray(list_attention,dtype=np.int64)
    list_sent_mask=np.asarray(list_sent_mask,dtype=np.int64)
    
    logging("Started saving")
    
    logging("Number of instances: {}".format(list_sent_ids.shape[0]))
    np.save(os.path.join(out_path,suffix+'_sent_ids.npy'),list_sent_ids)
    np.save(os.path.join(out_path,suffix+'_sent_attention.npy'),list_attention)
    np.save(os.path.join(out_path,suffix+'_sent_mask.npy'),list_sent_mask)
    np.save(os.path.join(out_path,suffix+'_evidence_labels.npy'),evi_labels)
    logging("completed saving\n")
    


In [10]:
%%timeit -n1 -r1
preprocess(train_annotated_file_name, max_length = 512, is_training = False, suffix='train')
preprocess(dev_file_name, max_length = 512, is_training = False, suffix='dev')

100/100 docs
max_sent_cout 19
Started saving
Number of instances: 1133
completed saving

100/100 docs
max_sent_cout 18
Started saving
Number of instances: 1246
completed saving

48.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Data loading

In [11]:
class Docred_dataset(Dataset):
    def __init__(self,sent_ids,sent_attention,sent_mask,evi_target,max_len=512):
        self.sent_ids=torch.from_numpy(sent_ids)
        self.sent_attention=torch.from_numpy(sent_attention)
        self.sent_mask=torch.from_numpy(sent_mask)
        self.evi_target=torch.from_numpy(evi_target)
        self.no_samples=evi_target.shape[0]
    def __len__(self):
        return evi_target.shape[0]
    def __getitem__(self,index):
        return {
            'sent_ids':self.sent_ids[index],
            'sent_attention':self.sent_attention[index],
            'sent_mask':self.sent_mask[index],
            'targets':self.evi_target[index]
        }

In [12]:
sent_ids=np.load(os.path.join(out_path,'train'+'_sent_ids.npy'))
sent_attention=np.load(os.path.join(out_path,'train'+'_sent_attention.npy'))
sent_mask=np.load(os.path.join(out_path,'train'+'_sent_mask.npy'))
evi_target=np.load(os.path.join(out_path,'train'+'_evidence_labels.npy'))

In [13]:


dataset=Docred_dataset(sent_ids=sent_ids,sent_attention=sent_attention,sent_mask=sent_mask,evi_target=evi_target,max_len=MAX_LEN)
dataloader=DataLoader(dataset=dataset, batch_size=BATCH_SIZE,num_workers=2)
# def create_data_loader(max_len,batch_size):
#     ds=Docred(sent_ids,sent_mask,evi_target,max_len=512)
#     return DataLoader(ds,batch_size=batch_size,num_workers=4)
# train_data_loader=create_data_loader(max_len=MAX_LEN,batch_size=BATCH_SIZE)

### Defining Model

In [14]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device

In [15]:
# sent_mask = (b,k,t)
# k=max sent length
# output =(b,t,h)
# torch.bmm
# (b,k,t)*(b,t,h)/(len(sent))==(b,k,h)
# torch.sum(dim)  torch.sum(input, dim, keepdim=False, *, dtype=None) → Tensor


class EvidenceClassifier(nn.Module):
    def __init__(self):
        super(EvidenceClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.dense = nn.Linear(self.bert.config.hidden_size, 3)
        self.logsoftmax = nn.LogSoftmax(dim=-1)
        self.softmax = nn.Softmax(dim=-1)
    def forward(self, input_ids, attention_mask,sent_mask,is_training=False):
        last_hidden_state, pooled_output = self.bert(input_ids=input_ids,attention_mask=attention_mask)
        sent_mask=sent_mask.float()
        output=torch.bmm(sent_mask,last_hidden_state)/sent_mask.sum(axis=2)[...,None]
        logits = self.dense(output)
        if is_training:
            return self.logsoftmax(logits)
        else:
            return self.softmax(logits)

        
#         #BMM
#         2
#             10->3sent
#             5->2sent
#             sent_mask=(2,3,10)
#             length of each sent mask->10
#             1st doc->1st sent(0,3)
#             each sent_mask=[1,1,1,1,0,0,0,...]
#             2nd doc ->(pad sent) use all ones 
#             2nd doc->3rd sent->[1,1,1,...](to divide zero issue)
#             bert_output->(2,10,768)
#             sent_mask*bert_output(BMM)
#             output=(2,3,768)=(2,3,10)*(2,10,768)
#             output(2,3,3)=Linear(output,3)


In [16]:
# model=EvidenceClassifier()

In [17]:
# input_ids=

In [18]:
#         loss(input(output of the model),target)
            #no_evidence_sent=>lablel=0
            #evidence_sent=>label=1
            #evidence_pad=>lebel=2
#             loss(ignore_index=2) 
#             NLLLoss
# https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html?highlight=nllloss#torch.nn.NLLLoss

In [19]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

In [20]:
# input_ids = torch.tensor(tokenizer.encode("[CLS] Hello my dog is cute")).unsqueeze(0)  # Batch size 1
# outputs = model(input_ids)
# last_hidden_state, pooler_output = outputs

In [21]:
# print(last_hidden_state.shape)
# print(pooler_output.shape)
BATCH_SIZE

4

# Training

In [28]:
def train(input_ids,sent_attention,sent_mask,evi_target):
    train_size=input_ids.shape[0]
    batch_size=BATCH_SIZE
    batch_count=int(math.ceil(train_size)/batch_size)
    model=EvidenceClassifier()
#     logging(model)
    
    if torch.cuda.is_available():
        model.cuda
        model = torch.nn.DataParallel(model)
    criterion = nn.NLLLoss(reduction='mean',ignore_index=2)
    optimizer = AdamW(model.parameters(),lr=1e-05,correct_bias=False)
    
    logging(optimizer)
    
    best_dev_acc = -1
    best_epoch_idx = -1
    best_epoch_seed = -1
    start_epoch = 0
    ckp_path=os.path.join('checkpoint',model_name+'_checkpoint.pt')
    best_epoch_idx,best_epoch_seed,best_dev_acc,start_epoch,model,optimizer=load_ckp(ckp_path, model, optimizer)
    
    train_dataset=Docred_dataset(sent_ids=input_ids,sent_attention=sent_attention,sent_mask=sent_mask,evi_target=evi_target,max_len=MAX_LEN)
    train_dataloader=DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE,num_workers=2)
    for epoch_idx in range(start_epoch, EPOCH):
        model.train()
        model.zero_grad()
        logging('Epoch:', epoch_idx + 1)
        cur_seed = RANDOM_SEED + epoch_idx + 1
        set_random_seeds(cur_seed)
        
        start_time = datetime.datetime.now()
        train_loss_val = 0
        is_best = False
        
        for i,data in enumerate(tqdm(train_dataloader)):
            batch_sent_ids = data['sent_ids']
            batch_sent_attention = data['sent_attention']
            batch_sent_mask = data['sent_mask']
            batch_evi_targets=data['targets']
            
            
            # print(batch_sent_ids.shape)
            # print(batch_sent_attention.shape)
            # print(batch_sent_mask.shape)
            # print(batch_evi_targets.shape)
            if torch.cuda.is_available():
                batch_sent_ids = batch_sent_ids.cuda()
                batch_sent_ids = batch_sent_attention.cuda()
                batch_sent_mask = batch_sent_mask.cuda()
                batch_evi_targets = batch_evi_targets.cuda()
            
            outputs = model(batch_sent_ids,batch_sent_ids,batch_sent_mask,is_training=True)
            loss = criterion(outputs.reshape((outputs.shape[0]*outputs.shape[1],outputs.shape[2])),batch_evi_targets.reshape((batch_evi_targets.shape[0]*batch_evi_targets.shape[1])))
            loss.backward()
            train_loss_val+=loss.item()
        train_loss_val/=batch_count
        end_time = datetime.datetime.now()
        logging('Training_loss: ',train_loss_val)
        logging('Time: ',end_time-start_time)
    logging("*"*50)
        

In [29]:
train(input_ids=sent_ids,sent_attention=sent_attention,sent_mask = sent_mask, evi_target = evi_target)

  0%|          | 0/284 [00:00<?, ?it/s]

AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: False
    eps: 1e-06
    lr: 1e-05
    weight_decay: 0.0
)
Epoch: 1


100%|██████████| 284/284 [03:38<00:00,  1.30it/s]


Training_loss:  1.0710816391786502
Time:  0:03:38.994988
Epoch: 2


100%|██████████| 284/284 [03:38<00:00,  1.30it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Training_loss:  1.0700294552337997
Time:  0:03:38.840818
Epoch: 3


100%|██████████| 284/284 [03:39<00:00,  1.30it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Training_loss:  1.0703184600432434
Time:  0:03:39.096291
Epoch: 4


100%|██████████| 284/284 [03:39<00:00,  1.30it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Training_loss:  1.0695986347569173
Time:  0:03:39.226941
Epoch: 5


100%|██████████| 284/284 [03:39<00:00,  1.30it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Training_loss:  1.0700902930418088
Time:  0:03:39.125286
Epoch: 6


100%|██████████| 284/284 [03:39<00:00,  1.30it/s]


Training_loss:  1.0698748620575813
Time:  0:03:39.175527
Epoch: 7


100%|██████████| 284/284 [03:39<00:00,  1.30it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Training_loss:  1.0708993658581387
Time:  0:03:39.056320
Epoch: 8


100%|██████████| 284/284 [03:39<00:00,  1.30it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Training_loss:  1.0698550920604395
Time:  0:03:39.223055
Epoch: 9


100%|██████████| 284/284 [03:39<00:00,  1.30it/s]
  0%|          | 0/284 [00:00<?, ?it/s]

Training_loss:  1.0697104593890294
Time:  0:03:39.203251
Epoch: 10


100%|██████████| 284/284 [03:39<00:00,  1.30it/s]


Training_loss:  1.0694955970710243
Time:  0:03:39.143903
**************************************************


In [24]:
# import torch
# import sys
# print('A', sys.version)
# print('B', torch.__version__)
# print('C', torch.cuda.is_available())
# print('D', torch.backends.cudnn.enabled)
# device = torch.device('cuda')
# print('E', torch.cuda.get_device_properties(device))
# print('F', torch.tensor([1.0, 2.0]).cuda())

In [25]:
x=torch.tensor([[[1,2,3],[4,5,6],[7,8,9]],[[10,11,12],[13,14,15],[16,17,18]]])
print(x)
print(x.reshape(6,3))

tensor([[[ 1,  2,  3],
         [ 4,  5,  6],
         [ 7,  8,  9]],

        [[10, 11, 12],
         [13, 14, 15],
         [16, 17, 18]]])
tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]])
