In [2]:
!nvidia-smi

Mon May 17 11:32:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K40m          Off  | 00000000:81:00.0 Off |                    0 |
| N/A   34C    P8    23W / 235W |     11MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [3]:
%cd /home/mtech1/19CS60R28/susmit/DocRed_hongwang600/DocRed
import sys
import math
import nltk
import numpy as np
import os
import json
import copy
import random
import time
import datetime

# import seaborn as sns
from nltk.tokenize import WordPunctTokenizer
from pytorch_transformers import *
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F
from tqdm import tqdm
from models.bert import Bert
%cd ..

%matplotlib inline
%config InlineBackend.figure_format='retina'

RANDOM_SEED = 42

/home/mtech1/19CS60R28/susmit/DocRed_hongwang600/DocRed
/home/mtech1/19CS60R28/susmit/DocRed_hongwang600


### Configuration

In [None]:
in_path='data'
out_path='prepro_data'
os.environ['CUDA_VISIBLE_DEVICES']="0,1"
PRE_TRAINED_MODEL_NAME='bert-base-uncased'
model_name='bert'
BATCH_SIZE=8

n_gpu = torch.cuda.device_count()
if not os.path.exists(out_path):
    os.mkdir(out_path)

MAX_LEN=512
SEP='[SEP]'
MASK = '[MASK]'
CLS = "[CLS]"
bert = Bert(BertModel, PRE_TRAINED_MODEL_NAME)

# Data

In [5]:
train_annotated_file_name = os.path.join(in_path, 'train_annotated.json')
dev_file_name = os.path.join(in_path, 'dev.json')
test_file_name = os.path.join(in_path, 'test.json')

In [6]:

rel2id = json.load(open(os.path.join(out_path, 'rel2id.json'), "r"))
id2rel = {v:u for u,v in rel2id.items()}
json.dump(id2rel, open(os.path.join(out_path, 'id2rel.json'), "w"))

In [8]:
def logging(s, print_=True, log_=True):
    if print_:
        print(s)
    if log_:
        with open(os.path.join(os.path.join("log", model_name+'.txt')), 'a+') as f_log:
            f_log.write(str(s)+'\n')

In [9]:


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [10]:
def set_random_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_random_seeds(RANDOM_SEED)

# Preprocessing

In [11]:
#to do add union of evidences=> done
# for sent pad use 2

def preprocess(data_file_name, max_length = 512, is_training = True, suffix=''):
    ori_data=json.load(open(data_file_name))[0:10]
    max_sent_count=0#maximum number of sentences in a doc across the dataset
    list_sent_ids=[]
    list_attention=[]#this stores attention of docs
    list_sent_mask=[]#this will be used in the batch multliplication for getting the embeddings of each sentence
    # (len(list_sent_ids),max_sent_count,max_length)
    
    labels=[]
    i=0
    for doc in ori_data:
        i=i+1
        sys.stdout.write("\r%d/%d docs"%(i,len(ori_data)))
        # this dict is used to take care of multiple relations with same head and tail
        head_tail_index={}
        max_sent_count=max(max_sent_count,len(doc['sents']))
        for label in doc['labels']:
            idx_list=[]
            head=doc['vertexSet'][label['h']]
            tail=doc['vertexSet'][label['t']]
            if (label['h'],label['t']) in head_tail_index:
                labels[head_tail_index[(label['h'],label['t'])]]+=label['evidence']
                continue
            else:
                head_tail_index[(label['h'],label['t'])]=len(list_sent_ids)
            for entity in head:
                if (entity['sent_id'],entity['pos'][0],'[unused0]') not in idx_list:
                    idx_list.append((entity['sent_id'],entity['pos'][0],'[unused0]'))
                if (entity['sent_id'],entity['pos'][1]+1,'[unused1]') not in idx_list:
                    idx_list.append((entity['sent_id'],entity['pos'][1]+1,'[unused1]'))
            for entity in tail:
                if (entity['sent_id'],entity['pos'][0],'[unused2]') not in idx_list:
                    idx_list.append((entity['sent_id'],entity['pos'][0],'[unused2]'))
                if (entity['sent_id'],entity['pos'][1]+1,'[unused3]') not in idx_list:
                    idx_list.append((entity['sent_id'],entity['pos'][1]+1,'[unused3]'))
            idx_list.sort(key=lambda tup:(tup[0],tup[1]),reverse=True)
            temp_doc=copy.deepcopy(doc)
            for loc in idx_list:
                temp_doc['sents'][loc[0]].insert(loc[1],loc[2])

            sent_combine=[]
            for sent in temp_doc['sents']:
                sent_combine=sent_combine+sent
            sent_ids,sent_attention_mask,sent_start_ids=bert.subword_tokenize_to_ids(sent_combine)
            list_sent_ids.append(sent_ids[0])
            list_attention.append(sent_attention_mask[0])
            labels.append(label['evidence'])
            
            
            sent_mask=[]
            l=1# we start from index 1 because we skip CLS token
            for sent in temp_doc['sents']:
                sent_mask.append([0]*max_length)
                j=l
#                 print(sent)
#                 print("\n")
                while(j<min(max_length-2,l+len(sent))):
                    sent_mask[-1][j]=1
                    j+=1
                l+=len(sent)
                if(l>=max_length-2):
                    break
            list_sent_mask.append(sent_mask)
            
    logging('')
    evi_labels = np.zeros((len(labels),max_sent_count),dtype = np.int64)
    for i in range(len(labels)):
        evi_labels[i][labels[i]]=1 #if evidence present then 1
    print("max_sent_cout",max_sent_count)
    for i in range(len(list_sent_mask)):
        # the label for pad sentence is 2
        evi_labels[i][len(list_sent_mask[i]):max_sent_count]=2
        # to pad sentences with arrays of 1s
        list_sent_mask[i]=list_sent_mask[i]+[[1]*max_length]*(max_sent_count-len(list_sent_mask[i]))
    list_sent_ids=np.asarray(list_sent_ids,dtype=np.int64)
    list_attention=np.asarray(list_attention,dtype=np.int64)
    list_sent_mask=np.asarray(list_sent_mask,dtype=np.int64)
    
    logging("Started saving")
    
    logging("Number of instances: {}".format(list_sent_ids.shape[0]))
    np.save(os.path.join(out_path,suffix+'_sent_ids.npy'),list_sent_ids)
    np.save(os.path.join(out_path,suffix+'_sent_attention.npy'),list_attention)
    np.save(os.path.join(out_path,suffix+'_sent_mask.npy'),list_sent_mask)
    np.save(os.path.join(out_path,suffix+'_evidence_labels.npy'),evi_labels)
    logging("completed saving\n")
    


In [12]:
%%timeit -n1 -r1
preprocess(train_annotated_file_name, max_length = 512, is_training = False, suffix='train')
preprocess(dev_file_name, max_length = 512, is_training = False, suffix='dev')

10/10 docs
max_sent_cout 12
Started saving
Number of instances: 111
completed saving

10/10 docs
max_sent_cout 13
Started saving
Number of instances: 161
completed saving

7.54 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Data loading

In [13]:
class Docred_dataset(Dataset):
    def __init__(self,sent_ids,sent_attention,sent_mask,evi_target,max_len=512):
        self.sent_ids=torch.from_numpy(sent_ids)
        self.sent_attention=torch.from_numpy(sent_attention)
        self.sent_mask=torch.from_numpy(sent_mask)
        self.evi_target=torch.from_numpy(evi_target)
        self.no_samples=evi_target.shape[0]
    def __len__(self):
        return evi_target.shape[0]
    def __getitem__(self,index):
        return {
            'sent_ids':self.sent_ids[index],
            'sent_attention':self.sent_attention[index],
            'sent_mask':self.sent_mask[index],
            'targets':self.evi_target[index]
        }

In [14]:
sent_ids=np.load(os.path.join(out_path,'train'+'_sent_ids.npy'))
sent_attention=np.load(os.path.join(out_path,'train'+'_sent_attention.npy'))
sent_mask=np.load(os.path.join(out_path,'train'+'_sent_mask.npy'))
evi_target=np.load(os.path.join(out_path,'train'+'_evidence_labels.npy'))

In [15]:


dataset=Docred_dataset(sent_ids=sent_ids,sent_attention=sent_attention,sent_mask=sent_mask,evi_target=evi_target,max_len=MAX_LEN)
dataloader=DataLoader(dataset=dataset, batch_size=BATCH_SIZE,num_workers=2)
# def create_data_loader(max_len,batch_size):
#     ds=Docred(sent_ids,sent_mask,evi_target,max_len=512)
#     return DataLoader(ds,batch_size=batch_size,num_workers=4)
# train_data_loader=create_data_loader(max_len=MAX_LEN,batch_size=BATCH_SIZE)

### Defining Model

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [20]:
# sent_mask = (b,k,t)
# k=max sent length
# output =(b,t,h)
# torch.bmm
# (b,k,t)*(b,t,h)/(len(sent))==(b,k,h)
# torch.sum(dim)  torch.sum(input, dim, keepdim=False, *, dtype=None) → Tensor


class EvidenceClassifier(nn.Module):
    def __init__(self):
        super(EvidenceClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
#         self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, 3)
    def forward(self, input_ids, attention_mask,sent_mask):
        last_hidden_state, pooled_output = self.bert(input_ids=input_ids,attention_mask=attention_mask)
        output=torch.BMM(sent_mask,last_hidden_state)/torch.sum(sent_mask,dim=2)
        output = self.out(output)
        return output
#         #BMM
#         2
#             10->3sent
#             5->2sent
#             sent_mask=(2,3,10)
#             length of each sent mask->10
#             1st doc->1st sent(0,3)
#             each sent_mask=[1,1,1,1,0,0,0,...]
#             2nd doc ->(pad sent) use all ones 
#             2nd doc->3rd sent->[1,1,1,...](to divide zero issue)
#             bert_output->(2,10,768)
#             sent_mask*bert_output(BMM)
#             output=(2,3,768)=(2,3,10)*(2,10,768)
#             output(2,3,3)=Linear(output,3)


In [21]:
model=EvidenceClassifier()

In [26]:
input_ids=

TypeError: forward() missing 3 required positional arguments: 'input_ids', 'attention_mask', and 'sent_mask'

In [None]:
#         loss(input(output of the model),target)
            #no_evidence_sent=>lablel=0
            #evidence_sent=>label=1
            #evidence_pad=>lebel=2
#             loss(ignore_index=2) 
#             NLLLoss
# https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html?highlight=nllloss#torch.nn.NLLLoss

In [51]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [57]:
input_ids = torch.tensor(tokenizer.encode("[CLS] Hello my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
last_hidden_state, pooler_output = outputs

In [58]:
print(last_hidden_state.shape)
print(pooler_output.shape)

torch.Size([1, 6, 768])
torch.Size([1, 768])


100%|██████████| 100000000/100000000 [00:25<00:00, 3882668.63it/s]


# Training

1