# part1: pre-processing-text-for-GPT2-fine-tuning.ipynb

##Load Packages


In [None]:
#%cd '/content/drive/My Drive/Colab Notebooks/GPT-2/summarization_preprocessing'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import timeit

import torch
print(torch.__version__,' pytorch version')
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

!pip install transformers==2.6.0

1.7.0+cu101  pytorch version


In [None]:
# Load the distilgpt2 tokenizer and register the special tokens used to lay out
# each training sequence: <|startoftext|> keywords <|summarize|> abstract <|endoftext|>.
import transformers
print(transformers.__version__,' make sure transformers version is 2.6.0')
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# bos/eos/pad plus the two markers that are also reused as segment (token type) ids.
special_tokens = {'bos_token':'<|startoftext|>','eos_token':'<|endoftext|>','pad_token':'<pad>','additional_special_tokens':['<|keyword|>','<|summarize|>']}
tokenizer.add_special_tokens(special_tokens)
# Print the vocabulary size and the id of every special token; the helper
# functions below rely on these ids.
print(len(tokenizer), 'total length of vocab')
print(tokenizer.bos_token_id, 'bos_token')
print(tokenizer.eos_token_id, 'eos_token')
print(tokenizer.pad_token_id, 'pad_token')  #id of the <pad> token
print(tokenizer.additional_special_tokens_ids[0], 'keyword_token') #id of <|keyword|>
print(tokenizer.additional_special_tokens_ids[1], 'summary_token') #id of <|summarize|>

2.6.0  make sure transformers version is 2.6.0
50261 total length of vocab
50257 bos_token
50256 eos_token
50258 pad_token
50259 keyword_token
50260 summary_token


##Load the dataset (before processing)¶
Reset the index before running the pre-processing so that each row's label matches its position (the helper functions look rows up by number).



In [None]:
frame_dev = pd.read_csv('/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021/COVID19_train_data.csv',index_col=0)


In [None]:
frame_dev = frame_dev.reset_index()
frame_dev.head(5)

Unnamed: 0,index,sha,title,abstract,keyword_POS,keyword_NER,full_text_file,keyword_POS_str
0,29933,65b6cc079b3c31133dd5d20ade29e8bbab46be98,Immunotherapy against experimental canine visc...,Abstract In order to assess the immunotherapeu...,"['order', 'assess', 'potential', 'leishmaniasi...","['In', 'order', 'to', 'assess', 'the', 'immuno...",custom_license,Abstract order assess immunotherapeutic potent...
1,11345,c2952e4d56bd9bf5c6eee0aaa733557471e60a92,Discovery of Bat Coronaviruses through Surveil...,Coronaviruses (CoVs) of bat origin have caused...,"['Coronaviruses', 'CoVs', 'origin', 'have', 'p...","['Coronaviruses', 'CoVs', 'of', 'bat', 'origin...",comm_use_subset,Coronaviruses CoVs bat origin have caused pand...
2,17525,4a69bc15bc918cf769b3d5188e2394ee149a6d16,"Keel-, neus-, oorziekten",Hypertrofie van het lymfoïde weefsel in de nas...,"['Hypertrofie', 'lymfoïde', 'weefsel', 'nasofa...","['Hypertrofie', 'van', 'het', 'lymfoïde', 'wee...",custom_license,Hypertrofie van het lymfoïde weefsel de nasofa...
3,20795,f72abb1b0683ec97335167e3e3b8c2790531654e,Rabbit Clinical Pathology,"Abstract With rabbit patients, as in other spe...","['rabbit', 'patients', 'species', 'blood', 'ur...","['With', 'rabbit', 'patients', 'as', 'in', 'ot...",custom_license,Abstract rabbit patients other species analyzi...
4,23210,f93daa3d8d53761cfcf4c9fb8ff03dace02a858c,Inhibition of arterivirus RNA synthesis by cyc...,"AbstractPreviously, the cyclophilin inhibitors...","['cyclophilin', 'inhibitors', 'cyclosporin', '...","['AbstractPreviously', 'the', 'cyclophilin', '...",biorxiv_medrxiv,cyclophilin inhibitors cyclosporin A CsA Alisp...


##Helper Function


In [None]:
def load_words(df, num, with_title = False):
  """Collect the strings that make up one multiple-choice training sample.

  Args:
    df: DataFrame with 'keyword_POS_str' and 'abstract' columns, plus a
      'title' column when with_title is True.
    num: zero-based row position of the sample to load.
    with_title: if True, the title is prepended to the keyword string
      (plain concatenation, no separator is inserted).

  Returns:
    Tuple (keyword, abstract, distract1, distract2, distract3): the keyword
    string, the gold abstract of row `num`, and three abstracts drawn at
    random from other rows. Values are returned as stored in the frame and
    are not validated here.
  """
  n_rows = len(df)
  if n_rows > 1:
    # Draw from the n_rows - 1 other rows, then shift every index at or above
    # `num` up by one so the sample's own row is never picked as a distractor.
    arr_distract = np.random.randint(n_rows - 1, size=3)
    arr_distract[arr_distract >= num] += 1
  else:
    # With a single row there is nothing else to draw from.
    arr_distract = np.random.randint(n_rows, size=3)

  # Rows are addressed by position everywhere (.iloc), matching the positional
  # indices produced by np.random.randint even if the index was not reset.
  abstracts = df['abstract']
  keyword = df['keyword_POS_str'].iloc[num]
  if with_title:
    title = df['title'].iloc[num]
    keyword = title + keyword
  abstract = abstracts.iloc[num]
  distract1 = abstracts.iloc[arr_distract[0]]
  distract2 = abstracts.iloc[arr_distract[1]]
  distract3 = abstracts.iloc[arr_distract[2]]

  return (keyword,abstract,distract1,distract2,distract3)

In [None]:
def write_input_ids(word_batch,max_len=1024):
  """
  Tokenize the keyword string paired with the true abstract and with each of
  the three distractors, and pad every sequence to a common length.

  word_batch: tuple (keyword, abstract, distract1, distract2, distract3) of strings.
  max_len: length the token lists are padded to with the pad token. It is also
    used to cut each input *string* to at most max_len characters before
    tokenizing. Pass None to skip the character cut and pad to the longest of
    the four encoded sequences. Sequences already longer than max_len are
    returned unpadded and unshortened.

  Returns [input_true, input_dis1, input_dis2, input_dis3], each a list of token ids
  (the true pair is always first).
  """
  key, abstract, dis1, dis2, dis3 = word_batch
  texts = [key, abstract, dis1, dis2, dis3]

  if max_len is not None:
    # Character-level cut; the token-level limit is applied by encode() below.
    texts = [text[:max_len] for text in texts]
  key, candidates = texts[0], texts[1:]

  list_input_token = [
    tokenizer.encode('<|startoftext|> ' + key + ' <|summarize|> '+ candidate + ' <|endoftext|>',max_length = tokenizer.max_len)
    for candidate in candidates
  ]
  # The separator must be present in the true pair; look its id up from the
  # tokenizer instead of hard-coding it.
  summary_token_id = tokenizer.additional_special_tokens_ids[1]
  assert summary_token_id in list_input_token[0], '<|summarize|> token missing from the encoded true pair'

  if max_len is None:
    max_len = max(len(ids) for ids in list_input_token)

  for ids in list_input_token:
    # A non-positive multiplier yields an empty list, so long sequences are left as they are.
    ids.extend([tokenizer.pad_token_id] * (max_len - len(ids)))
  return list_input_token

In [None]:
def write_token_type_labels(list_input_ids,max_len=1024):
  '''
  Create the segment (token type) ids for every candidate sequence.

  Each position is labelled as keyword, summary or padding segment:
  everything up to and including <|summarize|> gets the <|keyword|> id,
  the following tokens up to and including <|endoftext|> get the
  <|summarize|> id, and the remaining positions up to max_len get the pad id.

  list_input_ids: [input_true,input_dis1,input_dis2,input_dis3], lists of token ids.
    A sequence without an <|endoftext|> token is modified in place: its last
    token is overwritten with <|endoftext|>.
  max_len: total length the segment list is padded to.

  Returns a list with one segment-id list per input sequence.
  Raises ValueError if a sequence has no <|summarize|> token.
  '''
  keyword_id = tokenizer.additional_special_tokens_ids[0]
  summary_id = tokenizer.additional_special_tokens_ids[1]
  eos_id = tokenizer.eos_token_id

  list_segment = []
  for item in list_input_ids:
    # Explicit membership test instead of a bare `except:` around index(),
    # so unrelated errors are no longer swallowed.
    if eos_id not in item:
      item[-1] = eos_id
    num_seg_a = item.index(summary_id) + 1
    end_index = item.index(eos_id)
    num_seg_b = end_index - num_seg_a + 1
    num_pad = max_len - end_index - 1
    segment_ids = [keyword_id]*num_seg_a + [summary_id]*num_seg_b + [tokenizer.pad_token_id]*num_pad
    list_segment.append(segment_ids)
  return list_segment

In [None]:
def write_lm_labels(list_input_ids,list_type_labels):
  '''
  Build the language-modeling labels for every candidate sequence.

  Only the first sequence (the correct keyword/abstract pair) carries real
  labels, and only at positions whose segment id is the <|summarize|> id;
  every other position, and every position of the remaining sequences, is
  set to -100 so that it is ignored by the cross entropy loss.

  list_input_ids: token-id lists, correct pair first.
  list_type_labels: segment-id lists aligned with list_input_ids.
  '''
  masked = -100
  lm_labels = []
  for position, (token_ids, segment_ids) in enumerate(zip(list_input_ids, list_type_labels)):
    if position == 0:
      # correct pair: keep the abstract tokens, mask keywords and padding
      summary_segment = tokenizer.additional_special_tokens_ids[1]
      row = [token if segment == summary_segment else masked
             for token, segment in zip(token_ids, segment_ids)]
    else:
      # distractor: nothing to learn for the LM head
      row = [masked] * len(token_ids)
    lm_labels.append(row)
  return lm_labels

In [None]:
def write_last_token(list_input_ids):
  '''
  For each sequence, return the position of its first <|endoftext|> token,
  i.e. the last token before the padding. The mc head reads the hidden state
  at this position for the multiple choice loss.
  '''
  eos_positions = [token_ids.index(tokenizer.eos_token_id) for token_ids in list_input_ids]
  return eos_positions

In [None]:
def write_mc_label():
  '''
  One-hot multiple choice label for an un-shuffled batch of 4 candidates:
  the correct pair is always the 0th element, hence [1,0,0,0].
  '''
  one_hot = [0] * 4
  one_hot[0] = 1
  return one_hot

In [None]:
def shuffle_batch(list_input_ids,list_type_labels,list_last_tokens,list_lm_labels,list_mc_labels):
  '''
  Apply one random permutation of the 4 candidates to all five lists, so the
  correct pair can end up at any position. Returns a tuple of numpy arrays in
  the same order as the arguments.
  '''
  # a single permutation is shared by every field to keep them aligned
  order = np.arange(4)
  np.random.shuffle(order)

  fields = (list_input_ids, list_type_labels, list_last_tokens, list_lm_labels, list_mc_labels)
  return tuple(np.array(field)[order] for field in fields)

In [None]:
def write_torch_tensor(np_batch):
  '''
  Convert the shuffled numpy batch into long tensors with a leading
  dimension of size 1. The one-hot multiple choice label (5th item) is
  reduced to the index of the correct candidate.
  '''
  def _as_sample(values):
    # long tensor with a new leading axis
    return torch.tensor(values, dtype=torch.long).unsqueeze(0)

  correct_choice = [np.argmax(np_batch[4])]
  return (
    _as_sample(np_batch[0]),
    _as_sample(np_batch[1]),
    _as_sample(np_batch[2]),
    _as_sample(np_batch[3]),
    _as_sample(correct_choice),
  )

##write a big function

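Execute all 8 helper functions for every row of the dataframe.

The tensors of individual samples are merged in blocks of 1000 items rather than appended to the full result one at a time.

This is done for the sake of time efficiency: concatenating onto a very long tensor takes a while, so it should not be done once per sample.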

In [None]:

def execute_all_function(df):
  """Run the whole pre-processing pipeline over every row of `df`.

  For each row the 8 helper functions are applied in turn (load strings,
  tokenize, segment ids, LM labels, <|endoftext|> positions, MC label,
  shuffle, convert to tensors). Rows whose keyword or abstract is not a
  string are skipped.

  The per-sample tensors are collected in lists and merged into one block
  every 1000 iterations; the blocks are concatenated once at the end. This
  avoids both repeated concatenation onto a very long tensor and the crash
  the previous version hit when the last processed row coincided with a
  merge (the temporaries had already been deleted).

  Args:
    df: DataFrame accepted by load_words.

  Returns:
    Tuple of 5 tensors concatenated along dimension 0, in the order produced
    by write_torch_tensor (input ids, segment ids, last-token positions,
    LM labels, MC labels).

  Raises:
    ValueError: if no row of `df` could be processed.
  """
  n_fields = 5
  pending = [[] for _ in range(n_fields)]  # per-sample tensors since the last merge
  blocks = [[] for _ in range(n_fields)]   # merged blocks of samples

  def _merge_pending():
    # Concatenate the samples collected so far into one block per field.
    for field in range(n_fields):
      if pending[field]:
        blocks[field].append(torch.cat(pending[field], 0))
        pending[field] = []

  start = timeit.default_timer()
  for num in range(len(df)):
    word_tuple = load_words(df, num) #(keyword,abstract,distract1,distract2,distract3)
    if not isinstance(word_tuple[0], str) or not isinstance(word_tuple[1], str):
      # missing keyword or abstract: skip this row
      continue

    list_input_ids = write_input_ids(word_tuple) #[input_true,input_dis1,input_dis2,input_dis3]
    list_type_labels = write_token_type_labels(list_input_ids) #segment ids per candidate
    list_lm_labels = write_lm_labels(list_input_ids,list_type_labels) #LM labels per candidate

    # for each keyword+summary, get the index of the <eos> token
    list_last_tokens = write_last_token(list_input_ids)
    list_mc_labels = write_mc_label() #[1,0,0,0]

    np_tuple = shuffle_batch(list_input_ids,list_type_labels,list_last_tokens,list_lm_labels,list_mc_labels)
    tensor_tuple = write_torch_tensor(np_tuple)

    for field in range(n_fields):
      pending[field].append(tensor_tuple[field])

    if num % 1000 == 0:
      _merge_pending()
      stop = timeit.default_timer()
      print('iterations ',num,' takes ', stop - start,' sec')
      start = timeit.default_timer()

  # Merge whatever was collected after the last progress report (possibly nothing).
  _merge_pending()
  if not blocks[0]:
    raise ValueError('no row with a string keyword and abstract was found')
  return tuple(torch.cat(field_blocks, 0) for field_blocks in blocks)

In [None]:
tensor_1,tensor_2,tensor_3,tensor_4,tensor_5 = execute_all_function(frame_dev)

iterations  0  takes  0.06450268200001119  sec
iterations  1000  takes  20.634436464999965  sec
iterations  2000  takes  19.52589998000002  sec
iterations  3000  takes  19.31977676500003  sec
iterations  4000  takes  19.328844215000004  sec
iterations  5000  takes  19.185199529000045  sec
iterations  6000  takes  19.19430839800009  sec
iterations  7000  takes  19.275529386000017  sec
iterations  8000  takes  19.015346636000004  sec
iterations  9000  takes  19.10094117199992  sec
iterations  10000  takes  19.225099813000043  sec
iterations  11000  takes  19.095336440999972  sec
iterations  12000  takes  19.081980534999957  sec
iterations  13000  takes  19.247761438999987  sec
iterations  14000  takes  19.104330066999978  sec
iterations  15000  takes  19.28937768700007  sec
iterations  16000  takes  19.363567287000023  sec
iterations  17000  takes  19.430583260999924  sec
iterations  18000  takes  19.314859392000017  sec
iterations  19000  takes  19.471804277000047  sec
iterations  20000

In [None]:
# create a tensor dataset object
tensor_dataset = TensorDataset(tensor_1,tensor_2,tensor_3,tensor_4,tensor_5)

In [None]:
# save the tensor object to load later when training
torch.save(tensor_dataset, 'torch_devFile_1_May07_2020.pt')

##check your result by printing statement
Make sure the labels are all correct and lined up

In [None]:
tensor_5.numpy() #list_mc_labels

array([[3],
       [0],
       [1],
       ...,
       [3],
       [2],
       [3]])

In [None]:
item = 1515
print(tensor_1[item])
print(tensor_2[item])
print(tensor_3[item])
print(tensor_4[item])
print(tensor_5[item])

tensor([[50257, 46631, 23674,  ..., 50258, 50258, 50258],
        [50257, 46631, 23674,  ..., 50258, 50258, 50258],
        [50257, 46631, 23674,  ..., 50258, 50258, 50258],
        [50257, 46631, 23674,  ..., 50258, 50258, 50258]])
tensor([[50259, 50259, 50259,  ..., 50258, 50258, 50258],
        [50259, 50259, 50259,  ..., 50258, 50258, 50258],
        [50259, 50259, 50259,  ..., 50258, 50258, 50258],
        [50259, 50259, 50259,  ..., 50258, 50258, 50258]])
tensor([395, 328, 377, 356])
tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100]])
tensor([1])


In [None]:
# Side-by-side view of sample `item`: input ids of candidates 1 and 2, the
# segment ids of candidate 2 and the LM labels of candidate 2, each followed
# by its decoded text ('masked' for the -100 label).
row_format = '{:>2}{:>10}{:>10}{:>10}{:>10}{:>20}{:>10}{:>20}{:>10}'
print(row_format.format('count', *(['input', 'decoded input'] * 4)))

columns = (tensor_1[item][1], tensor_1[item][2], tensor_2[item][2], tensor_4[item][2])
for count, row in enumerate(zip(*columns)):
  cells = []
  for token_id in map(int, row):
    cells.append(token_id)
    cells.append('masked' if token_id == -100 else tokenizer.decode(token_id))
  print(row_format.format(count, *cells))

count     inputdecoded input     inputdecoded input               inputdecoded input               inputdecoded input
 0     50257<|startoftext|>     50257<|startoftext|>               50259<|keyword|>                -100    masked
 1     46631     Influ     46631     Influ               50259<|keyword|>                -100    masked
 2     23674      enza     23674      enza               50259<|keyword|>                -100    masked
 3       317         A       317         A               50259<|keyword|>                -100    masked
 4     20547   viruses     20547   viruses               50259<|keyword|>                -100    masked
 5       389       are       389       are               50259<|keyword|>                -100    masked
 6     12077   amongst     12077   amongst               50259<|keyword|>                -100    masked
 7      9389 challenging      9389 challenging               50259<|keyword|>                -100    masked
 8     20547   viruses     20547   v

# part2: fine-tune-GPT2-for-summarization




##0. Using Ignite to train GPT2 summarization¶
This notebook illustrates how to use the Ignite Engine to train GPT2 for abstractive summarization. The goal here is to obtain fine-tuned GPT2 weights that we will later use for abstractive summarization of biomedical science publications. The dataset is processed from this Kaggle dataset.

To get a sense of how GPT2 is trained, why it is done this way, and what the outcome looks like, read the companion notebook here.



##1. installing Pytorch, Huggingface, check GPU, etc.¶


In [None]:
%cd '/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021'

/content/drive/.shortcut-targets-by-id/17GW73wWB3WJJKQXTdFCaAZX60y4ccAfV/GPT2_Jan28_2021


In [None]:
!nvidia-smi

Tue Feb  2 14:56:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    11W /  70W |     10MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import timeit
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler


SEED = 1234
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!pip install transformers==2.6.0



In [None]:
import transformers
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, AdamW
load_model = False
load_previous_weight = False
resize_model = False
print(transformers.__version__) # make sure it is 2.6.0

2.6.0


##2. Test load the GPT2DoubleHeadsModel¶


In [None]:
model = GPT2DoubleHeadsModel.from_pretrained('/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021')
load_model = True

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021')

In [None]:
print(len(tokenizer), 'total length of vocab') # expect 50257

50257 total length of vocab


In [None]:
# Register the special tokens used by the summarization format: bos/eos/pad plus
# the <|keyword|> and <|summarize|> markers (no [CLS] token is added here).
special_tokens = {'bos_token':'<|startoftext|>','eos_token':'<|endoftext|>','pad_token':'<pad>','additional_special_tokens':['<|keyword|>','<|summarize|>']}
#special_tokens2 = {'bos_token':'<|startoftext|>','eos_token':'<|endoftext|>','keyword_token':'<|keyword|>','summary_token':'<|summarize|>'}
tokenizer.add_special_tokens(special_tokens)
# NOTE(review): the embedding resize below is commented out, yet resize_model is
# still set to True. Presumably the checkpoint loaded above already has
# embeddings for the enlarged vocabulary -- confirm before training.
#model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
# Newly added tokens receive the last ids of the vocabulary (see the ids printed by the next cell).
resize_model = True

In [None]:
print(len(tokenizer), 'total length of vocab')
print(tokenizer.bos_token_id, 'bos_token')
print(tokenizer.eos_token_id, 'eos_token')
print(tokenizer.pad_token_id, 'pad_token')  #token for <pad>, len of all tokens in the tokenizer
print(tokenizer.additional_special_tokens_ids[0], 'keyword_token') #token for <|keyword|>
print(tokenizer.additional_special_tokens_ids[1], 'summary_token') #token for <|summarize|>

50261 total length of vocab
50257 bos_token
50256 eos_token
50258 pad_token
50259 keyword_token
50260 summary_token


##3. Load dataset and make dataloader

The dataset is stored as torch tensors. Each sample is a tuple of 5 items:

1. the input tokens.
2. the segment tokens.
3. the index of the last token before the padding (this is used for multiple choice),
4. the expected output tokens for the language model; positions set to -100 are masked, i.e. the model does not have to produce them and they are excluded from the language-modeling loss.

  - items 1-4 each hold 4 candidates; only one of these 4 is the correct keyword-summary pair. The other 3 are distractors.
5. the multiple choice label, which tells which of the 4 candidates in the current sample is the correct choice.

In [None]:
train_dataset_1 = torch.load('/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021/post_processed_train_tensor.pt')


In [None]:
len(train_dataset_1)
train_dataset_1[5]

(tensor([[50257, 21918,  2708,  ..., 50258, 50258, 50258],
         [50257, 21918,  2708,  ..., 50258, 50258, 50258],
         [50257, 21918,  2708,  ..., 50258, 50258, 50258],
         [50257, 21918,  2708,  ..., 50258, 50258, 50258]]),
 tensor([[50259, 50259, 50259,  ..., 50258, 50258, 50258],
         [50259, 50259, 50259,  ..., 50258, 50258, 50258],
         [50259, 50259, 50259,  ..., 50258, 50258, 50258],
         [50259, 50259, 50259,  ..., 50258, 50258, 50258]]),
 tensor([394, 487,  71,  71]),
 tensor([[-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100]]),
 tensor([2]))

In [None]:
# Walk through the LM labels of candidate 0 in sample 5 and show what each
# position decodes to ('masked' for the ignored -100 label).
lm_label_row = train_dataset_1[5][3][0]
for position, label in enumerate(lm_label_row):
  label_id = int(label)
  decoded = 'masked' if label_id == -100 else tokenizer.decode(label_id)
  print(position, label_id, decoded)

0 -100 masked
1 -100 masked
2 -100 masked
3 -100 masked
4 -100 masked
5 -100 masked
6 -100 masked
7 -100 masked
8 -100 masked
9 -100 masked
10 -100 masked
11 -100 masked
12 -100 masked
13 -100 masked
14 -100 masked
15 -100 masked
16 -100 masked
17 -100 masked
18 -100 masked
19 -100 masked
20 -100 masked
21 -100 masked
22 -100 masked
23 -100 masked
24 -100 masked
25 -100 masked
26 -100 masked
27 -100 masked
28 -100 masked
29 -100 masked
30 -100 masked
31 -100 masked
32 -100 masked
33 -100 masked
34 -100 masked
35 -100 masked
36 -100 masked
37 -100 masked
38 -100 masked
39 -100 masked
40 -100 masked
41 -100 masked
42 -100 masked
43 -100 masked
44 -100 masked
45 -100 masked
46 -100 masked
47 -100 masked
48 -100 masked
49 -100 masked
50 -100 masked
51 -100 masked
52 -100 masked
53 -100 masked
54 -100 masked
55 -100 masked
56 -100 masked
57 -100 masked
58 -100 masked
59 -100 masked
60 -100 masked
61 -100 masked
62 -100 masked
63 -100 masked
64 -100 masked
65 -100 masked
66 -100 masked
67 -1

In [None]:
train1_sampler = RandomSampler(train_dataset_1)
train1_dataloader = DataLoader(train_dataset_1, sampler=train1_sampler, batch_size=1)

In [None]:
val_dataset_1 = torch.load('/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021/post_processed_val_tensor.pt')
val1_sampler = RandomSampler(val_dataset_1)
val1_dataloader = DataLoader(val_dataset_1, sampler=val1_sampler, batch_size=1)

##4. Test run

In [None]:
input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels = train_dataset_1[0]
print(input_ids.shape)
print(token_type_ids.shape)
print(mc_token_ids.shape)
print(lm_labels.shape)
print(mc_labels.shape)

torch.Size([4, 1024])
torch.Size([4, 1024])
torch.Size([4])
torch.Size([4, 1024])
torch.Size([1])


In [None]:
model = model.to(device)
optimizer = AdamW(model.parameters(),lr=3e-5,eps=1e-8, correct_bias=True)
max_norm = 1.0

In [None]:
gradient_accumulation_steps = 10

In [None]:
total_steps = len(train1_dataloader)
print('total step for learning rate scheduler = ',total_steps)

total step for learning rate scheduler =  31974


In [None]:
from transformers import get_linear_schedule_with_warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 50, num_training_steps = total_steps)

In [None]:
test_run = train_dataset_1[1]

In [None]:
# Smoke test: one forward/backward/optimizer step on a single sample.
start = timeit.default_timer()
model.train()
optimizer.zero_grad()
# Build the device copies without rebinding `test_run`: turning it into a
# one-shot generator would leave it exhausted and make this cell fail on re-run.
input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels = tuple(tensor.to(device) for tensor in test_run)
outputs = model(input_ids = input_ids, mc_token_ids = mc_token_ids, mc_labels = mc_labels, lm_labels = lm_labels, token_type_ids = token_type_ids)
lm_loss, mc_loss = outputs[0], outputs[1]

# Weights of the two losses in the combined objective.
lm_coef = 2.0
mc_coef = 1.0

total_loss = lm_loss * lm_coef + mc_loss * mc_coef
print('lm_loss = ',lm_loss.item())
print('mc_loss = ',mc_loss.item())
print('total_loss = ',total_loss.item())
total_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
optimizer.step()
# Clear the gradients of this trial step so they cannot leak into the first
# gradient-accumulation window of the real training run.
optimizer.zero_grad()
stop = timeit.default_timer()
# This times a single optimization step, not an epoch.
print('1 step takes {:.3f}'.format(stop - start),' sec')

lm_loss =  1.4279698133468628
mc_loss =  0.0
total_loss =  2.8559396266937256
1 epoch takes 1.043  sec


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


##5. set up Ignite

In [None]:
!pwd

/content/drive/.shortcut-targets-by-id/17GW73wWB3WJJKQXTdFCaAZX60y4ccAfV/GPT2_Jan28_2021


In [None]:
!pip install pytorch-ignite

In [None]:
from ignite.engine import Engine, Events
from ignite.metrics import MeanSquaredError, Loss, RunningAverage
from ignite.handlers import ModelCheckpoint, EarlyStopping

In [None]:
def process_function(engine,batch):
  """Ignite training step with gradient accumulation.

  Runs a forward/backward pass on one batch and accumulates the gradient of
  the combined loss (2.0 * LM loss + 1.0 * MC loss, divided by
  gradient_accumulation_steps). Every gradient_accumulation_steps iterations
  the accumulated gradient is clipped to max_norm, the optimizer takes a
  step and the gradients are reset.

  Args:
    engine: the Ignite engine; engine.state.iteration decides when to step.
    batch: (input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels) tensors.

  Returns:
    (lm_loss, mc_loss, total_loss) as Python floats; total_loss is reported
    un-divided, i.e. on the same scale as in evaluate_function.
  """
  model.train()
  batch = (item.to(device) for item in batch)
  input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels = batch
  outputs = model(input_ids = input_ids, mc_token_ids = mc_token_ids, mc_labels = mc_labels,
                  lm_labels = lm_labels, token_type_ids = token_type_ids)
  lm_loss, mc_loss = outputs[0], outputs[1]
  lm_coef = 2.0
  mc_coef = 1.0
  total_loss = lm_loss * lm_coef + mc_loss * mc_coef
  # Scale so that the accumulated gradient is an average over the window.
  total_loss = total_loss / gradient_accumulation_steps
  total_loss.backward()
  if engine.state.iteration % gradient_accumulation_steps == 0:
    # Clip once, on the fully accumulated gradient. Clipping after every
    # micro-batch would repeatedly rescale the partial sum in place and
    # distort the relative weight of the micro-batches.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()
    optimizer.zero_grad()
  # The scheduler advances once per iteration (not per optimizer step), so its
  # num_training_steps must count iterations.
  scheduler.step()
  return lm_loss.item(),mc_loss.item(),total_loss.item()*gradient_accumulation_steps

In [None]:
def evaluate_function(engine,batch):
  """Ignite evaluation step: forward pass only, no gradient tracking.

  batch holds (input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels).
  Returns (lm_loss, mc_loss, total_loss) as Python floats, where
  total_loss = 2.0 * lm_loss + 1.0 * mc_loss.
  """
  lm_coef, mc_coef = 2.0, 1.0
  model.eval()
  with torch.no_grad():
    on_device = [tensor.to(device) for tensor in batch]
    input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels = on_device
    outputs = model(
      input_ids = input_ids,
      mc_token_ids = mc_token_ids,
      mc_labels = mc_labels,
      lm_labels = lm_labels,
      token_type_ids = token_type_ids,
    )
    lm_loss = outputs[0]
    mc_loss = outputs[1]
    total_loss = lm_loss * lm_coef + mc_loss * mc_coef
  return lm_loss.item(),mc_loss.item(),total_loss.item()

In [None]:
trainer = Engine(process_function)
evaluator = Engine(evaluate_function)

training_history = {'lm_loss': [], 'mc_loss': [], 'total_loss': []}
validation_history = {'lm_loss': [], 'mc_loss': [], 'total_loss': []}

In [None]:
RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'lm_loss')
RunningAverage(output_transform=lambda x: x[1]).attach(trainer, 'mc_loss')
RunningAverage(output_transform=lambda x: x[2]).attach(trainer, 'total_loss')

In [None]:
RunningAverage(output_transform=lambda x: x[0]).attach(evaluator, 'lm_loss')
RunningAverage(output_transform=lambda x: x[1]).attach(evaluator, 'mc_loss')
RunningAverage(output_transform=lambda x: x[2]).attach(evaluator, 'total_loss')

In [None]:
@trainer.on(Events.ITERATION_COMPLETED(every=100))
def print_trainer_logs(engine):
    """Print the running training losses every 100 iterations.

    'report time' is the number of seconds elapsed since the previous report
    (nan for the first report of a run, when there is no earlier timestamp).
    The timestamp of the last report is kept on engine.state as
    `last_report_time`.
    """
    loss_LM = engine.state.metrics['lm_loss']
    loss_NSP = engine.state.metrics['mc_loss']
    combined_loss = engine.state.metrics['total_loss']

    # Report a duration rather than the raw timer reading, which has no
    # meaningful origin.
    now = timeit.default_timer()
    previous = getattr(engine.state, 'last_report_time', None)
    elapsed = now - previous if previous is not None else float('nan')
    engine.state.last_report_time = now

    print("Trainer Results - iteration {} - LM loss: {:.2f} MC loss: {:.2f} total loss: {:.2f} report time: {:.1f}"
    .format(engine.state.iteration, loss_LM, loss_NSP, combined_loss,elapsed))

In [None]:
checkpointer = ModelCheckpoint('/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021/GPT2_dir', 'GPT2_summarizer', n_saved=2, create_dir=True, save_as_state_dict=True,require_empty=False)
trainer.add_event_handler(Events.ITERATION_COMPLETED(every=15000), checkpointer, {'epoch_2': model})
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'epoch_2_done': model})

<ignite.engine.events.RemovableEventHandle at 0x7fd6752a7438>

In [None]:
def print_logs(engine, dataloader, mode, history_dict):
    """Run the evaluator over `dataloader`, print its metrics and record them.

    engine: the engine that triggered the handler; its epoch number is printed.
    dataloader: data passed to evaluator.run for a single epoch.
    mode: label that prefixes the printed line.
    history_dict: dict of lists; the value of every evaluator metric is
      appended to the list stored under the metric's name.
    """
    evaluator.run(dataloader, max_epochs=1)
    metrics = evaluator.state.metrics

    summary = " Results - Epoch {} - Avg lm_loss: {:.2f} Avg mc_loss: {:.2f} Avg total_loss: {:.2f}".format(
        engine.state.epoch, metrics['lm_loss'], metrics['mc_loss'], metrics['total_loss'])
    print(mode + summary)

    for name, value in metrics.items():
        history_dict[name].append(value)

trainer.add_event_handler(Events.EPOCH_COMPLETED, print_logs, val1_dataloader, 'Validation', validation_history)

<ignite.engine.events.RemovableEventHandle at 0x7fd675298b70>

##Run Ignite Engine

In [None]:
e = trainer.run(train1_dataloader, max_epochs=1)


#part3: generate-summary-with-BERT-or-GPT2




In [None]:
!pip install -r /content/drive/MyDrive/Covid_Summary/requirements.txt



In [None]:
!python /content/drive/MyDrive/Covid_Summary/GPT2_summarize.py --input_file='/content/drive/MyDrive/Covid_Summary/covid.txt' --model_directory='/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021'



##Generating a summary of COVID19 publication¶
This notebook introduces an NLP approach to summarize and paraphrase a scientific publication. The model is specifically trained on COVID19-related data released as part of the COVID-19 Open Research Dataset Challenge on Kaggle.

The strategy we'll be using here involves the following steps:

1. extract sentences using BERT + clustering
2. extract keyword tokens from the extracted sentences using BERT fine-tuned for token classification
3. generate paraphrases from the extracted keywords using GPT2 fine-tuned for abstractive summarization from keywords


The fine-tuning is already done, so we will load the model weights to perform the task. A GPU is not necessary for this task, but it should help speed things up a bit, especially at the GPT2 sentence generation step.



In [None]:
#@title Setup environment and helper objects
# Installs the pinned Hugging Face transformers release, imports every library
# the summarization cells below rely on, and creates two module-level helpers:
# `wrapper` (text wrapping for display) and the torch random seed.

# NOTE: this cell does not choose the compute device; `device` is set in the
# "use GPU?" cell further down.

# The models in this part are loaded through the transformers 2.6.0 API.

!pip install transformers==2.6.0

import transformers
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, DistilBertModel, DistilBertTokenizer, BertTokenizer, BertForTokenClassification
import numpy as np

import nltk
# 'punkt' is the NLTK resource downloaded here for sent_tokenize.
nltk.download('punkt')
from nltk import sent_tokenize
# Colab magic: select TensorFlow 1.x before keras is imported.
%tensorflow_version 1.x
from keras.preprocessing.sequence import pad_sequences

# Presumably for the "BERT + clustering" sentence-extraction step described in
# the introduction; not used by the cells visible in this part -- TODO confirm.
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

import json
import matplotlib.pyplot as plt
import timeit
import torch
import textwrap
# Wraps long strings to 70 columns when results are displayed.
wrapper = textwrap.TextWrapper(width=70)
# Fixed seed for torch's random number generator.
SEED = 1234
torch.manual_seed(SEED)


In [11]:

# Change the working directory to the folder where the models are kept.
# Make sure this directory contains sub-directories for the fine-tuned BERT and GPT2 models.

%cd '/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021'

/content/drive/.shortcut-targets-by-id/17GW73wWB3WJJKQXTdFCaAZX60y4ccAfV/GPT2_Jan28_2021


##A bit about the model

1. For sentence extraction, we can just use pre-trained DistilBERT, which speeds things up further.

2. For token classification, we will be using a fine-tuned BERT-base-cased model.

3. For the GPT2 model, we'll be using GPT2DoubleHeadsModel. The "double heads" in the name means the model is trained on both language modeling and multiple-choice sentence prediction, and it outputs two losses: the LM (language modeling) loss and the MC (multiple-choice) loss.

In [16]:
#@title Choose Model Config and Weights
# Form cell: names of the pre-trained weights and the on-disk locations that
# the "Load models and tokenizers" cell reads from.

#@markdown Distil version is fine for this task
BERT_pretrained_weights = 'distilbert-base-uncased' #@param ["distilbert-base-uncased", "bert-base-uncased", "bert-base-cased"] {allow-input: true}

#@markdown for token classification we used 
BERTforTokenClassification_config_directory = '/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021' #@param {type:"string"}
token_label_files = '/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021/vocab.json' #@param {type:"string"}

# NOTE(review): this is the same folder as the BERT token-classifier directory
# above, and the recorded run of the loading cell stops with an OSError at the
# token classifier -- confirm each model has its own directory.
GPT2_config_directory = '/content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021' #@param {type:"string"}

# Echo the chosen settings so they are visible in the cell output.
for question, answer in (
    ('which BERT pre-trained ? ', BERT_pretrained_weights),
    ('where is BERT token classifier dir ? ', BERTforTokenClassification_config_directory),
    ('where is GPT2 dir ? ', GPT2_config_directory),
):
    print(question, answer)


which BERT pre-trained ?  distilbert-base-uncased
where is BERT token classifier dir ?  /content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021
where is GPT2 dir ?  /content/drive/MyDrive/Covid_Summary/GPT2_Jan28_2021


In [17]:
#@title Load models and tokenizers
#@markdown the models are big, these may take a few mins, read [here](https://huggingface.co/transformers/serialization.html) for more information

# 1) Pre-trained DistilBERT encoder and its tokenizer, both selected by
#    BERT_pretrained_weights.
print('----loading pre-trained BERT----')
BERT_pretrained = DistilBertModel.from_pretrained(BERT_pretrained_weights)
tokenizer_pretrained = DistilBertTokenizer.from_pretrained(BERT_pretrained_weights)

# 2) Label vocabulary for the token classifier, read from a JSON file.
#    POS_values is later indexed by predicted label id, so this assumes the
#    JSON key order matches the label index order -- TODO confirm.
print('----loading token labels----')
with open(token_label_files, 'r') as label_file:
    POS2idx = json.load(label_file)
POS_values = list(POS2idx)

# 3) BERT token classifier and its tokenizer, loaded from the same directory.
print('----loading BERT token classifier----')
BERT_token_classifier = BertForTokenClassification.from_pretrained(BERTforTokenClassification_config_directory)
tokenizer_token_classifier = BertTokenizer.from_pretrained(BERTforTokenClassification_config_directory)

# 4) GPT2 tokenizer (with the special tokens registered on it) and the
#    double-heads GPT2 model, both loaded from GPT2_config_directory.
print('----loading GPT2 summary generator----')
tokenizer_GPT2 = GPT2Tokenizer.from_pretrained(GPT2_config_directory)
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<|keyword|>', '<|summarize|>'],
}
tokenizer_GPT2.add_special_tokens(special_tokens)
GPT2_generator = GPT2DoubleHeadsModel.from_pretrained(GPT2_config_directory)

----loading pre-trained BERT----
----loading token labels----
----loading BERT token classifier----


OSError: ignored

In [None]:
#@title use GPU?

#@markdown check the box to indicate if GPU to be used for running any model?

use_GPU_BERT_pre_trained = False #@param {type:"boolean"}
use_GPU_BERT_token_classifier = False #@param {type:"boolean"}
use_GPU_GPT_generator = True #@param {type:"boolean"}

# Choose the device; when CUDA is missing, every per-model GPU switch above is
# forced off regardless of what was ticked in the form.
if torch.cuda.is_available():
  device = 'cuda'
  print('cuda is available')
  print('device is set to cuda')
else:
  device = 'cpu'
  print('cuda is not available')
  print('device is set to cpu')
  use_GPU_BERT_pre_trained = use_GPU_BERT_token_classifier = use_GPU_GPT_generator = False

# Report the effective settings.
print(' ')
for prompt, flag in (
    ('use GPU for pre-trained BERT?', use_GPU_BERT_pre_trained),
    ('use GPU for BERT token classifier ?', use_GPU_BERT_token_classifier),
    ('use GPU for GPT2?', use_GPU_GPT_generator),
):
  print(prompt, flag)

In [None]:
#@title Main text file
# Reads the text to summarize (from a file or from the pasted string), splits
# it into sentences, tokenizes each sentence with the DistilBERT tokenizer and
# builds the padded `input_ids` / `attention_mask` tensors.
#
# Raises ValueError if the text yields no sentences or if any tokenized
# sentence is longer than max_len.

#@markdown indicate the text file to be summarized
input_file = '/content/drive/MyDrive/Covid_Summary/covid.txt' #@param {type:"string"}
max_len = 500 #@param {type:"integer",max:512}

#@markdown or copy paste your input here and check the box
use_input_text = False #@param {type:"boolean"}
input_text = "'Two months after it was firstly reported, the novel coronavirus disease COVID-19 has already spread worldwide. However, the vast majority of reported infections have occurred in China. To assess the effect of early travel restrictions adopted by the health authorities in China, we have implemented an epidemic metapopulation model that is fed with mobility data corresponding to 2019 and 2020. This allows to compare two radically different scenarios, one with no travel restrictions and another in which mobility is reduced by a travel ban. Our findings indicate that i) travel restrictions are an effective measure in the short term, however, ii) they are ineffective when it comes to completely eliminate the disease. The latter is due to the impossibility of removing the risk of seeding the disease to other regions. Our study also highlights the importance of developing more realistic models of behavioral changes when a disease outbreak is unfolding.'" #@param {type:"string"}

if not use_input_text:
  # open the txt file that is included
  # Line breaks are turned into spaces (not removed) so the last word of one
  # line is not glued to the first word of the next, which would also hide
  # sentence boundaries from sent_tokenize.
  with open(input_file, 'r') as file:
    input_text = file.read().replace('\n', ' ')

# split text to sentences
paragraph_split = sent_tokenize(input_text)
if not paragraph_split:
  # Fail with a clear message instead of crashing on an empty length list below.
  raise ValueError('input text contains no sentences')

print('input text has',len(paragraph_split) ,'sentences.')

print('tokenizing sentences')

# One list of token ids (including the special tokens) per sentence.
input_tokens = [tokenizer_pretrained.encode(sentence, add_special_tokens=True)
                for sentence in paragraph_split]

longest_sentence = max(len(token_ids) for token_ids in input_tokens)
if longest_sentence > max_len:
  raise ValueError('sentence longer than the max_len')
if longest_sentence > 512:
  # Only reachable when max_len itself was set above 512.
  print('warning: sentence longer than 512')
  print('suggest to change max_len to 512, the remainder will be truncated')

# Right-pad every sentence with id 0 up to max_len.
input_ids = pad_sequences(input_tokens,
                          maxlen=max_len, dtype="long",
                          value=0,
                          truncating="post",
                          padding="post")

print('creating attention masks')

# 1 for a real token, 0 for padding (padding uses id 0, see value=0 above).
attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids]

input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_masks)

In [18]:
#@title Keyword extraction
# Tags every token of `topic_answer_string` with the BERT token classifier and
# keeps the tokens whose predicted label is in `list_to_pick`. The cell ends by
# displaying the keywords as a list of lines wrapped to the wrapper's width.

# Predicted labels whose tokens are kept as keywords (noun and verb tags).
list_to_pick = ['NN','NNP','NNPS','NNS','VBD','VB','VBZ','VBP']
# Special tokens and punctuation that are never reported as keywords.
tokens_to_drop = {'[CLS]','[SEP]','?','/','-','.','_','!','@','[',']'}
# Upper bound on the number of token ids fed to the classifier.
max_classifier_tokens = 510

# NOTE(review): topic_answer_string is not defined in any cell visible in this
# part of the notebook; it presumably comes from the sentence-extraction step.
# Fail fast with an actionable message instead of a bare NameError.
if 'topic_answer_string' not in globals():
  raise NameError("name 'topic_answer_string' is not defined; run the "
                  "sentence-extraction step that builds it before this cell")

tokenized_sentence = tokenizer_token_classifier.encode(
                      topic_answer_string)
if len(tokenized_sentence) > max_classifier_tokens:
  # Truncate, but keep the closing [SEP] that a plain slice would cut off.
  tokenized_sentence = (tokenized_sentence[:max_classifier_tokens - 1]
                        + [tokenizer_token_classifier.sep_token_id])
input_ids2 = torch.tensor([tokenized_sentence])

# Model and inputs must live on the same device.
classifier_device = device if use_GPU_BERT_token_classifier else 'cpu'
BERT_token_classifier = BERT_token_classifier.to(classifier_device)
input_ids2 = input_ids2.to(classifier_device)

with torch.no_grad():
  output2 = BERT_token_classifier(input_ids2)
# output2[0] holds the per-token scores; take the best label index per token.
label_indices = np.argmax(output2[0].to('cpu').numpy(), axis=2)

tokens = tokenizer_token_classifier.convert_ids_to_tokens(
                        input_ids2.to('cpu').numpy()[0])
# Glue WordPiece continuations ("##xyz") back onto the previous token; the
# merged word keeps the label predicted for its first piece.
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##") and new_tokens:
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(POS_values[label_idx])
        new_tokens.append(token)

print('finished keyword extraction ...')
print('the keywords are')

list_keywords = [token for token, label in zip(new_tokens, new_labels)
                 if label in list_to_pick and token not in tokens_to_drop]

list_keywords_str = ' '.join(list_keywords)
wrapper.wrap(list_keywords_str)

NameError: ignored