In [1]:
from transformers import BertModel, BertTokenizer, BertConfig, BertForMaskedLM
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import random
import logging
import math

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [2]:
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
random.seed(0)

In [3]:
# Load pre-trained model tokenizer (vocabulary) and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForMaskedLM.from_pretrained('bert-base-cased')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [4]:
def tokenize_input(text):
    tokenized_text = tokenizer.tokenize(text)
    
    sentence1 = tokenized_text[:tokenized_text.index(".")+1]
    sentence2 = tokenized_text[tokenized_text.index(".")+1:]
    
    sentence1 = ["[CLS]"] + sentence1
    sentence2 = ["[SEP]"] + sentence2 + ["[SEP]"]
    
    segments_ids = [0]*len(sentence1) + [1]*len(sentence2)
    new_tokenized_text = sentence1 + sentence2
    
    #print(len(sentence1))
    
    return new_tokenized_text, segments_ids

In [8]:
def mask_tokens(tokenized_text, mask_token_ids):
    for ids in mask_token_ids:
        #print(tokenized_text[id])
        tokenized_text[ids] = '[MASK]'
                
    # Convert tokenized text to their index
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    
    return tokenized_text, indexed_tokens

In [9]:
def produce_new_sentence(masked_tokenize, mask_token_ids, predictions):
    predicted = []

    for i in mask_token_ids:
        predicted_index = torch.argmax(predictions[0, i]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

        predicted.append(predicted_token)
    
    print(mask_token_ids)
    print(predicted)

    for word in predicted:
        mask_index = masked_tokenize.index("[MASK]")
        masked_tokenize[mask_index] = word

    final = ' '.join(masked_tokenize).replace(" ##", '').replace("[CLS] ", "").replace("[SEP] ", "").replace("[SEP]", "")

    print(final)
    
    return final

In [10]:
text = """My countrymen and women of the radio audience Of the untold values of the radio, one is the great intimacy it has brought among our people. 
The greatest strength of the radio is the intimacy of its listeners."""

cycles = 40

#number_masked = int((len(tokenized_text) - period_loc+1) / 5)
number_masked = 1

tokenized_text, segments_ids = tokenize_input(text)

for c in range(cycles):
    period_loc = tokenized_text.index(".")
    
    mask_token_ids = sorted(random.sample(range(period_loc+2, len(tokenized_text)-1), number_masked))

    masked_tokenize, indexed_tokens = mask_tokens(tokenized_text, mask_token_ids)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    tokens_tensor = tokens_tensor.to('cuda')
    segments_tensors = segments_tensors.to('cuda')
    model.to('cuda')

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0]

    tokenized_text = produce_new_sentence(masked_tokenize, mask_token_ids, predictions)
    
    

[32]
['The']
My countrymen and women of the radio audience Of the untold values of the radio , one is the great intimacy it has brought among our people . The greatest strength of the radio is the intimacy of its listeners . 


TypeError: 'str' object does not support item assignment

In [35]:
# Round 2: Potentially Growing


def tokenize_input2(text):
    tokenized_text = tokenizer.tokenize(text)
    
    sentence1 = tokenized_text[:tokenized_text.index(".")+1]
    sentence2 = tokenized_text[tokenized_text.index(".")+1:]
    
    sentence1 = ["[CLS]"] + sentence1
    sentence2 = ["[SEP]"] + sentence2 + ["[SEP]"]
    
    new_tokenized_text = sentence1 + sentence2
        
    return new_tokenized_text, sentence1, sentence2


def mask_tokens(tokenized_text, mask_token_ids):
    for id in mask_token_ids:
        #print(f"{id} ie. {tokenized_text[id]} is masked")
        tokenized_text[id] = '[MASK]'
    
    #Convert tokenized text to their index
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    
    return tokenized_text, indexed_tokens


def predict_token(masked_tokenize, mask_token_ids, predictions):
    predicted = []

    for i in mask_token_ids:
        predicted_index = torch.argmax(predictions[0, i]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

        predicted.append(predicted_token)

    for word in predicted:
        mask_index = masked_tokenize.index("[MASK]")
        
        #print(f"Masked index: {mask_index} became {word}")
        masked_tokenize[mask_index] = word
        
        if word == "." and masked_tokenize[mask_index + 1] != "[SEP]":
            masked_tokenize.insert(mask_index + 1,"[SEP]")          
    
    return masked_tokenize

def run(text, cycles, mask_type, mask_value):
    
    tokenized_text, sentence1, sentence2 = tokenize_input2(text)
    period_loc = tokenized_text.index(".")
    segments_ids = [0]*len(sentence1) + [1]*len(sentence2)
    
    masked_tokenize_prior_length = 99999
    tokenized_text_length = len(tokenized_text)
    
    for c in range(cycles):
        random.seed(c)
        
        if mask_type == 'frac':
            number_masked = int((tokenized_text_length - period_loc+1) * mask_value)
        elif mask_type == 'val':
            number_masked = mask_value
            
        
        mask_token_ids = sorted(random.sample(range(period_loc+2, tokenized_text_length-1), number_masked))
        masked_tokenize, indexed_tokens = mask_tokens(tokenized_text, mask_token_ids)

        if len(tokenized_text) > tokenized_text_length:
            difference = len(tokenized_text) - tokenized_text_length
            segments_ids = segments_ids + [1]*difference
            tokenized_text_length += difference

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        tokens_tensor = tokens_tensor.to('cuda')
        segments_tensors = segments_tensors.to('cuda')
        model.to('cuda')

        # Predict all tokens
        with torch.no_grad():
            outputs = model(tokens_tensor, token_type_ids=segments_tensors)
            predictions = outputs[0]

        tokenized_text = predict_token(masked_tokenize, mask_token_ids, predictions)

        #print(tokenized_text)
        #print("==============================")
    
    tokenized_text.remove("[CLS]")
    tokenized_text.remove("[SEP]")
    tokenized_text.remove("[SEP]")
    
    result = ' '.join(tokenized_text)
    
    return result

In [12]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')

def score(sentence):
    tokenize_input = gpt_tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    
    loss=gpt_model(tensor_input, labels=tensor_input)[0].item()
    
    return math.exp(loss)

In [13]:
text = """All of which comprises a very substantial reserve in the country at the present time . The Reserve Bank of India has also made a large amount of deposits , which is a good thing ."""

In [14]:
score(text)

2.820022077396162

In [15]:
cycles = 100
number_masked = 2
result = run(text, cycles, 'frac', 0.2)
print(result)
score(result)

All of which comprises a very substantial reserve in the country at the present time . Although the government is not directly directly involved , in the present case , this is the the case .


2.820022077396162

In [17]:
cycles = 250
number_masked = 2
result = run(text, cycles, 'frac', 0.2)
print(result)
score(result)

All of which comprises a very substantial reserve in the country at the present time . However , it will not be separated from it like other other species if its range is fully developed .


2.820022077396162

In [18]:
cycles = 100
number_masked = 1
result = run(text, cycles, 'frac', 0.2)
print(result)
score(result)

All of which comprises a very substantial reserve in the country at the present time . Although the government is not directly directly involved , in the present case , this is the the case .


2.820022077396162

In [19]:
cycles = 500
number_masked = 2
result = run(text, cycles, 'frac', 0.2)
print(result)
score(result)

All of which comprises a very substantial reserve in the country at the present time . The rest of the park is not listed on the 2008 IUCN red list , because of a landslide .


10.86345698713058

In [20]:
cycles = 1000
number_masked = 2
result = run(text, cycles, 'frac', 0.2)
print(result)
score(result)

All of which comprises a very substantial reserve in the country at the present time . All the existing protected areas are being developed , and this will will encourage further development of the resources .


2.820022077396162

In [21]:
cycles = 100
number_masked = 3
result = run(text, cycles, 'frac', 0.2)
print(result)
score(result)

All of which comprises a very substantial reserve in the country at the present time . Although the government is not directly directly involved , in the present case , this is the the case .


2.820022077396162

In [22]:
cycles = 100
number_masked = 4
result = run(text, cycles, 'frac', 0.2)
print(result)
score(result)

All of which comprises a very substantial reserve in the country at the present time . Although the government is not directly directly involved , in the present case , this is the the case .


2.820022077396162

In [23]:
cycles = 100
number_masked = 5
result = run(text, cycles, 'frac', 0.2)
print(result)
score(result)

All of which comprises a very substantial reserve in the country at the present time . Although the government is not directly directly involved , in the present case , this is the the case .


2.820022077396162

In [87]:
# def score2(original, predicted):
#     tokenize_original = gpt_tokenizer.tokenize(original)
#     tensor_original = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_original)])
    
#     tokenize_predicted = gpt_tokenizer.tokenize(predicted)
#     tensor_predicted = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_predicted)])

#     loss_fct = torch.nn.CrossEntropyLoss()
    
#     print(tensor_original.squeeze())
    
#     #loss = loss_fct(tensor_predicted.squeeze(),tensor_original.squeeze()).data 

#     return math.exp(loss)

In [36]:
fh = open('../2.ModelingGPT2andXLNet/testing_data_run/combined_data/test_gpt2_results.txt', 'r').read().splitlines()
out = open('gpt2_BERT_result.txt', 'w')

for line in fh:
    try:
        line = line.split("|")
    except:
        continue
    
    
    if len(line) == 3:
        sentence_original = line[0] + ' ' + line[1]
        original_score = score(sentence_original)

        sentence_gpt_predicted = line[0] + ' ' + line[2]
        gpt_score = score(sentence_gpt_predicted)
        
        try:
            sentence_gpt_predicted_BERTSmashed = run(sentence_gpt_predicted, 100, 'val', 2)
        except:
            print(sentence_gpt_predicted)
        
        bert_sentence = sentence_gpt_predicted_BERTSmashed.split(".")[1]
        bert_score = score(sentence_gpt_predicted_BERTSmashed)

        #print(f"{sentence_original}: {original_score}")
        #print(f"{sentence_gpt_predicted}: {gpt_score}")
        #print(f"{sentence_gpt_predicted_BERTSmashed}: {gpt_score}")

        #print("##########################")

        output = f'<RES>{line[0]}|{line[1]}|{line[2]}|{bert_sentence}|{original_score}|{gpt_score}|{bert_score}\n'
        #print(output)

        out.write(output)
    

ValueError: Sample larger than population or is negative