### BERT pre-trained models

https://huggingface.co/transformers/pretrained_models.html  

In [1]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM, logging

# Restrict the transformers logger to errors so informational and warning
# messages do not clutter the cell outputs.
logging.set_verbosity_error()
# Display the active verbosity level to confirm the setting took effect
# (the recorded output is 40).
logging.get_verbosity()

40

#### Model inference works fine on both CPUs and GPUs. For fine-tuning, GPUs are necessary.

In [2]:
# Report which kind of device is available to PyTorch in this session.
use_device = 'GPU' if torch.cuda.is_available() else 'CPU'

# Uncomment to force the CPU label regardless of what was detected:
# use_device = 'CPU'

print("Use " + use_device)

Use GPU


#### Copy files to local FS from GCP bucket

In [3]:
# Create the local data directories; `-p` makes this a no-op when they already exist.
!mkdir -p /home/jupyter/data/bert
!mkdir -p /home/jupyter/data/transformers

In [4]:
# !gsutil -m cp -r -n 'gs://msca-bdp-data-open/bert' '/home/jupyter/data/'

#### By default, Hugging Face downloads pre-trained models and vocabularies into a `.cache` directory (here `/root/.cache/`)
- We can change this behavior by passing an explicit `cache_dir` whenever a model or tokenizer is loaded, so that all Hugging Face files go to a directory of our choice

In [5]:
# Show what is currently stored in the default cache location.
!ls -l /root/.cache/

total 16
drwxr-xr-x 3 root root 4096 Jul 26 17:32 huggingface
drwxr-xr-x 2 root root 4096 Jun 16 17:35 matplotlib
drwxr-xr-x 1 root root 4096 Jul 23 15:30 pip


In [6]:
# Directory passed as `cache_dir` to `from_pretrained`, so that downloaded
# Hugging Face models and vocabularies are stored here.
cache_dir = '/home/jupyter/data/transformers'

#### Note that BERT input sequences always start with the `[CLS]` token and end with `[SEP]` (`tokenizer.encode` adds both by default)

In [7]:
# (model, tokenizer) pairs already loaded in this session, keyed by
# (model name, cache directory), so repeated calls reuse them instead of
# re-reading the weights from disk for every prediction.
_MASKED_LM_CACHE = {}


def _load_masked_lm(model_name):
    """Return the ``(model, tokenizer)`` pair for ``model_name``, loading it on first use."""
    key = (model_name, cache_dir)
    if key not in _MASKED_LM_CACHE:
        model = BertForMaskedLM.from_pretrained(model_name, cache_dir=cache_dir)
        tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        model.eval()  # inference only
        _MASKED_LM_CACHE[key] = (model, tokenizer)
    return _MASKED_LM_CACHE[key]


def get_mask_prediction(text, top_k=3, model_name='bert-base-uncased'):
    """Predict the most likely tokens for every ``[MASK]`` in ``text``.

    Parameters
    ----------
    text : str
        Sentence containing zero or more ``[MASK]`` markers. A leading
        ``[CLS]`` and a trailing ``[SEP]`` are accepted but not required.
    top_k : int, optional
        Number of candidate tokens to return per mask (default 3).
    model_name : str, optional
        Name of the pre-trained BERT checkpoint to load
        (default ``'bert-base-uncased'``).

    Returns
    -------
    list[list[str]]
        One inner list per ``[MASK]``, in order of appearance, holding the
        ``top_k`` predicted tokens from most to least likely. Empty when
        ``text`` contains no mask.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    model, tokenizer = _load_masked_lm(model_name)

    # `encode` adds [CLS]/[SEP] itself, so drop explicit outer markers first;
    # otherwise the model is fed duplicated special tokens.
    body = text.strip()
    if body.startswith(tokenizer.cls_token):
        body = body[len(tokenizer.cls_token):]
    if body.endswith(tokenizer.sep_token):
        body = body[:-len(tokenizer.sep_token)]

    input_ids = tokenizer.encode(body, return_tensors='pt')

    # Positions of the [MASK] tokens within the (single) encoded sequence.
    mask_positions = torch.where(input_ids[0] == tokenizer.mask_token_id)[0].tolist()

    # Forward pass; the first output holds the vocabulary logits per position.
    with torch.no_grad():
        logits = model(input_ids)[0]

    all_predictions = []
    for position in mask_positions:
        # Softmax preserves ordering, so the top-k logits are the top-k probabilities.
        top_ids = logits[0, position].topk(top_k).indices.tolist()
        all_predictions.append(tokenizer.convert_ids_to_tokens(top_ids))

    return all_predictions

def replace_mask_with_preds(text, predictions):
    """Print ``text`` once per predicted token, with the token filling its own ``[MASK]``.

    Parameters
    ----------
    text : str
        Sentence containing ``[MASK]`` markers; any ``[CLS]`` / ``[SEP]``
        markers are removed before printing.
    predictions : list[list[str]]
        One list of candidate tokens per mask, in the order the masks appear
        in ``text``.

    Returns
    -------
    None
        Each completed sentence is printed, stripped of surrounding whitespace.
        The candidates for the n-th mask are substituted into the n-th
        ``[MASK]``; the other masks are left untouched.
    """
    mask = '[MASK]'
    # The cleanup does not depend on the prediction, so do it once.
    text_clean = text.replace('[CLS]', '').replace('[SEP]', '')
    # Text pieces around the masks: piece i sits before mask i.
    pieces = text_clean.split(mask)

    for mask_pos, preds in enumerate(predictions):
        has_own_mask = mask_pos + 1 < len(pieces)
        if has_own_mask:
            before = mask.join(pieces[:mask_pos + 1])
            after = mask.join(pieces[mask_pos + 1:])
        for pred in preds:
            if has_own_mask:
                text_replaced = before + pred + after
            else:
                # More prediction lists than masks: fall back to filling the
                # first mask (a no-op when the text has no mask at all).
                text_replaced = text_clean.replace(mask, pred, 1)
            print(text_replaced.strip())

In [8]:
# Single-[MASK] prompt: print one completed sentence per predicted token.
maskedText = "[CLS] It is time to go [MASK], the class is over. [SEP]"

predictions = get_mask_prediction(maskedText)
replace_mask_with_preds(maskedText, predictions)

It is time to go home, the class is over.
It is time to go out, the class is over.
It is time to go back, the class is over.


In [9]:
# The mask sits in an adjective slot after "very".
maskedText = '''[CLS] TA in our NLP class is very [MASK]. [SEP]'''

predictions = get_mask_prediction(maskedText)
replace_mask_with_preds(maskedText, predictions)

TA in our NLP class is very good.
TA in our NLP class is very important.
TA in our NLP class is very different.


In [10]:
# The mask follows "falling", so the preceding clause ("so boring") is the only disambiguating context.
maskedText = '''[CLS] NLP class is so boring, I am falling [MASK]. [SEP]'''

predictions = get_mask_prediction(maskedText)
replace_mask_with_preds(maskedText, predictions)

NLP class is so boring, I am falling asleep.
NLP class is so boring, I am falling apart.
NLP class is so boring, I am falling behind.


In [11]:
# The mask sits between an article and a product name.
maskedText = '''[CLS] I would like to buy a [MASK] iPhone. [SEP]'''

predictions = get_mask_prediction(maskedText)
replace_mask_with_preds(maskedText, predictions)

I would like to buy a new iPhone.
I would like to buy a pink iPhone.
I would like to buy a real iPhone.


In [12]:
# Weather prompt whose context (coat, hat and gloves) comes after the mask.
maskedText = '''[CLS] It is so [MASK] outside, I have to wear a coat, hat and gloves. [SEP]'''

predictions = get_mask_prediction(maskedText)
replace_mask_with_preds(maskedText, predictions)

It is so cold outside, I have to wear a coat, hat and gloves.
It is so hot outside, I have to wear a coat, hat and gloves.
It is so warm outside, I have to wear a coat, hat and gloves.


In [13]:
# Same opening as the coat/hat/gloves prompt, but the trailing context is "sweating".
maskedText = '''[CLS] It is so [MASK] outside, I am sweating. [SEP]'''

predictions = get_mask_prediction(maskedText)
replace_mask_with_preds(maskedText, predictions)

It is so hot outside, I am sweating.
It is so cold outside, I am sweating.
It is so warm outside, I am sweating.


In [14]:
# Prompt without final punctuation.
# NOTE(review): "participate it the class" looks like a typo for "in the class";
# left unchanged because the string is the model input for the recorded output.
maskedText = '''[CLS] It is so late my students are [MASK] to participate it the class [SEP]'''

predictions = get_mask_prediction(maskedText)
replace_mask_with_preds(maskedText, predictions)

It is so late my students are unable to participate it the class
It is so late my students are forced to participate it the class
It is so late my students are going to participate it the class


In [15]:
# The mask sits between the subject and the verb.
maskedText = '''[CLS] We [MASK] review the Final Project [SEP]'''

predictions = get_mask_prediction(maskedText)
replace_mask_with_preds(maskedText, predictions)

We will review the Final Project
We shall review the Final Project
We must review the Final Project


In [16]:
import datetime
import pytz

# Timestamp of this run, expressed in the US/Central time zone.
datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Wed, 26 July 2023 12:36:44'