In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import pandas as pd
import pickle
import tensorflow as tf

from datasets import Dataset
from huggingface_hub import notebook_login
from transformers import AutoTokenizer
from transformers import create_optimizer
from transformers import DataCollatorForLanguageModeling
from transformers import DistilBertTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline
from transformers import TFAutoModelForMaskedLM
from transformers.keras_callbacks import PushToHubCallback

In [2]:
#notebook_login()

Initially, load the DistilBERT base model and tokenizer. 

In [3]:
model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


Check to see if GPUs are available. 

In [4]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

Open the pickled list of data scraped from geovernment reports. 

In [5]:
text_file = open("../convert.pkl", "rb")
text = pickle.load(text_file)
text_file.close()

These processing of data loading model training is all based upon the tutorial provided on https://huggingface.co/course/chapter7/3?fw=tf/

In [6]:
chunk_size = 512

def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [7]:
clean_text= []
for item in text:
    item.replace('\n', '')
    item.replace('\t', '')
    if sum(c.isdigit() for c in item)/len(item) > 0.1 or len(item) == 0: 
        clean_text.append(item)

text_df = pd.DataFrame({'text':clean_text})
dataset = Dataset.from_pandas(text_df)   

Check the number of reports that are predominantly clean text. 

In [8]:
len(dataset)

950

In [9]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (90154 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/950 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Check the number of resultant chunks that meet the criteria. 

In [12]:
len(lm_datasets)

51634

In [11]:
train_size = round(0.9*len(lm_datasets))
test_size = len(lm_datasets) - train_size

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)

In [11]:
tf_train_dataset = model.prepare_tf_dataset(
    downsampled_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)
tf_eval_dataset = model.prepare_tf_dataset(
    downsampled_dataset["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
num_train_steps = len(tf_train_dataset)

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

#model_name = model_checkpoint.split("/")[-1]
#callback = PushToHubCallback(
#    output_dir=f"{model_name}-finetuned-imdb", tokenizer=tokenizer
#)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0


In [13]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, batch_size=16, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa09408f2b0>

In [14]:
query = "The mineral [MASK] is found in the rock [MASK]"

In [15]:
unmasker = pipeline('fill-mask', model='distilbert-base-uncased')
unmasker("The mineral [MASK] is found in the rock [MASK].")

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

[[{'score': 0.03453386202454567,
   'token': 26427,
   'token_str': 'carbonate',
   'sequence': '[CLS] the mineral carbonate is found in the rock [MASK]. [SEP]'},
  {'score': 0.032245464622974396,
   'token': 4221,
   'token_str': '##ite',
   'sequence': '[CLS] the mineralite is found in the rock [MASK]. [SEP]'},
  {'score': 0.02590017393231392,
   'token': 19057,
   'token_str': 'chloride',
   'sequence': '[CLS] the mineral chloride is found in the rock [MASK]. [SEP]'},
  {'score': 0.023440266028046608,
   'token': 15772,
   'token_str': 'oxide',
   'sequence': '[CLS] the mineral oxide is found in the rock [MASK]. [SEP]'},
  {'score': 0.02240598201751709,
   'token': 3514,
   'token_str': 'oil',
   'sequence': '[CLS] the mineral oil is found in the rock [MASK]. [SEP]'}],
 [{'score': 0.18109388649463654,
   'token': 13197,
   'token_str': 'formations',
   'sequence': '[CLS] the mineral [MASK] is found in the rock formations. [SEP]'},
  {'score': 0.12895970046520233,
   'token': 22913,


In [16]:
unmasker = pipeline('fill-mask', model=model, tokenizer = tokenizer)
unmasker("The mineral [MASK] is found in the rock [MASK].")

[[{'score': 0.10689971596002579,
   'token': 3989,
   'token_str': '##ization',
   'sequence': '[CLS] the mineralization is found in the rock [MASK]. [SEP]'},
  {'score': 0.06298457831144333,
   'token': 4221,
   'token_str': '##ite',
   'sequence': '[CLS] the mineralite is found in the rock [MASK]. [SEP]'},
  {'score': 0.05126945674419403,
   'token': 15707,
   'token_str': '##ogy',
   'sequence': '[CLS] the mineralogy is found in the rock [MASK]. [SEP]'},
  {'score': 0.04947109520435333,
   'token': 4180,
   'token_str': 'content',
   'sequence': '[CLS] the mineral content is found in the rock [MASK]. [SEP]'},
  {'score': 0.027343131601810455,
   'token': 3430,
   'token_str': 'material',
   'sequence': '[CLS] the mineral material is found in the rock [MASK]. [SEP]'}],
 [{'score': 0.07103662192821503,
   'token': 4195,
   'token_str': 'formation',
   'sequence': '[CLS] the mineral [MASK] is found in the rock formation. [SEP]'},
  {'score': 0.06592454761266708,
   'token': 13197,
   '

In [17]:
model.save_pretrained("GeoDistilBERT.h5")