In [4]:
## imports

from datasets import load_dataset
import json
import transformers
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from types import SimpleNamespace   

# config = {}
# config['epochs'] = 10
# config['batch_size'] = 8
# config['lr'] = .00001
# config['tokenizer_max_length'] = 512
# config['file_min_words'] = 8
# config['input_path'] = '../inputs/pubmed-targets-1'
# config['output'] = './pharma-text-model'
# config['tokenizer'] = 'allenai/biomed_roberta_base'
# config['input_model'] = 'allenai/biomed_roberta_base'

# conf = SimpleNamespace(**config)

# with open("./config/pharma-text.json", "w") as outfile:
#     json.dump(config, outfile)

In [5]:
## read config
# Parse the JSON settings file and expose its keys as attributes on `conf`
# (e.g. conf.epochs, conf.batch_size) for the cells below.
config = {}
with open("./config/pharma-text.json", "r") as config_file:
    config = json.loads(config_file.read())

conf = SimpleNamespace(**config)
print(conf)

namespace(batch_size=8, epochs=10, file_min_words=8, input_model='allenai/biomed_roberta_base', input_path='../inputs/pubmed-targets-1', lr=1e-05, output='./pharma-text-model', tokenizer='allenai/biomed_roberta_base', tokenizer_max_length=512)


In [None]:
## text dataset
import os
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer and masked-LM weights are both resolved from the config so the
# starting checkpoint can be swapped without editing this cell.
tokenizer = RobertaTokenizer.from_pretrained(conf.tokenizer)
model = RobertaForMaskedLM.from_pretrained(conf.input_model)

# Feed every entry directly under the input directory to the generic "text" loader.
dataset = load_dataset('text', data_files=os.path.join(conf.input_path, '*'))

# Fail early, with a pointer to the config, when nothing was loaded.
if not len(dataset['train']):
    raise FileNotFoundError(
        f"No text files were found under {conf.input_path}. Please update trainer/config/pharma-text.json to point to a directory with training data."
    )

# print(dir(dataset['train']))
# print(dataset['train'].items())

import torch.utils.data


class dummy_data(torch.utils.data.Dataset):
    """Minimal map-style dataset that wraps already-tokenized inputs.

    The class only needs the generic ``__len__`` / ``__getitem__`` protocol,
    so it derives from ``torch.utils.data.Dataset`` rather than the deprecated
    ``transformers.TextDataset`` (whose file-reading ``__init__`` was never
    called here anyway).

    Args:
        inputs: indexable collection with one entry of token ids per example
            (in this notebook, the ``input_ids`` produced by the tokenizer).
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return example ``index`` as a dict with a single ``input_ids`` key."""
        return {"input_ids": self.inputs[index]}

## filter out lines with fewer than some minimum words; they are likely not cohesive sentences
def filter_data(d, min_words=conf.file_min_words):
    """Return True when the example's text has at least ``min_words`` words.

    Args:
        d: mapping with a ``'text'`` key holding the line to check.
        min_words: minimum number of whitespace-separated words required;
            defaults to the configured ``file_min_words``.

    Returns:
        bool: True if the example should be kept.
    """
    # split() with no argument splits on any run of whitespace and drops empty
    # strings, so repeated/leading/trailing spaces and tabs do not inflate the
    # word count the way split(' ') does.
    return len(d['text'].split()) >= min_words

# Drop lines that are too short to be useful sentences.
dataset = dataset.filter(filter_data)
print(dataset['train'].data[:5])

# Fail with a clear message instead of an obscure tokenizer error when the
# filter removed every line.
if len(dataset['train']) == 0:
    raise ValueError(
        f"No lines with at least {conf.file_min_words} words remain after filtering {conf.input_path}."
    )

# dataset.train_test_split(test_size=0.1)

# Truncate to the configured length (the same limit the feature-extraction
# pipeline uses later) rather than relying on the tokenizer's own default.
# The text column is materialized as a plain list for the batch tokenizer call.
tokenized_data = tokenizer(
    list(dataset['train']['text']),
    return_tensors="pt",
    truncation=True,
    max_length=conf.tokenizer_max_length,
    padding=True,
)

print('Data size: ', len(tokenized_data['input_ids']))

# Wrap the token ids so the Trainer can index them as a dataset.
data_new = dummy_data(tokenized_data['input_ids'])



In [20]:
## training parameters for retraining on text dataset
## Reference: https://towardsdatascience.com/transformers-retraining-roberta-base-using-the-roberta-mlm-procedure-7422160d5764

# Masked-language-modeling collator: masks 15% of tokens in each batch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir=conf.output,
    overwrite_output_dir=True,
    num_train_epochs=conf.epochs,
    per_device_train_batch_size=conf.batch_size,
    # Use the learning rate from the config file; without this argument
    # TrainingArguments falls back to its own default and `conf.lr` is ignored.
    learning_rate=conf.lr,
    save_steps=200,
    save_total_limit=2,
    seed=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=data_new
)

In [None]:
## train the model
# Run training with the arguments, collator and dataset configured on `trainer`.
trainer.train()

# Write the trained model to the configured output path; the verification
# cell loads the model back from this same path.
trainer.save_model(conf.output)

In [None]:
## Verify model
from scipy.spatial.distance import cosine
from transformers import pipeline

tokenizer = RobertaTokenizer.from_pretrained(conf.tokenizer)

# Feature-extraction pipeline over the freshly saved model.
feature_extraction = pipeline(
    "feature-extraction",
    model=conf.output,
    tokenizer=tokenizer,
    max_length=conf.tokenizer_max_length,
    truncation=True,
)

# Sanity check on three hand-written sentences: two about books, one unrelated.
sample_sentences = [
    "This book is interesting. I would read it",
    "I read this Jeffrey Archer book yesterday.",
    "The latest batman movie is a waste of money.",
]
features = feature_extraction(sample_sentences)
# features[i][0][0] is the vector at token position 0 of input i
# (presumably the <s> token — confirm).
embed1, embed2, embed3 = (features[i][0][0] for i in (0, 1, 2))
# NOTE: scipy's `cosine` is a distance (1 - cosine similarity), so smaller
# values mean more similar embeddings.
print('first two cosine: ', cosine(embed1, embed2))
print('last two cosine: ', cosine(embed2, embed3))

# Same check on the first few lines of the training data.
features = feature_extraction(dataset['train']['text'][:5])
# Print the length at each nesting level of the pipeline output.
level = features
for label in ('Features: ', 'Features[0]', 'Features[0][0]', 'Features[0][0][0]'):
    print(label, len(level))
    level = level[0]
embed1, embed2, embed3 = (features[i][0][0] for i in (0, 1, -1))
print('first two cosine: ', cosine(embed1, embed2))
print('last two cosine: ', cosine(embed2, embed3))



In [13]:
## save embeddings to disk
from tqdm import tqdm
from table_trainer_utils import write_csv
import os
import torch

output_ids_path = '../features/ukopen-textids.list'
output_embed_path = '../features/ukopen-textfeatures.pt'

text_ids = []
text_input = []

# scandir holds an open directory handle; the context manager releases it
# as soon as the listing has been consumed.
with os.scandir(conf.input_path) as entries:
    for entry in tqdm(entries):
        if entry.is_dir():
            continue
        text_ids.append(entry.name)
        # Only the first line of each file is embedded.
        # NOTE(review): confirm each input file is expected to hold a single line.
        # Explicit UTF-8 avoids depending on the platform's default encoding, and
        # the trailing newline is stripped so it is not fed to the tokenizer.
        with open(entry.path, 'r', encoding='utf-8') as f:
            text_input.append(f.readline().rstrip('\n'))

# Stop before writing anything if there is nothing to embed.
if not text_input:
    raise FileNotFoundError(
        f"No files were found under {conf.input_path}; nothing to embed."
    )

# Make sure the output directory exists before writing either artifact.
for out_path in (output_ids_path, output_embed_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# File names are written in the same order as the embedding rows below.
write_csv(output_ids_path, text_ids)

features = feature_extraction(text_input)
print('Features: ', len(features))
print('Features[0]', len(features[0]))
print('Features[0][0]', len(features[0][0]))
print('Features[0][0][0]', len(features[0][0][0]))

# One row per input: the vector at token position 0 of each document.
text_embeds = torch.tensor([feature[0][0] for feature in features])
assert text_embeds.shape[0] == len(text_ids), "ids and embeddings are misaligned"

print('shape of output: ', text_embeds.shape)
torch.save(text_embeds, output_embed_path)

1380it [00:00, 48824.05it/s]


Features:  1380
Features[0] 1
Features[0][0] 193
Features[0][0][0] 768


1380it [00:00, 25170.42it/s]

shape of output:  torch.Size([1380, 768])



