In [None]:
# Shell out to `nvidia-smi` to show which GPUs are available before training.
!nvidia-smi

In [1]:
from transformers import RobertaTokenizer, RobertaTokenizerFast
from transformers import RobertaForSequenceClassification
from transformers import RobertaConfig, RobertaModel, RobertaForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
# Load the pretrained "roberta-large" checkpoint with its masked-language-model
# head; this is the model that the Trainer further pre-trains later on.
maskedlm_model = RobertaForMaskedLM.from_pretrained("roberta-large")

Downloading: 100%|██████████| 1.33G/1.33G [02:07<00:00, 11.2MB/s]


In [2]:
# Load the fast tokenizer that matches the "roberta-large" checkpoint.
# The BERT-style `do_lower_case` flag is intentionally not passed: it does not
# lower-case anything for this tokenizer (tokenizing 'COVID' still yields the
# cased pieces ['CO', 'VID']), so keeping it would only mislead readers and end
# up in the saved tokenizer config.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large")

In [3]:
# Show how the stock vocabulary breaks the pandemic terms into sub-word pieces.
for term in ('COVID', 'coronavirus'):
    pieces = tokenizer.tokenize(term)
    print(pieces)

['CO', 'VID']
['cor', 'on', 'av', 'irus']


In [4]:
# Register each pandemic term as a single token and report the vocabulary size
# before and after. The insertion order is kept as-is because it determines the
# ids (and therefore the embedding rows) assigned to the new tokens.
print(len(tokenizer))
for new_token in ("COVID", "Covid", "covid", "coronavirus"):
    tokenizer.add_tokens([new_token])
print(len(tokenizer))

50265
50269


In [5]:
# After extending the vocabulary, each term should come back as one piece.
for term in ('COVID', 'Covid', 'covid', 'coronavirus'):
    pieces = tokenizer.tokenize(term)
    print(pieces)

['COVID']
['Covid']
['covid']
['coronavirus']


In [6]:
# Tokenize a full sentence to see how the added tokens behave in context.
# NOTE(review): in the recorded output the space in front of each added token
# appears as a standalone 'Ġ' piece rather than being merged into the token.
test_sentence = "COVID Covid covid and coronavirus are bad"
sentence_pieces = tokenizer.tokenize(test_sentence)
print(sentence_pieces)

['COVID', 'Ġ', 'Covid', 'Ġ', 'covid', 'Ġand', 'Ġ', 'coronavirus', 'Ġare', 'Ġbad']


In [None]:
# Give the four added tokens a sensible starting embedding: the mean of three
# related, already-trained token embeddings, instead of the random rows created
# when the embedding matrix is resized. The "coronavirus" row is plotted before
# and after as a visual sanity check.


def lookup_token_id(token, allow_word_initial=False):
    """Return the vocabulary id of `token` without ever falling back to <unk>.

    `tokenizer.convert_tokens_to_ids` returns the unknown-token id for strings
    that are not in the vocabulary, which would silently select the wrong
    embedding row. This helper makes that case an explicit error.

    Args:
        token: Token string to look up in the notebook-level `tokenizer`.
        allow_word_initial: If True and `token` itself is not in the
            vocabulary, also try the "Ġ"-prefixed form (the form the tokenizer
            emits for a token preceded by a space, e.g. 'Ġand').

    Returns:
        The integer id of the first candidate found in the vocabulary.

    Raises:
        ValueError: If none of the candidates is a real vocabulary entry.
    """
    candidates = [token, "Ġ" + token] if allow_word_initial else [token]
    for candidate in candidates:
        token_id = tokenizer.convert_tokens_to_ids(candidate)
        if token_id != tokenizer.unk_token_id:
            return token_id
    raise ValueError(f"{token!r} is not in the tokenizer vocabulary")


# One resize is enough: it grows the embedding matrix to one row per tokenizer
# entry, and repeating it with the same size changes nothing.
maskedlm_model.resize_token_embeddings(len(tokenizer))
embedding_matrix = maskedlm_model.get_input_embeddings().weight

# Address the new rows by their actual token ids rather than by negative
# indices, so the code does not depend on the order the tokens were added in.
new_token_ids = {
    token: lookup_token_id(token)
    for token in ["COVID", "Covid", "covid", "coronavirus"]
}
coronavirus_id = new_token_ids["coronavirus"]

# `.copy()` detaches the snapshot from the weight's memory; without it the
# array would change when the row is overwritten below.
random_vector = embedding_matrix[coronavirus_id].detach().cpu().numpy().copy()

plt.title("Randomly Initialized Vector")
plt.hist(random_vector, bins=50)
plt.show()

# Seed words whose trained embeddings are averaged.
pandemic_id = lookup_token_id("pandemic", allow_word_initial=True)
virus_id = lookup_token_id("virus", allow_word_initial=True)
respiratory_id = lookup_token_id("respiratory", allow_word_initial=True)

# no_grad: this is a manual weight initialisation, not part of training, so it
# must not be recorded by autograd.
with torch.no_grad():
    virus_embedding = embedding_matrix[virus_id]
    pandemic_embedding = embedding_matrix[pandemic_id]
    respiratory_embedding = embedding_matrix[respiratory_id]
    mean_embedding = torch.mean(
        torch.stack([virus_embedding, respiratory_embedding, pandemic_embedding]),
        dim=0,
    )
    # Every added token gets the same starting vector.
    for token_id in new_token_ids.values():
        embedding_matrix[token_id] = mean_embedding

mean_vector = embedding_matrix[coronavirus_id].detach().cpu().numpy().copy()

plt.title("Custom Initialized Vector")
plt.hist(mean_vector, bins=50)
plt.show()

In [None]:
# Assemble the masked-language-model training pipeline: dataset, masking
# collator, training arguments and the Trainer itself.

# One training example per line of the text file, truncated to 32 tokens.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="pretraining_tweets_en_full_clean.txt",
    block_size=32,
)

# Dynamic masking: 40% of the tokens in each batch are selected for the MLM
# objective (considerably higher than the library default of 15%).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.4
)

training_args = TrainingArguments(
    output_dir="./",
    overwrite_output_dir=True,
    learning_rate=3e-05,
    num_train_epochs=3,
    # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size`
    # is its replacement and keeps the same per-device batch size of 32.
    per_device_train_batch_size=32,
    save_steps=10000,
    #save_total_limit=2,
)

# Set up trainer
trainer = Trainer(
    model=maskedlm_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Run the masked-language-model training configured in `training_args`.
trainer.train()

In [None]:
# Write the trained model to the "covid_roberta_40" directory.
trainer.save_model("covid_roberta_40")

In [None]:
# Save the tokenizer (with the added COVID tokens) into the same directory as
# the model, so both can be reloaded from one path.
tokenizer.save_pretrained("covid_roberta_40")