In [1]:
import json
import os
from itertools import chain
from pathlib import Path

import huggingface_hub as hf_hub
import numpy as np
import pandas as pd
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModelWithLMHead

In [None]:
# %%
# Credentials must never be committed to the notebook. Any key or token that was
# ever hardcoded in this cell has to be treated as leaked and rotated.
#
# * Weights & Biases: export WANDB_API_KEY in the shell (or run `wandb login`
#   once) before starting the kernel; wandb reads it on its own.
# * Hugging Face: export HF_TOKEN; when it is absent, `login` receives None and
#   falls back to its interactive prompt.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
hf_hub.login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

WANDB_PROJECT = "emnlp_pragtag_2023"
# The Trainer's wandb integration takes the project name from this variable;
# without it the constant above has no effect on where runs are logged.
os.environ["WANDB_PROJECT"] = WANDB_PROJECT

In [2]:
# Walk the F1000-22 data tree and keep the path of every reviews.json whose
# parsed content is non-empty.
reviews_root = Path.cwd().joinpath("auxilliary_data", "F1000-22", "data")
non_empty_review_list = []
for reviews_path in reviews_root.glob("**/reviews.json"):
    with open(reviews_path, "r") as handle:
        parsed_reviews = json.load(handle)
    if len(parsed_reviews) > 0:
        non_empty_review_list.append(reviews_path)

In [3]:
# Flatten every review found in the non-empty files into two parallel lists:
# the review id ("rid") and the main report text (report -> main).
review_id_list, review_text_list = [], []
for reviews_path in non_empty_review_list:
    with open(reviews_path, "r") as handle:
        file_reviews = json.load(handle)
    for entry in file_reviews:
        review_id_list.append(entry["rid"])
        review_text_list.append(entry["report"]["main"])

In [4]:
# One row per review: its id and its main report text.
abstract_data = pd.DataFrame(
    {
        "review_id": review_id_list,
        "review_text": review_text_list,
    }
)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# First split: half of the reviews become the training set. The held-out half is
# then split evenly again into validation and test (50% / 25% / 25% overall).
train_abstract_data, holdout_abstract_data = train_test_split(
    abstract_data, test_size=0.5, random_state=42
)
valid_abstract_data, test_abstract_data = train_test_split(
    holdout_abstract_data, test_size=0.5, random_state=42
)

In [7]:
import datasets

In [8]:
# Convert each pandas split into a Hugging Face Dataset, in train/valid/test order.
train_dataset, valid_dataset, test_dataset = (
    datasets.Dataset.from_pandas(split_frame)
    for split_frame in (train_abstract_data, valid_abstract_data, test_abstract_data)
)

In [9]:
# Bundle the three splits under the keys used by the rest of the notebook.
abstract_hf_dataset = datasets.DatasetDict(
    {
        "train": train_dataset,
        "valid": valid_dataset,
        "test": test_dataset,
    }
)

In [10]:
# Display the assembled DatasetDict to check split names, columns and row counts.
abstract_hf_dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'review_text', '__index_level_0__'],
        num_rows: 6751
    })
    valid: Dataset({
        features: ['review_id', 'review_text', '__index_level_0__'],
        num_rows: 3376
    })
    test: Dataset({
        features: ['review_id', 'review_text', '__index_level_0__'],
        num_rows: 3376
    })
})

In [11]:
# Checkpoint used for both the tokenizer and the masked-LM model below.
model_name = "microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    do_lower_case=True,
    force_download=True,
)

In [12]:
# Sanity check: tokenise the first training review and inspect the encoded fields.
tokenizer(abstract_hf_dataset["train"][0]["review_text"])

{'input_ids': [1, 8346, 6533, 266, 4, 38, 33, 117, 617, 1450, 4, 1437, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
def preprocess_text(example):
    """Tokenise the "review_text" field of `example` with the notebook's tokenizer.

    No truncation or padding is requested here; the tokenizer output is
    returned unchanged.
    """
    review_text = example["review_text"]
    return tokenizer(review_text)

In [14]:
# Tokenise every split in batches across 10 worker processes and drop all of the
# original columns so that only the tokenizer outputs remain.
original_columns = abstract_hf_dataset["train"].column_names
abstract_hf_dataset_tokenised = abstract_hf_dataset.map(
    preprocess_text,
    batched=True,
    remove_columns=original_columns,
    num_proc=10,
)

Map (num_proc=10):   0%|          | 0/6751 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (901 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (932 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1162 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1855 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Map (num_proc=10):   0%|          | 0/3376 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1309 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (781 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1996 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (812 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Map (num_proc=10):   0%|          | 0/3376 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1291 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (620 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (984 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1129 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

In [15]:
# Display the tokenised DatasetDict to confirm only tokenizer outputs remain as columns.
abstract_hf_dataset_tokenised

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6751
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3376
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3376
    })
})

In [16]:
block_size = 512


def group_texts(examples):
    """Concatenate a batch of tokenised sequences and cut it into fixed-size chunks.

    Args:
        examples: Mapping from feature name to a list of per-example lists,
            as passed by a batched ``Dataset.map`` call.

    Returns:
        A dict with the same keys. Each value is a list of chunks of
        ``block_size`` items taken from the concatenation of that feature's
        sequences. When the concatenation is at least ``block_size`` long, the
        trailing remainder is dropped; when it is shorter, it is kept as a
        single short chunk. An empty mapping yields an empty dict.
    """
    # chain.from_iterable flattens in linear time; sum(list_of_lists, []) copies
    # the growing accumulator on every addition and is quadratic.
    concatenated_examples = {
        name: list(chain.from_iterable(examples[name])) for name in examples.keys()
    }
    if not concatenated_examples:
        return {}

    # All features are assumed to have the same total length, so the first one
    # is used as the reference.
    total_length = len(next(iter(concatenated_examples.values())))
    # We drop the small remainder; padding could be added instead if the model
    # supported it.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    return {
        name: [tokens[start : start + block_size] for start in range(0, total_length, block_size)]
        for name, tokens in concatenated_examples.items()
    }

In [17]:
# Re-pack the tokenised reviews of every split into block_size-long chunks.
abstract_hf_dataset_tokenised_chunked = abstract_hf_dataset_tokenised.map(
    group_texts,
    batched=True,
    num_proc=1,
)

Map:   0%|          | 0/6751 [00:00<?, ? examples/s]

Map:   0%|          | 0/3376 [00:00<?, ? examples/s]

Map:   0%|          | 0/3376 [00:00<?, ? examples/s]

In [18]:
from transformers import DataCollatorForLanguageModeling

# Only fall back to the EOS token when the tokenizer has no pad token of its own.
# Assigning it unconditionally (a pattern borrowed from causal-LM recipes)
# overwrites the pad token that the checkpoint's tokenizer already defines.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Collator that applies masked-LM masking with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [19]:
# Load the checkpoint named by model_name with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['cls.predictions.transform.LayerNorm.

In [20]:
from transformers import TrainingArguments, Trainer

In [21]:
# Tunable training hyper-parameters.
batch_size = 8
gradient_accumulation_steps = 2
# Previously this was set to 2 but ignored in favour of a hardcoded 3 below;
# it now holds the value that was actually used and is passed through.
num_epochs = 3

training_args = TrainingArguments(
    output_dir="test_model_to_be_deleted",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=2 * batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=2e-5,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    num_train_epochs=num_epochs,
    warmup_ratio=0.1,
    save_total_limit=3,
    push_to_hub=True,
    save_strategy="epoch",
    # Derive the run name from the checkpoint id string. `model` is the loaded
    # model object and has no `.split`, so `model_name` must be used here.
    run_name=model_name.split("/")[-1],
    # Keep the checkpoint with the lowest validation loss and reload it at the end.
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    greater_is_better=False,
    report_to="wandb",
    hub_strategy="end",
    hub_private_repo=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=abstract_hf_dataset_tokenised_chunked["train"],
    eval_dataset=abstract_hf_dataset_tokenised_chunked["valid"],
    data_collator=data_collator,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkunal-suri-ml-experiments[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,4.4738,3.216523
2,3.0973,2.709901
3,2.7964,2.593145


TrainOutput(global_step=1578, training_loss=3.4195227193892834, metrics={'train_runtime': 1532.1141, 'train_samples_per_second': 10.294, 'train_steps_per_second': 1.03, 'total_flos': 4837785905046528.0, 'train_loss': 3.4195227193892834, 'epoch': 3.0})