In [None]:
!pip install -qq adapters datasets python_dotenv huggingface_hub accelerate mwparserfromhell

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only helper).
drive.mount('/content/drive')

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
from huggingface_hub import login
# Load variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv()) # read local .env file
# The token must be set in the environment (or in the .env file loaded above);
# a missing variable raises KeyError on this line.
hugging_face_access_token = os.environ['HUGGINGFACEHUB_API_TOKEN']
# Authenticate this session with the Hugging Face Hub.
login(hugging_face_access_token)

In [None]:
# Hub model id used for both the tokenizer and the base model below.
model_checkpoint = "xlm-roberta-base"

In [None]:
from datasets import load_dataset

# Load the Amharic ("am") Wikipedia dump dated 2024-03-20.
# trust_remote_code=True allows `datasets` to execute the dataset's loading script.
dataset = load_dataset("wikipedia", language="am", date="20240320", trust_remote_code=True)
# Display the dataset summary (splits, columns, row counts).
dataset

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the base checkpoint.
# NOTE(review): `padding` is normally passed when *calling* the tokenizer, not
# to from_pretrained — confirm whether it has any effect here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding=True)

In [None]:
def tokenize_function(examples):
    """Run the notebook-level tokenizer over the "text" column of a batch.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded


# Tokenize the whole corpus. batched=True hands tokenize_function whole batches
# instead of single rows, and num_proc=4 spreads the work over four processes.
# The original columns are removed so only the tokenizer outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, num_proc=4, remove_columns=['id', 'url', 'title', 'text']
)
# Display the tokenized dataset summary.
tokenized_datasets

In [None]:
# Show the maximum sequence length reported by the tokenizer.
tokenizer.model_max_length

In [None]:
# Number of tokens per training chunk; read by group_texts below.
chunk_size = 128

In [None]:
from itertools import chain


def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized texts and cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (the batch passed in by `Dataset.map(..., batched=True)`). Must
            contain an "input_ids" column.
        block_size: Length of every output chunk. When omitted, the
            notebook-level `chunk_size` is used.

    Returns:
        dict with the same keys as `examples`, each holding a list of chunks
        of exactly `block_size` items, plus a "labels" entry that is a copy
        of the "input_ids" chunk list. Trailing items that do not fill a
        complete chunk are dropped.
    """
    if block_size is None:
        block_size = chunk_size
    # Flatten each column in linear time. sum(list_of_lists, []) re-copies the
    # growing accumulator on every step, which is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # All columns have the same flattened length, so measure the first one.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the remainder so that every chunk is exactly block_size long.
    total_length = (total_length // block_size) * block_size
    # Slice every column into consecutive, non-overlapping chunks.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start out as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized articles into fixed-length chunks, batch by batch.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
# Display the chunked dataset summary.
lm_datasets

In [None]:
# Absolute number of chunks to keep for training and for evaluation.
train_size = 30_000
test_size = 3000

# Draw a train/test split of those sizes from the "train" split; the fixed
# seed makes the sampling repeatable.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
# Display the resulting splits.
downsampled_dataset

In [None]:
from transformers import DataCollatorForLanguageModeling

# XLM-R's tokenizer already defines its own <pad> token, so the pad token is
# left untouched. Aliasing it to EOS would make the collator below replace
# every </s> label with -100 (it masks labels equal to the pad id when
# mlm=False), removing all end-of-sequence targets from the loss.
# mlm=False: labels are derived from input_ids (causal LM objective) instead
# of random masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
from adapters import AutoAdapterModel
from transformers import AutoConfig
from transformers import XLMRobertaConfig, XLMRobertaForCausalLM

# Load the checkpoint's configuration with is_decoder=True.
# NOTE(review): presumably this switches the encoder to causal (left-to-right)
# attention so it can be trained for text generation — confirm.
config = XLMRobertaConfig.from_pretrained(model_checkpoint, is_decoder=True)

# Load the pretrained weights into an adapters-enabled model using that config.
model = AutoAdapterModel.from_pretrained(
   model_checkpoint, config=config
)
# Print the module tree of the loaded model.
print(model)

In [None]:
# Show the model configuration before any adapter is added.
model.config

In [None]:
# Add a new adapter named "amharic-wiki" using the "seq_bn" configuration
model.add_adapter("amharic-wiki", config="seq_bn")
# Alternatively, a different adapter configuration can be used, e.g.:
# model.add_adapter("rotten_tomatoes", config="lora")

# Add a causal language-modeling head under the same name as the adapter
model.add_causal_lm_head("amharic-wiki")

# Select the adapter for training
# NOTE(review): per the adapters docs this should also freeze the base model
# weights so only the adapter is updated — confirm.
model.train_adapter("amharic-wiki")

In [None]:
# Print the module tree again, now that the adapter and head have been added.
print(model)

In [None]:
# Show the model configuration after the adapter and head have been added.
model.config

In [None]:
import numpy as np
from transformers import TrainingArguments, EvalPrediction
from adapters import AdapterTrainer

# Hyperparameters for the training run. Evaluation and checkpointing both
# happen once per epoch; load_best_model_at_end needs the two strategies to
# match so the best checkpoint can be restored after training.
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=16,
    #gradient_accumulation_steps=8,
    per_device_eval_batch_size=16,
    # Log training metrics every 200 optimizer steps.
    logging_steps=200,
    # Checkpoints are written here; an existing directory is overwritten.
    output_dir="./training_output",
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Mixed-precision training. NOTE(review): expects a CUDA GPU — confirm runtime.
    fp16=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: "EvalPrediction"):
    """Compute the fraction of labelled positions whose top-scoring class matches.

    Works for 2-D logits (batch, classes) as well as higher-rank logits such
    as (batch, sequence, vocab): the argmax is always taken over the last axis.

    Args:
        p: Object exposing `predictions` (logits, classes on the last axis)
            and `label_ids` (integer labels shaped like the logits without
            their last axis).

    Returns:
        dict with a single key "acc". Positions labelled -100 (the ignore
        index used for masked labels) are excluded; if no position is
        labelled, "acc" is NaN.

    NOTE(review): positions are compared as given. For a causal LM the logits
    at position t are usually scored against the token at t+1 — shift upstream
    if next-token accuracy is intended (confirm against the head's loss).
    NOTE(review): passing this to a Trainer makes it gather full logits for
    the whole eval set, which can be very large for a big vocabulary.
    """
    # argmax over the class axis; axis=-1 equals axis=1 for 2-D logits.
    preds = np.argmax(p.predictions, axis=-1)
    labels = np.asarray(p.label_ids)
    # Ignore positions that carry the -100 "no label" marker.
    valid = labels != -100
    if not valid.any():
        return {"acc": float("nan")}
    return {"acc": (preds[valid] == labels[valid]).mean()}

# Build the trainer from the model, the arguments above, the sampled
# train/test splits and the language-modeling collator. No compute_metrics
# callback is passed, so compute_accuracy is not used during evaluation.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator

)

In [None]:
import math
# Baseline: evaluate on the test split before any training has happened.
eval_results = trainer.evaluate()
# Perplexity is the exponential of the evaluation loss.
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Show the full metrics dict returned by the baseline evaluation.
eval_results

In [None]:
# Run training with the arguments configured on the trainer.
trainer.train()

In [None]:
# Evaluate again after training so the result can be compared to the baseline.
eval_results = trainer.evaluate()
# Perplexity is the exponential of the evaluation loss.
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Write the "amharic-wiki" adapter to ./final_adapter.
model.save_adapter("./final_adapter", "amharic-wiki")

# List the saved files with their sizes.
!ls -lh final_adapter

In [None]:
# Upload the trained "amharic-wiki" adapter to the Hugging Face Hub.
# The access token is read from the environment (the same variable the login
# step uses) rather than being written into the notebook, where it would fail
# as a placeholder or leak as a real secret.
# NOTE(review): the repository name spells "xml-roberta" while the base model
# is "xlm-roberta" — left unchanged; confirm which name is intended.
model.push_adapter_to_hub(
    "xml-roberta-base-textgen-adapter-amharic",
    "amharic-wiki",
    token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    adapterhub_tag="am/wikipedia-amharic-20240320",
    datasets_tag="wikipedia",
)