In [1]:
import torch
torch.cuda.empty_cache()

In [None]:
# HuggingFace Hub login required for Llama-2 models
from huggingface_hub import notebook_login
notebook_login()

In [2]:
DATASET = 'data/cleaned_akkadian_en.txt'
TOTAL_PROPORTION = 0.01
TRAIN_PROPORTION = 0.90
CONTEXT_SIZE = 128

NUM_TRAIN_STEPS = 1000
EVAL_STEPS = NUM_TRAIN_STEPS // 5
CHECKPOINT_FOLDER = 'llama2_akkadian'

In [3]:
# Loading model and tokenizer for use
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
# MODEL_NAME = "facebook/opt-iml-max-1.3b"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, load_in_8bit=True)  ## Using 8-bit precision to load and all-parameter fine-tune LLAMA-7b on 80GB GPU)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [4]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [5]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [6]:
# Load training and evaluation datasets
data = open(DATASET, 'r').read()

overall_max_index = int(len(data) * TOTAL_PROPORTION)
train_max_index = int(overall_max_index * TRAIN_PROPORTION)
train_data = data[:train_max_index]
val_data = data[train_max_index:overall_max_index]

print(len(data), len(train_data), len(val_data))

2339837 21058 2340


In [7]:
print(train_data[:500])

I adorned them statues of the gods and they the gods went back to their land. I rebuilt those cities. I built a city on top of a tell (a heaped-up ruin mound) called Ḫumut. I built and completed it from its foundations to its parapets. Inside it, I founded a palace for my royal residence. I named it Kār-Aššur, set up the weapon of the god Aššur, my lord, therein, and settled the people of foreign lands conquered by me therein. I imposed upon them tax and tribute, and considered them as inhabitan


In [8]:
train_ids = tokenizer.encode(train_data, return_tensors='pt')
val_ids = tokenizer.encode(val_data, return_tensors='pt')

print(train_ids.shape, val_ids.shape)

torch.Size([1, 6800]) torch.Size([1, 694])


In [9]:
from torch.utils.data import Dataset, DataLoader

class AkkadianDatasetforLLM(Dataset):
    def __init__(self, input_ids, context_size: int):
        self.input_ids = input_ids.squeeze()
        self.context_size = context_size

    def __len__(self):
        return len(self.input_ids) - self.context_size

    def __getitem__(self, idx):
        return {'input_ids': self.input_ids[idx:idx + self.context_size],
                'labels': self.input_ids[idx + 1:idx + self.context_size + 1]}

In [10]:
train_dataset = AkkadianDatasetforLLM(train_ids, CONTEXT_SIZE)
val_dataset = AkkadianDatasetforLLM(val_ids, CONTEXT_SIZE)

In [11]:
from transformers import Trainer, TrainingArguments
from transformers.trainer_pt_utils import get_parameter_names
from torch import nn

training_args = TrainingArguments(
    output_dir=CHECKPOINT_FOLDER, 
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    max_steps=NUM_TRAIN_STEPS,
    fp16=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adamw_bnb_8bit"
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss


In [15]:
@torch.no_grad()
def generate_from_model(text: str, max_length: int = 100) -> str:
    model.eval()
    text_ids = tokenizer.encode(text, return_tensors='pt').cuda()
    gen_output = model.generate(text_ids, max_length=max_length)
    return tokenizer.decode(gen_output[0], skip_special_tokens=True)

In [16]:
print(generate_from_model("why does it rain?"))

why does it rain? gods live whose are the of Ašuri Btḫi Btḫurḫi Btḫi Btḫiḫir Uki Btḫiḫir Btḫi Btḫi Btḫirnu Btḫi BtḫiḪir Btḫi Btḫi Btḫi Btḫi Btḫi Btḫi Btḫi BtḪi B


In [17]:
print(generate_from_model("how does one live a virtuous life?"))

how does one live a virtuous life?
 those who the Ašuriṣ Ušuuḫi Btḫi Btḫā,ḫḫḫi Btḫi BtḪi Btḫi BtḪi BtSi BtUak Btḫi Btḫi Btḫi Btḫi BtḪi BtḪi BtḪi BtBtḪi BtBt


In [18]:
print(generate_from_model("where can I live?"))

where can I live? Inexḫil Inexḫil the Uubaḫḫi Btḫilni BtḫḫirnuḪi Btḫḫḫḫi Btḫi BtSi BtUḫi Btḫi Btḫir BtSiī BtBtḪḫi BtBtḪi BtBtḪi BtBtBtBtḪi BtBtB
