In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
# HuggingFace Hub login required for Llama-2 models
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
DATASET = 'data/cleaned_akkadian_en.txt'
TOTAL_PROPORTION = 0.10
TRAIN_PROPORTION = 0.90

CONTEXT_SIZE = 128

NUM_TRAIN_STEPS = 200
EVAL_STEPS = NUM_TRAIN_STEPS // 20
VAL_DATASET_SIZE = 100

LOGGING_STEPS = NUM_TRAIN_STEPS // 20

CHECKPOINT_FOLDER = 'llama2_akkadian_peft'

BASE_MODEL_ID = "meta-llama/Llama-2-7b-hf"
PEFT_MODEL_ID = "veezbo/LLama-2-7b-hf-akkadian"

In [4]:
# Loading model and tokenizer for use
from transformers import AutoTokenizer, AutoModelForCausalLM
import bitsandbytes as bnb
import torch.nn as nn

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# TODO:vkumar May need to do something like this: tokenizer.pad_token_id = tokenizer.eos_token_id
# Above is in order to actually pad the inputs during inference
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID, 
    load_in_8bit=True, ## Using 8-bit precision to load LLAMA-7b on 24GB GPU to fine-tune with PEFT
)

Using pad_token, but it is not set yet.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Post-processing on the model

for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

# Ensure that the outputs are still fp32
class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [6]:
def print_trainable_parameters(model: nn.Module):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())
    print(
        f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params}"
    )

In [7]:
from peft import LoraConfig, get_peft_model, TaskType

# TODO: Try this with a prompt-type finetuning instead of LORA directly
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,  # TODO: Do we want to instead use a Question/Answers task? Data must be really different, though.
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8388608 || all params: 6746804224 || trainable%: 0.12433454005023165


In [8]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [10]:
# Load training and evaluation datasets
import random
random.seed(42)

data = open(DATASET, 'r').read().split('\n')
random.shuffle(data)

overall_max_index = int(len(data) * TOTAL_PROPORTION)
train_max_index = int(overall_max_index * TRAIN_PROPORTION)
train_data = data[:train_max_index]
val_data = data[train_max_index:overall_max_index]

print(len(data), len(train_data), len(val_data))

9721 874 98


In [11]:
print(train_data[:5])

['I took the city Elenzaš as a royal city and a fortress for that district, then I changed its former name and called it Kār-Sennacherib. I settled therein the people of the lands that I had conquered. I placed it under the authority of a eunuch of mine, the governor of the city Ḫarḫar, and thus enlarged my land.', '1 fruit and vegetable garden in the city of Harran: 300 fruit plants therein; 150 poplars and willows: a total of 450.', 'And he said: "I will destroy Elam; its army shall be levelled to the ground. In this manner I will finish Elam."', 'For my lordly pleasure, I had a portico, a replica of a Hittite palace, which is called bīt-ḫilāni in the language of the land Amurru, constructed inside them.', 'Whoever in the future, at any time, lodges a complaint and breaks the contract whether Salmanu-imme or his sons or his  grandsons or his brothers or his nephews or']


In [12]:
train_ids = tokenizer.encode(train_data, return_tensors='pt', is_split_into_words=True)
val_ids = tokenizer.encode(val_data, return_tensors='pt', is_split_into_words=True)

print(train_ids.shape, val_ids.shape)

torch.Size([1, 60088]) torch.Size([1, 6012])


In [40]:
tokenizer.bos_token_id

1

In [44]:
tokenizer.eos_token_id

2

In [13]:
val_ids

tensor([[    1,  2180,   393,  ...,  1407,  4549, 29889]])

In [14]:
val_ids.shape

torch.Size([1, 6012])

In [15]:
val_ids.squeeze().shape

torch.Size([6012])

In [43]:
val_ids.squeeze()[1:]

tensor([ 2180,   393,   931,  ...,  1407,  4549, 29889])

In [16]:
from torch.utils.data import Dataset, DataLoader
from torch import Tensor

class AkkadianDatasetforLLM(Dataset):
    def __init__(self, input_ids: Tensor, context_size: int):
        self.input_ids = input_ids.squeeze()[1:]  # Make tensor 1D and remove the bos_token
        self.context_size = context_size

    def __len__(self):
        return len(self.input_ids) - self.context_size
    
    def __getitem__(self, idx):
        return {'input_ids': torch.cat((torch.tensor([tokenizer.bos_token_id]), self.input_ids[idx:idx + self.context_size - 1])),
                'labels': torch.cat((self.input_ids[idx:idx + self.context_size - 1], torch.tensor([tokenizer.eos_token_id])))}

In [17]:
train_dataset = AkkadianDatasetforLLM(train_ids, CONTEXT_SIZE)
val_dataset = AkkadianDatasetforLLM(val_ids, CONTEXT_SIZE)

In [18]:
len(val_dataset)

5884

In [19]:
from torch.utils.data import Subset

indices = random.sample(range(len(val_dataset)), VAL_DATASET_SIZE)

val_dataset = Subset(val_dataset, indices)

In [20]:
len(val_dataset)

100

In [25]:
from transformers import Trainer, TrainingArguments
from transformers.trainer_pt_utils import get_parameter_names
from torch import nn

training_args = TrainingArguments(
    output_dir=CHECKPOINT_FOLDER, 
    max_steps=NUM_TRAIN_STEPS,
    fp16=True,
    warmup_steps=100,
    per_device_train_batch_size=16,  # TODO: Try a larger size for this batch
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    
    # LOGGING PARAMS
    logging_strategy="steps",
    logging_steps=LOGGING_STEPS,
    
    # EVAL Params (TODO: These do not work with error ValueError: Predictions and/or references don't match the expected format. )
    # per_device_eval_batch_size=4,
    # eval_accumulation_steps=5,
    # evaluation_strategy="steps",
    # eval_steps=EVAL_STEPS,
    
    # UNUSED PARAMS
    # gradient_checkpointing=True,
    # optim="adamw_bnb_8bit"
)

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# TODO: Get validation to work without the ValueError above to ensure we're not overfitting

In [27]:
trainer.train()

Step,Training Loss
10,5.0267
20,4.9908
30,4.9797
40,4.9423
50,4.912
60,4.8916
70,4.8437
80,4.7517
90,4.7007
100,4.6466


TrainOutput(global_step=200, training_loss=4.633113670349121, metrics={'train_runtime': 3172.6487, 'train_samples_per_second': 4.034, 'train_steps_per_second': 0.063, 'total_flos': 6.50352940548096e+16, 'train_loss': 4.633113670349121, 'epoch': 0.21})

In [28]:
model.push_to_hub(PEFT_MODEL_ID, use_auth_token=True)

adapter_model.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/veezbo/LLama-2-7b-hf-akkadian/commit/950e683f2a46905bd366fe3e327b7701169eda5a', commit_message='Upload model', commit_description='', oid='950e683f2a46905bd366fe3e327b7701169eda5a', pr_url=None, pr_revision=None, pr_num=None)

In [29]:
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
import bitsandbytes as bnb
import torch.nn as nn

config = PeftConfig.from_pretrained(PEFT_MODEL_ID)
base_model_name = config.base_model_name_or_path
if not base_model_name:
    base_model_name = BASE_MODEL_ID
    
model = AutoModelForCausalLM.from_pretrained(base_model_name, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the Lora model
model = PeftModel.from_pretrained(model, PEFT_MODEL_ID)

Downloading (…)/adapter_config.json:   0%|          | 0.00/447 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Downloading adapter_model.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

In [30]:
print(config)

LoraConfig(peft_type='LORA', auto_mapping=None, base_model_name_or_path='meta-llama/Llama-2-7b-hf', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=16, target_modules=['q_proj', 'v_proj'], lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)


In [31]:
@torch.no_grad()
def generate_from_model(text: str) -> str:
    model.eval()
    with torch.cuda.amp.autocast():
        # tokenizer_output = tokenizer(text, padding='max_length', truncation=True, max_length=CONTEXT_SIZE, return_tensors='pt')
        # input_ids = tokenizer_output['input_ids'].cuda()
        # pad_mask = tokenizer_output['attention_mask'].cuda()
        input_ids = tokenizer.encode(text, return_tensors='pt').cuda()
        
    # TODO:vkumar Make sure this has the BOS, EOS, and padding tokens as needed.
    # Also ensure that the model has the appropriate attention mask (with the padding)
    gen_output = model.generate(input_ids=input_ids, max_new_tokens=150)
    
    str_output = tokenizer.decode(gen_output[0])
    
    eos_token_index = str_output.find(tokenizer.eos_token)
    if eos_token_index != -1:
        str_output = str_output[:eos_token_index]
    
    return str_output

In [33]:
tokenizer.pad_token

'</s>'

In [None]:
# ---------------
# Below is from 200 training steps with batch size 4, followed by 200 training steps with batch size 16

In [34]:
print(generate_from_model("it rains because"))

<s> it rains because of king lord the of the Aad, the ofard, the ofil the of-, of- and of- of cityamuu the of ofar of cityama the of theu; the of the 10 ofings the of ofḫḫ, ofšḫ,šḫ,,šu,šl,šur  the of theing ofš,š,šr  the of theing ofšš,šlš,šr  the the ofššlš,šr  the the ofšššlšš The of


In [35]:
print(generate_from_model("one can live a virtuous life by"))

<s> one can live a virtuous life by, the of, and of Šš, king the, of cityḫa, of cityḫ, of cityī the, of city-, of city- of city, the of city the of city the the of-ions the the of-u-- of- of- of--- of-- of- of- of-- of- of- of of the , of the, the of the of, the of of the the of  of-- of- of--- of-- of- of- of of  the of  of-- of--- of- of of of -- of - of - of - of - of - of 


In [36]:
print(generate_from_model("a society can thrive by"))

<s> a society can thrive by. and the of Ašur the, ofard andull of gods the, of who not a of the of gods the is to of. the of Ašur Nû----, of who not a of the of gods is to of. the ofššu-- of Šš, of whom not a ofšu of gods is to of. theū ofšu the ofš of, of who not ašu ofš of is to of. the ofš--ūš, of whom notš ofš is to of.ū-ūš the


In [37]:
print(generate_from_model("May any future prince, during whose reign this work falls into disrepair (and) sustains damage"))

<s> May any future prince, during whose reign this work falls into disrepair (and) sustains damage the the, should him! the of city the ofḫ-ḫš, is in. the ofšu. the of, king the the the, is in hands. the of---- of cityḪšš is in hands The of-- of, king the of Aad is in hands The of- of kingur of landam is in. the of- of king the '-- ofḫš is in hands The of- of king the -- ofš


In [None]:
# ---------------
# Below is from only 200 training steps with batch size 1

In [22]:
print(generate_from_model("it rains because"))

<s> it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because it is raining.
it rains because


In [23]:
print(generate_from_model("one can live a virtuous life by"))

<s> one can live a virtuous life by following the precepts of Buddhism.
This is a simple and clear statement of a Buddhist doctrine. But what does it mean? What are the precepts of Buddhism?
The Buddha gave us eight precepts that we can follow.
The first precept is to abstain from taking life.
The second precept is to abstain from taking what is not given.
The third precept is to abstain from sexual misconduct.
The fourth precept is to abstain from false speech.
The fifth precept is to abstain from intoxicants.
The sixth precept is to abstain from eating at the wrong time.



In [24]:
print(generate_from_model("a society can thrive by"))

<s> a society can thrive by being open to all kinds of ideas, regardless of their source.
The following are some of the ideas that have influenced my life.
The idea that the world is a beautiful place, that it is worth protecting, and that we have a responsibility to do so.
I am a strong believer in the power of nature, and I believe that we should all be doing our best to protect it.
The idea that we can change the world for the better, and that we should all be working together to do so.
I believe that we should all be working together to create a better world, and that we should all be doing our best to protect the environment.
I believe that we should all be doing our best


In [26]:
print(generate_from_model("An Akkadian would say that a society can thrive by"))

<s> An Akkadian would say that a society can thrive by its ability to harness the power of the sun and the water. Akkadians would say that the power of the sun and the water is the source of all life.
Akkadians would say that the sun and the water are the source of all life.
Akkadians would say that the sun and the water are the source of all life. Akkadians would say that the sun and the water are the source of all life. Akkadians would say that the sun and the water are the source of all life.
Akkadians would say that the sun and the water are the source of all life. Akkadians would say that the sun and the water are the source of


In [27]:
print(generate_from_model("An Akkadian would say that one can live a virtuous life by"))

<s> An Akkadian would say that one can live a virtuous life by following the principles of the "Law of the Land."
The Law of the Land was a concept that was developed by the Sumerian King of Ur-Nammu (2047–2030 BCE). It was based on the belief that all men were created equal and should be treated as such. This meant that everyone was to be given the same rights and privileges, regardless of their social status or wealth.
The Law of the Land was one of the first laws to be written down, and it was used as a guide for many other laws that were created throughout history. It is still used today as a basis for many modern laws, such as those in the United States and Canada.
The Law


In [29]:
print(generate_from_model("I named it Kār-Aššur, set up the weapon of the god Aššur, my lord, therein, and settled the people of foreign lands conquered by me therein. I imposed upon them"))

<s> I named it Kār-Aššur, set up the weapon of the god Aššur, my lord, therein, and settled the people of foreign lands conquered by me therein. I imposed upon them the yoke of the god Aššur, my lord, and I took from them the tribute of the land and the grain of the soil. I built a temple to the god Aššur, my lord, and I dedicated it to him. I built a temple to the god Aššur, my lord, and I dedicated it to him. I built a temple to the god Aššur, my lord, and I dedicated it to him. I built a temple to the god Aššur, my lord, and I dedicated it to him. I built a temple to the god Aššur, my lord, and I dedicated it to him. I built a temple to the god


In [21]:
print(generate_from_model("how does one live a virtuous life?"))

In [22]:
print(generate_from_model("where can I live?"))