## 🚀🚀🚀 Llama 3 8b Instruct | Train Baseline

This exact notebook was used to train the Llama 3 model on an A100 40GB GPU, which supports BF16; BF16 was therefore used to speed up training.

Snippets of this code are adapted from Chris Deotte's code for Mistral 7B.

In [None]:
!pip install -U accelerate peft -qqq

In [None]:
import os
import torch
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from types import SimpleNamespace

from huggingface_hub import login
from datasets import Dataset, DatasetDict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForCausalLM, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig

In [None]:
# Notebook-wide setup: enable tqdm's pandas integration, silence warnings and
# turn off the tokenizers library's own parallelism via its environment flag.
tqdm.pandas()
warnings.simplefilter('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# `UserSecretsClient` was referenced without ever being imported, so this cell
# raised NameError. It lives in Kaggle's `kaggle_secrets` helper module; when
# that module is not available (i.e. outside Kaggle) fall back to the
# HF_TOKEN environment variable instead.
try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    hf_token = os.environ["HF_TOKEN"]
else:
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")

# Write permission is requested because the trainer pushes checkpoints to the Hub.
login(token=hf_token, write_permission=True)

In [None]:
# Every tunable of the notebook lives on this one namespace object.
config = SimpleNamespace(
    # --- model ---
    exp=1,                      # experiment number, only used in the output name
    max_length=1024,            # default token budget for training samples
    model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct",

    # --- data & validation ---
    n_folds=4,
    valid_fold=1,               # which fold is held out: one of [0, 1, 2, 3]
    upsample_train_using_pc=True,
    classes_to_upsample=[5, 1, 6],   # scores pulled in from the Persuade corpus

    # --- peft LoRA ---
    lora_r=32,
    lora_alpha=16,
    lora_dropout=0.1,

    # --- trainer params ---
    bf16=True,
    warmup_ratio=0.0,
    weight_decay=0.01,
    learning_rate=5e-5,
    num_train_epochs=1,
    per_device_batch_size=2,
    lr_scheduler_type="linear",
    gradient_checkpointing=False,
    gradient_accumulation_steps=4,

    # --- misc ---
    seed=1,
    corpus_dir="/kaggle/input/persaude-corpus-2",
    data_dir="/kaggle/input/learning-agency-lab-automated-essay-scoring-2",
)

# Output directory / Hub repo name, derived from the bare model name plus the
# settings that distinguish one run from another.
config.output_model_id = "{base}-max-len-{max_len}-fold-{fold}-exp-{exp}-ckpt".format(
    base=config.model_name_or_path.rsplit("/", 1)[-1],
    max_len=config.max_length,
    fold=config.valid_fold,
    exp=config.exp,
)
config.output_model_id

In [None]:
# Load the base causal LM in bfloat16 and let `device_map="auto"` place it.
model_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(config.model_name_or_path, **model_load_kwargs)

# Only relevant when gradient checkpointing is switched on in the config.
if config.gradient_checkpointing:
    model.enable_input_require_grads()

# Right-padded tokenizer; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    config.model_name_or_path,
    padding_side='right',
)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Competition training set.
train = pd.read_csv(f'{config.data_dir}/data/train.csv')

# Persuade corpus: keep only the id, essay text and holistic score, then
# relabel those three columns with the training set's column names so both
# frames can be concatenated later.
# NOTE(review): assumes train.csv has exactly three columns in the matching
# order (id, text, score) - confirm against the data.
persuade_columns = ['essay_id_comp', 'full_text', 'holistic_essay_score']
pc = pd.read_csv(f'{config.corpus_dir}/data/persuade_2.0_human_scores_demo_id_github.csv')
pc = pd.DataFrame(pc[persuade_columns].to_numpy(), columns=train.columns)

In [None]:
# Hold out one score-stratified fold as `valid`; the remaining folds stay in `train`.
#
# An out-of-range `valid_fold` used to fall through the loop silently, leaving
# `train` unsplit and `valid` undefined (or stale from a previous run of the
# cell), so reject it up front.
if not 0 <= config.valid_fold < config.n_folds:
    raise ValueError(
        f"config.valid_fold must be in [0, {config.n_folds - 1}], got {config.valid_fold}"
    )

skf = StratifiedKFold(n_splits=config.n_folds, shuffle=True, random_state=config.seed)
train_index, test_index = list(skf.split(train, train.score))[config.valid_fold]

# Carve out the validation rows first: both index arrays refer to the
# original frame, and `train` is rebound on the next line.
valid = train.iloc[test_index]
train = train.iloc[train_index]

len(train), len(valid)

In [None]:
# Optionally enlarge the training set with Persuade essays of selected scores.
if config.upsample_train_using_pc:
    # Restrict the corpus to the score classes we want more of.
    wanted_scores = pc.score.isin(config.classes_to_upsample)
    pc = pc[wanted_scores]

    # Training rows come first, so on a duplicated essay text the training
    # row is the one that survives.
    combined = pd.concat([train, pc], axis=0)
    combined = combined.drop_duplicates(subset=['full_text'])

    # Drop anything whose text also appears in the validation fold.
    leaks_into_valid = combined['full_text'].isin(valid['full_text'])
    train = combined[~leaks_into_valid]

len(train), len(valid)

In [None]:
# Class balance of the final training set.
train['score'].value_counts()

In [None]:
def preprocess(sample, text=False, infer_mode=False, max_seq=config.max_length, return_tensors=None):
    """Render one essay as a two-turn chat (user prompt, assistant score) and tokenize it.

    Args:
        sample: Mapping-like row providing "full_text" and, unless
            ``infer_mode`` is set, "score".
        text: When True, return the rendered chat string instead of tokens.
        infer_mode: When True, leave the score blank and leave the assistant
            turn open so the model can complete it.
        max_seq: Maximum number of tokens in the tokenized sample.
        return_tensors: Forwarded to the tokenizer (e.g. "pt"); ``None``
            yields plain Python lists.

    Returns:
        The rendered string when ``text`` is True; otherwise the tokenizer
        output with "labels" set to a copy of "input_ids" (no prompt masking
        is applied here) and, outside ``infer_mode``, a "length" entry.
    """
    eot = "<|eot_id|>"
    # Tokens shaved off on top of the measured overflow, because decoding and
    # re-encoding the shortened essay may not reproduce the exact token count.
    retokenize_slack = 8

    sys_prompt = "Please read the following essay and assign a score of 1,2,3,4,5,6 where 6 is the best. Output only a single number with no explanation.\n\n"
    answer = "" if infer_mode else str(sample["score"])

    def render(essay):
        messages = [
            {"role": "user", "content": sys_prompt + essay},
            {"role": "assistant", "content": "\n\nThe score is: " + answer}
        ]
        rendered = tokenizer.apply_chat_template(messages, tokenize=False)
        # Open only the final (assistant) turn for generation. A blanket
        # str.replace would also delete the end-of-turn marker after the user
        # message, so inference prompts would no longer match training ones.
        if infer_mode and rendered.endswith(eot):
            rendered = rendered[:-len(eot)]
        return rendered

    essay = sample["full_text"]
    formatted_sample = render(essay)

    # Tail truncation would cut off the "The score is: ..." scaffold (and the
    # label during training) for long essays, so shorten the essay itself and
    # re-render when the sample does not fit.
    n_tokens = len(tokenizer(formatted_sample, add_special_tokens=False)["input_ids"])
    if n_tokens > max_seq:
        essay_ids = tokenizer(essay, add_special_tokens=False)["input_ids"]
        keep = max(len(essay_ids) - (n_tokens - max_seq) - retokenize_slack, 0)
        formatted_sample = render(tokenizer.decode(essay_ids[:keep]))

    if text:
        return formatted_sample

    # `truncation=True` stays as a hard backstop for the length limit.
    tokenized_sample = tokenizer(formatted_sample, padding=True, return_tensors=return_tensors,
                                 truncation=True, add_special_tokens=False, max_length=max_seq)
    if not infer_mode:
        tokenized_sample["length"] = len(tokenized_sample["input_ids"])

    if return_tensors == "pt":
        tokenized_sample["labels"] = tokenized_sample["input_ids"].clone()
    else:
        tokenized_sample["labels"] = tokenized_sample["input_ids"].copy()

    return tokenized_sample

In [None]:
# Eyeball one fully rendered training prompt (chat markup plus score).
example_text = preprocess(train.iloc[1], text=True, infer_mode=False)
print(example_text)

In [None]:
# Wrap both splits as datasets, each with a fresh 0..n-1 index.
splits = {
    name: Dataset.from_pandas(frame.reset_index(drop=True))
    for name, frame in (('train', train), ('valid', valid))
}
ds = DatasetDict(splits)

# Only the training split is tokenized up front; the validation rows are
# preprocessed one at a time in the inference loop.
ds['train'] = ds['train'].map(preprocess, num_proc=4)
ds

In [None]:
# Token lengths of the training samples, plotted from shortest to longest.
train_lengths = pd.Series(ds['train']['length'])
_ = train_lengths.sort_values().reset_index(drop=True).plot.line()

In [None]:
# Trainer settings; the tunable values come from `config`.
training_args = TrainingArguments(
    output_dir=config.output_model_id,

    # Logging / checkpointing.
    # No in-training evaluation is wanted. The strategy argument is left at
    # its default ("no") instead of being passed as `evaluation_strategy`,
    # which is a deprecated alias of `eval_strategy`; omitting it keeps this
    # cell working on both older and newer transformers releases.
    logging_steps=125,
    save_strategy='epoch',
    logging_first_step=True,

    # Optimisation.
    bf16=config.bf16,
    warmup_ratio=config.warmup_ratio,
    weight_decay=config.weight_decay,
    learning_rate=config.learning_rate,
    num_train_epochs=config.num_train_epochs,
    lr_scheduler_type=config.lr_scheduler_type,
    gradient_checkpointing=config.gradient_checkpointing,
    per_device_train_batch_size=config.per_device_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,

    # Reporting / Hub upload (private repo, no experiment tracker).
    report_to='none',
    push_to_hub=True,
    hub_private_repo=True,
)

In [None]:
# Projection layers that receive a LoRA adapter.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=config.lora_r,                    # rank of the low-rank update matrices
    lora_alpha=config.lora_alpha,       # scaling factor applied to the LoRA update
    lora_dropout=config.lora_dropout,   # dropout on the adapter path
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the base model with the adapters and report how many weights will train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# Pad every batch to its longest member.
# NOTE(review): the seq2seq collator is presumably used because it pads the
# `labels` field alongside the inputs - confirm.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding='longest')

In [None]:
# Training-only Trainer: no evaluation dataset is attached.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=ds['train'],
    data_collator=collator,
)

In [None]:
# Run the fine-tuning loop, then push the result to the Hugging Face Hub
# (the training arguments request a private repo).
trainer.train()
trainer.push_to_hub()

In [None]:
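# To evaluate an adapter that was already pushed to the Hub instead of the one trained above, uncomment:
# model = PeftModel.from_pretrained(model, f'abdullahmeda/{config.output_model_id}')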

In [None]:
%%time

# Greedy-decode a score for every validation essay and collect them in `preds`.
preds = []

valid_rows = ds['valid'].to_pandas()[['essay_id', 'full_text', 'score']]

# Training leaves the model in train mode, which keeps the LoRA dropout
# active; switch to eval mode so generation is deterministic.
model.eval()

for i,row in valid_rows.iterrows():

    if i%100==0: print(i,', ',end='')

    tokenized_sample = preprocess(row, infer_mode=True, max_seq=2048, return_tensors="pt")

    # generate() only needs the ids and the mask; `preprocess` also attaches
    # `labels`, which have no purpose here. Follow the model's own device
    # rather than assuming 'cuda'.
    inputs = {key: tokenized_sample[key].to(model.device)
              for key in ("input_ids", "attention_mask")}
    generated_ids = model.generate(**inputs,
                                    max_new_tokens=2,
                                    pad_token_id=tokenizer.eos_token_id,
                                    do_sample=False)

    # Parse only the newly generated tokens, so the result does not depend on
    # re-finding the "The score is: " marker inside the decoded prompt.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(generated_ids[0, prompt_len:], skip_special_tokens=True)

    try:
        score = int(answer)
    except ValueError:
        # Not a number: fall back to the mid-scale default.
        score = 3
    # Anything outside the 1-6 rubric is treated like an unparseable answer.
    preds.append( score if 1 <= score <= 6 else 3 )

    if i==7: print(f'preds[:8]={preds}, ',end='')

In [None]:
# Generate confusion matrix
# Confusion matrix over the full 1-6 score scale.
# Passing `labels` pins the matrix to 6x6 in score order. Without it the
# shape follows whichever classes happen to occur in the data, and the fixed
# "Class 1..6" tick labels below could end up on the wrong rows and columns.
score_labels = list(range(1, 7))
cm = confusion_matrix(ds['valid']['score'], preds, labels=score_labels)

# Plot confusion matrix
tick_names = [f"Class {score}" for score in score_labels]
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=tick_names,
            yticklabels=tick_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Quadratic weighted kappa between the true validation scores and the predictions.
true_scores = ds['valid']['score']
qwk = cohen_kappa_score(true_scores, preds, weights="quadratic")
print(f'Validation QWK Score = {qwk}')