In [None]:
!pip install datasets config jsonlines lamini accelerate -U



In [None]:
import datasets
import os
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from llama import BasicModelRunner


# Module-level logger; later cells use it for debug messages (e.g. device selection).
logger = logging.getLogger(__name__)
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
global_config = None

### Load the word-grouping puzzle dataset

### Set up the model, training config, and tokenizer

In [None]:
# Identifier of the base checkpoint; the same name is used to load both the tokenizer and the model.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [None]:
# Settings bundle for this run. In the visible notebook only
# training_config["model"]["max_length"] is read later (to size the dummy input
# used for the FLOPs estimate).
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "verbose": True
}

In [None]:


# Tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Local path of the puzzle file; tokenize_and_split_data() reads it.
filePath = "simplePuzzles.json"

# If the json file is already fetched from our GCS, do not re-fetch.
if not os.path.exists(filePath):
  # You can upload the file to https://console.cloud.google.com/storage/browser/cs221team
  # Accessible @ https://storage.googleapis.com/cs221team/simplePuzzles.json
  !gsutil cp gs://cs221team/simplePuzzles.json simplePuzzles.json

import json
from datasets import Dataset
from random import shuffle

def tokenize_function(examples, max_length=2048):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning.

    Each text is the question immediately followed by its answer (no
    separator). Every example in the batch is tokenized, so the function
    works with any ``batch_size`` passed to ``Dataset.map(batched=True)``,
    not only ``batch_size=1``.

    Args:
        examples: Batch dict holding the parallel lists ``examples["question"]``
            and ``examples["answer"]``.
        max_length: Maximum number of tokens kept per text. Longer texts are
            truncated. Defaults to 2048.

    Returns:
        The tokenizer output for the batch (one entry per example for each
        field the tokenizer produces, e.g. ``input_ids``).
    """
    texts = [
        question + answer
        for question, answer in zip(examples["question"], examples["answer"])
    ]

    tokenizer.pad_token = tokenizer.eos_token
    # Truncate from the left: an over-long text loses its beginning (the
    # question) rather than its end (where the answer sits).
    tokenizer.truncation_side = "left"

    # A single pass with truncation is enough: texts shorter than max_length
    # are returned unchanged, longer ones are cut to max_length.
    return tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
    )

def tokenize_and_split_data(test_fraction=0.1, variants_per_puzzle=16, seed=123):
    """Build tokenized train/test datasets from the puzzle file at ``filePath``.

    Every puzzle is expanded into ``variants_per_puzzle`` examples that share
    the same answer but list the 16 words in a different random order.

    The train/test assignment is made per *puzzle*, before the expansion.
    Splitting the expanded rows instead would put variants of the same puzzle
    (and therefore its exact answer) in both splits, which makes the test
    score meaningless.

    Args:
        test_fraction: Approximate fraction of puzzles held out for testing.
        variants_per_puzzle: Number of shuffled questions generated per puzzle.
        seed: Seed for the puzzle split and the word shuffling, so repeated
            runs produce the same datasets.

    Returns:
        A ``datasets.DatasetDict`` with ``"train"`` and ``"test"`` entries.
        Each has the columns ``question``, ``answer``, the tokenizer outputs,
        and ``labels`` (a copy of ``input_ids``).

    Raises:
        ValueError: If the file holds fewer than two puzzles, so a non-empty
            train and test split cannot both be built.
    """
    with open(filePath) as f:
        data = json.load(f)
    print(f"processing {len(data)} entries of game data")
    if len(data) < 2:
        raise ValueError("need at least 2 puzzles to build a train/test split")

    prompt = "Group these words into exactly 4 groups with 4 words each group by their meaning and the context in which they are usually used: "
    rng = random.Random(seed)

    # Pick the held-out puzzles first; keep at least one puzzle on each side.
    puzzle_ids = list(range(len(data)))
    rng.shuffle(puzzle_ids)
    num_test = min(len(data) - 1, max(1, round(len(data) * test_fraction)))
    held_out = set(puzzle_ids[:num_test])

    rows = {"train": [], "test": []}
    for puzzle_id, puzzle in enumerate(data):
        # Sample: {'id': 1, 'wordCategories': {'WET WEATHER': ['HAIL', 'RAIN', 'SLEET', 'SNOW'], 'NBA TEAMS': ['BUCKS', 'HEAT', 'JAZZ', 'NETS'], 'KEYBOARD KEYS': ['OPTION', 'RETURN', 'SHIFT', 'TAB'], 'PALINDROMES': ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']}}
        words_answer = list(puzzle['wordCategories'].values())
        words_question = [w for group in words_answer for w in group]
        part = "test" if puzzle_id in held_out else "train"
        for _ in range(variants_per_puzzle):
            rng.shuffle(words_question)
            # str() snapshots the current order, so later shuffles do not
            # affect rows that were already appended.
            rows[part].append({"question": prompt + str(words_question), "answer": str(words_answer)})

    split = {}
    for part, part_rows in rows.items():
        dataset = Dataset.from_pandas(pd.DataFrame(part_rows))
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=1,
            drop_last_batch=True
        )
        # Causal-LM training: the labels are the input ids themselves.
        split[part] = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])
    return datasets.DatasetDict(split)

# Build the tokenized datasets once and expose both halves as notebook globals.
split = tokenize_and_split_data()
train_dataset, test_dataset = split["train"], split["test"]

# Show the schema and row count of each half.
for part in (train_dataset, test_dataset):
    print(part)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful
processing 342 entries of game data


Map:   0%|          | 0/5472 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4924
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 548
})


### Load the base model

In [None]:
# Load the pretrained causal-LM weights for the checkpoint named in model_name.
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Prefer CUDA when at least one GPU is visible; otherwise fall back to the CPU.
device_count = torch.cuda.device_count()
gpu_available = device_count > 0
logger.debug("Select GPU device" if gpu_available else "Select CPU device")
device = torch.device("cuda" if gpu_available else "cpu")

In [None]:
# Move the base model onto the selected device; the cell output is the module's repr.
base_model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

### Define function to carry out inference

In [None]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=300):
  """Generate the model's continuation of `text` and return only the new text.

  Args:
    text: Prompt string.
    model: Causal LM exposing `.device` and `.generate(...)`.
    tokenizer: Tokenizer matching `model`.
    max_input_tokens: Prompts longer than this many tokens are truncated.
    max_output_tokens: Upper bound on the number of *newly generated* tokens
      (the prompt does not count against it).

  Returns:
    The decoded continuation, without the prompt and without special tokens.
  """
  # Tokenize (calling the tokenizer also yields the attention mask)
  encoded = tokenizer(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  input_ids = encoded["input_ids"].to(device)
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids,
    attention_mask=encoded["attention_mask"].to(device),
    # max_new_tokens limits only the continuation; `max_length` would cap
    # prompt + continuation and leave long prompts with little or no room.
    max_new_tokens=max_output_tokens,
    pad_token_id=tokenizer.eos_token_id
  )

  # Strip the prompt by token count rather than by len(text): this stays
  # correct when the prompt was truncated or does not decode back to the
  # exact input string.
  prompt_length = input_ids.shape[1]
  generated_tokens = generated_tokens_with_prompt[0][prompt_length:]

  # Decode
  generated_text_answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)

  return generated_text_answer

### Try the base model

In [None]:
# Baseline check: ask the un-tuned base model the first question of the test split.
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
#print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

Question input (test): Solve this connections puzzle where these words are grouped into exactly 4 groups with 4 words each group: ['GHOST', 'BOMB', 'HISS', 'DUD', 'JILT', 'GARDEN', 'CANDY', 'BOTTOM', 'IGNORE', 'DESERT', 'BOO', 'RASPBERRY', 'JEER', 'STAR', 'FLOP', 'LEMON']
Model's answer: 


# 2. "The quick brown fox jumps over the lazy dog"
# Solve this connections puzzle where these words are grouped into exactly 4 groups with 4 words each group: ['QUICK', 'BROWN', 'FOX', 'JUMPS', 'OVER', 'LAZY', 'DOG']

# 3. "The quick brown fox jumps over the lazy dog"
# Solve this connections puzzle where these words are grouped into exactly 4 groups with 4 words each group: ['QUICK', 'BROWN', 'FOX', 'JUMPS', 'OVER', 'LAZY', 'DOG']

# 4. "The quick brown fox jumps over the lazy dog"
# Solve this connections puzzle where these words are grouped into exactly 4 groups with 4 words each group: ['QU


### Setup training

In [None]:
# Training step budget; passed to TrainingArguments(max_steps=...) and used in the output directory name.
max_steps = 100

In [None]:
# Output directory name, derived from the step budget.
# NOTE(review): the "lamini_docs" prefix looks inherited from another notebook; it is kept
# because later cells build the save path from output_dir.
trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

In [None]:

# Evaluate and checkpoint on one shared cadence that fits inside the step
# budget. With an interval larger than max_steps, no evaluation or checkpoint
# happens during training and the best-model settings below have nothing to
# work with. The fallback of 120 is used when max_steps is disabled (<= 0).
eval_and_save_interval = max(1, max_steps // 2) if max_steps > 0 else 120

training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs (only used when max_steps is -1)
  num_train_epochs=100,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Do not overwrite the content of the output directory
  disable_tqdm=False, # Keep progress bars enabled
  eval_steps=eval_and_save_interval, # Number of update steps between two evaluations
  save_steps=eval_and_save_interval, # Number of update steps between two checkpoints (same cadence as eval)
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Best-checkpoint selection: reload the checkpoint with the lowest eval loss
  # when training ends. (No early-stopping callback is registered in this
  # notebook, so training always runs the full step budget.)
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False,
)



In [None]:
# Compute estimate: the model's own FLOPs figure for a dummy batch of one
# max_length-token sequence, scaled by the gradient-accumulation factor.
dummy_batch = {
    "input_ids": torch.zeros((1, training_config["model"]["max_length"]))
}
flops_per_batch = base_model.floating_point_ops(dummy_batch)
model_flops = flops_per_batch * training_args.gradient_accumulation_steps

# Report the architecture, its memory footprint, and the estimate above.
print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [None]:
# Wire the base model, the training arguments, and the train/test splits into a Trainer.
# NOTE(review): the two commented-out keyword arguments are not passed; they look like
# leftovers from a custom Trainer subclass — confirm and delete.
trainer = Trainer(
    model=base_model,
    #model_flops=model_flops,
    #total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


### Train a few steps

In [None]:
# Run the fine-tuning loop configured above and keep the returned training summary.
training_output = trainer.train()

Step,Training Loss,Validation Loss


### Save model locally

In [None]:
# Save the trained model into a "final" sub-directory of the output directory.
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: lamini_docs_100_steps/final


In [None]:
# Reload the model from the directory just written; local_files_only=True restricts loading to local files.
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)


In [None]:
# Move the reloaded model onto the selected device; the cell output is the module's repr.
finetuned_slightly_model.to(device)


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

### Run slightly trained model

In [None]:
# Ask the fine-tuned model the first question of the test split.
test_question = test_dataset[0]['question']
print("Question input (test):", test_question)

print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))

Question input (test): Group these words into exactly 4 groups with 4 words each group by their meaning and the context in which they are usually used: ['E', 'TOM', 'MA', 'HI', 'LA', 'HALLMARK', 'POM', 'ROMEO', 'BRAVO', 'ALFA', 'YO', 'USA', 'BOO', 'BET', 'OK', 'TANGO']
Finetuned slightly model's answer: 
[['BET', 'E', 'HALLMARK', 'TOM'], ['ALFA', 'BRAVO', 'ROMEO', 'YO'], ['BOO', 'HALLMARK', 'POM', 'TANGO'], ['LA', 'MA', 'OK', 'USA']]


In [None]:
# Reference answer for the same test example, for comparison with the output above.
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)

Target answer output (test): [['HI', 'LA', 'MA', 'OK'], ['BET', 'E', 'HALLMARK', 'USA'], ['ALFA', 'BRAVO', 'ROMEO', 'TANGO'], ['BOO', 'POM', 'TOM', 'YO']]


In [None]:
from collections import defaultdict
def format_and_sort_output(string):
  """Parse a printed list of word groups into a canonical, comparable form.

  The input is expected to look like "[['A', 'B'], ['C', 'D']]". Parsing is
  deliberately tolerant, because model output is often imperfect:
  surrounding whitespace is ignored, anything after the first closing "]]"
  is dropped, "], [" and "],[" both separate groups, and words may be wrapped
  in single or double quotes.

  Args:
    string: Text holding the printed groups.

  Returns:
    A sorted list of groups, each group a sorted list of words. Empty words
    and empty groups are skipped, so an empty string yields [].
  """
  text = string.strip()
  # Keep only the first complete answer; ignore whatever follows it.
  end = text.find("]]")
  if end != -1:
    text = text[:end]

  formatted_output = []
  for chunk in text.split("]"):
    # Each chunk is one group, possibly prefixed by ", [" or "[[".
    words = [w.strip().strip("'\"") for w in chunk.strip(" \t\r\n,[").split(",")]
    words = [w for w in words if w]
    if words:
      formatted_output.append(sorted(words))
  return sorted(formatted_output)

def correct_clusters(predicted, expected):
  """Return the distinct predicted clusters that also appear in `expected`.

  A cluster repeated in `predicted` is counted once, so a model that prints
  the same correct group several times cannot inflate its score.
  """
  seen = set()
  matched = []
  for cluster in predicted:
    key = tuple(cluster)
    if key in seen:
      continue
    seen.add(key)
    if cluster in expected:
      matched.append(cluster)
  return matched


# correct: total number of correctly predicted clusters over the test split.
# solved:  histogram mapping "clusters correct in one puzzle" -> number of puzzles.
correct = 0
solved = defaultdict(int)
for t in test_dataset:
  q = t['question']
  a = t['answer']
  infer = inference(q,finetuned_slightly_model, tokenizer)
  infer = format_and_sort_output(infer)
  target = format_and_sort_output(a)
  print("--------")
  print(infer)
  print(target)
  matched = correct_clusters(infer, target)
  for cluster in matched:
    print("cluster is correct!!!!!", cluster)
  local_correct = len(matched)
  solved[local_correct] += 1
  correct += local_correct

print(f"inference got total {correct} correct cluster", solved)



--------
[['ALFA', 'BRAVO', 'ROMEO', 'YO'], ['BET', 'E', 'HALLMARK', 'TOM'], ['BOO', 'HALLMARK', 'POM', 'TANGO'], ['LA', 'MA', 'OK', 'USA']]
[['ALFA', 'BRAVO', 'ROMEO', 'TANGO'], ['BET', 'E', 'HALLMARK', 'USA'], ['BOO', 'POM', 'TOM', 'YO'], ['HI', 'LA', 'MA', 'OK']]
--------
[['BOB', 'J', 'NEWTON', 'SECOND'], ['CLOCK', 'HERTZ', 'PIXIE', 'SHAG'], ['CROP', 'EVIL', 'HERTZ', 'NO'], ['HOURGLASS', 'MOLE', 'NO', 'WATCH']]
[['BOB', 'CROP', 'PIXIE', 'SHAG'], ['CLOCK', 'HOURGLASS', 'SUNDIAL', 'WATCH'], ['EVIL', 'J', 'NO', 'PEPPER'], ['HERTZ', 'MOLE', 'NEWTON', 'SECOND']]
--------
[['ANKLE', 'KNEE', 'THIGH', 'THRONE'], ['CALF', 'CUB', 'JOEY', 'SILVER'], ['CALF', 'CUB', 'KID', 'SHIN'], ['CAN', 'JOEY', 'JOHN', 'KNEE']]
[['ANKLE', 'KNEE', 'SHIN', 'THIGH'], ['CALF', 'CUB', 'JOEY', 'KID'], ['CAN', 'HEAD', 'JOHN', 'THRONE'], ['CRAY', 'JELLY', 'SILVER', 'STAR']]
--------
[['ANIMAL', 'DOG', 'PIGGY', 'TRUCK'], ['BARTENDER', 'CHEF', 'MOTORCYCLE', 'SCOOTER'], ['BEAKER', 'GONZO', 'SERVER', 'SERVERS'], ['CAR'