In [1]:
!source ../sel-env/bin/activate
!module load cuda/12.1

Loading [1mcuda/12.1[22m[m
  [94mLoading requirement[0m: cudnn/8.9.1-cu12.x[m
[K[?1l>

In [None]:
#!pip install transformers datasets evaluate accelerate torch torchvision torchaudio 

In [2]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/tsuehr/.cache/huggingface/token
Login successful


In [14]:
import numpy as np
import torch
import transformers
import datasets
from tqdm import tqdm
softmax = torch.nn.Softmax(-1)
from scipy.stats import beta

## Tokens and Vocabulary

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("TinyLlama/TinyLlama_v1.1")

len(tokenizer)
tokenizer.encode("hello, my name is tom")
# tokenizer.decode(1)
# tokenizer.decode(22172)

## Embeddings

In [None]:
from transformers import BertTokenizer, BertModel
from transformers import logging

# Set the logging level to ERROR to avoid seeing warnings
logging.set_verbosity_error()

# Load a pretrained BERT model and its tokenizer from Hugging Face
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Example sentence to encode
sentence = "I love natural language processing"

# Tokenize the input sentence
inputs = tokenizer(sentence, return_tensors='pt')
print(f"Prompt: {sentence}")
print(f"Tokens: {inputs.input_ids}")

# Forward pass through the model to get the hidden states (embeddings)
with torch.no_grad():
    outputs = model(**inputs)

# The embeddings are in the 'last_hidden_state' output
embeddings = outputs.last_hidden_state

# Show the embeddings for the tokens in the sentence
print("Shape of embeddings:", embeddings.shape)
print("Embeddings:", embeddings)

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

#We can use this if we want the embeddings of our model as output
# model = transformers.AutoModelForCausalLM.from_pretrained("gpt2", output_hidden_states=True)

#or meta-llama/Llama-3.2-1B
#or meta-llama/Llama-3.1-8B-Instruct
#or "gpt2"
#or "TinyLlama/TinyLlama_v1.1"

def get_logprob(model,tokenizer,prompt, completion):
    # Encode both the prompt and the prompt + completion
    # We need to encode the prompt to know how many tokens to mask out
    prompt_tokenized = tokenizer.encode(prompt)
    print(prompt)
    print(prompt_tokenized)
    prompt_completion_tokenized = tokenizer.encode(prompt + completion)
    print(prompt + completion)
    print(prompt_completion_tokenized)
    
    # The input_ids are simply the tokenized prompt + completion
    input_ids = torch.tensor([prompt_completion_tokenized])
    print(input_ids)

    # The labels are the same as the input_ids, but with the prompt tokens masked out
    labels = input_ids.clone()

    labels[:, :len(prompt_tokenized)] = -100 # cross entropy loss ignores labels set to -100 (we mask the prompt)
    
    # Pass the input_ids and labels to the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, labels=labels)
    
    # Extract the log probability from the model output (e.g., the loss)
    loss = outputs.loss.item()
    logprob = -loss
    return logprob

def get_next_word(model,tokenizer,prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')  # Return as a tensor

    # Generate the next token
    with torch.no_grad():
        #For text generation, we use the .generate() method
        outputs = model.generate(input_ids, max_new_tokens=1, do_sample=False)  # Predict the next token
#         outputs = model.generate(input_ids, max_new_tokens=1, do_sample=True, temperature=0.7)

    # Decode the output to get the predicted next word
    print(outputs[0])
    next_word = tokenizer.decode(outputs[0][-1], skip_special_tokens=False)

    return next_word

def get_answer_option_probabilities(model,tokenizer,prompt,options):
    # Encode both the prompt and the prompt + completion
    # We need to encode the prompt to know how many tokens to mask out
    prompt_tokenized = tokenizer.encode(prompt)
    
    # The input_ids are simply the tokenized prompt + completion
    input_ids = torch.tensor([prompt_tokenized])
    # The labels are the same as the input_ids, but with the prompt tokens masked out
    labels = input_ids.clone()

    # Pass the input_ids and labels to the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, labels=labels)
        
    logits = outputs.logits[:,-1,:][0] # output.logits has the shape (batch_size, sequence_length, vocab_size)
    
    option_token_ids = np.array([i[-1] for i in tokenizer(list(options))["input_ids"]])
    probs = softmax(logits)[option_token_ids]
    print(options)
    print(probs)
    print(f"Most likely answer is: {options[torch.argmax(probs)]}")
#     predicted_token_id = torch.argmax(logits, dim=-1)
#     print(tokenizer.decode(predicted_token_id))

In [None]:
prompt = "The capital of Spain is"
completions = [" Madrid", " Barcelona", " Paris", " Rome"]
completion = " Madrid"

logprob = get_logprob(model,tokenizer,prompt,completion)
print(f"Log probability of '{completion}' given ´´{prompt}´´: {logprob:.2f}")


In [None]:
next_word = get_next_word(model,tokenizer,prompt)
print(f"The predicted next word is :{next_word}")

### MC-Question Answering (MCQA)

In [None]:
prompt = "What is the capital of Spain?" + '\n' + 'A:Madrid, B:Barcelona, C:Paris, D: Rome' + '\nAnswer'
options= ['A','B','C','D']
get_answer_option_probabilities(model,tokenizer,prompt,options)

### A look inside Llama

In [None]:
#from https://github.com/rasbt/LLMs-from-scratch/blob/main/ch05/07_gpt_to_llama/standalone-llama32.ipynb

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att =  GroupedQueryAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            num_kv_groups=cfg["n_kv_groups"],
            rope_base=cfg["rope_base"],
            rope_config=cfg["rope_freq"],
            dtype=cfg["dtype"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)
        self.norm2 = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x.to(torch.bfloat16))   # Shape [batch_size, num_tokens, emb_size]
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x.to(torch.bfloat16))
        x = x + shortcut  # Add the original input back

        return x
    
class Llama3Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"], dtype=cfg["dtype"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False, dtype=cfg["dtype"])

    def forward(self, in_idx):
        tok_embeds = self.tok_emb(in_idx)
        x = tok_embeds
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x.to(torch.bfloat16))
        return logits

### Pseudocode for supervised learning with SGD

In [None]:
#Pseudocode for supervised learning

# for epoch in range(num_epochs):
#     for x, y in data:

#         # Forward pass
#         prediction = model(x)

#         # Calculate loss
#         loss = criterion(prediction, y) # e.g. binary cross entropy loss

#         # Backward pass and optimization
#         loss.backward()


### Pseudocode for Reinforcement Learning with PPO

In [None]:
for episode in range(num_episodes):
    state = environment.reset() #reset the world
    episode_reward = 0
    for t in range(update_steps):
        # sample an action according to the current policy
        # sampling an action is like sampling a token
        action, log_prob = ppo_agent.policy.act(state)
        
        # take action and get reward and next state
        # the reward is learned as a function of user rankings of different outputs
        next_state, reward, done, _ = env.step(action)
        value = ppo_agent.policy.forward(state)[1].item()

        # Store transition
        ppo_agent.store_transition((state, action, reward, done, log_prob.item(), value))

        state = next_state
        episode_reward += reward

        if done:
            break
    # Update PPO after every episode
    ppo_agent.update()

env.close()

### Fine-Tuning with axolotl

In [None]:
#https://github.com/axolotl-ai-cloud/axolotl

### Fine-Tuning with Embeddings

In [12]:
class Embedding_Model:
    def __init__(self, model_name='gpt2', **kwargs):
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        self.model = transformers.AutoModelForCausalLM.from_pretrained(model_name, output_hidden_states=True, **kwargs)
        self.max_length = self.tokenizer.model_max_length


    def get_embedding(self, text):
        """ Obtain the vector embedding of a text by taking the hidden state of the last layer of the model
        corresponding to the last token.

        Inputs:
        - text: a string

        Returns:
        - embedding: a numpy array of shape (n,), where n is the dimensionality of the embedding
        """

        # tokenize
        input_ids = self.tokenizer(text).input_ids

        # truncate from the left such that we keep the QA template if there is one
        input_ids = input_ids[-min(len(input_ids), self.max_length):]

        # convert to torch tensor and add batch dimension
        input_ids = torch.tensor(input_ids).unsqueeze(0).to(self.model.device)

        with torch.no_grad():
            outputs = self.model(input_ids)

        last_layer = outputs['hidden_states'][-1]
        last_layer_last_token = last_layer[0, -1]
        embedding = last_layer_last_token.cpu().double().numpy()
        
        return embedding
    
def clopper_pearson(n_correct, n_trials, alpha=0.05):
    lower_ci = beta.ppf(alpha / 2, n_correct, n_trials - n_correct + 1)
    upper_ci = beta.ppf(1 - alpha / 2, n_correct + 1, n_trials - n_correct)
    return lower_ci, upper_ci

In [5]:
model = Embedding_Model(model_name='gpt2')



In [6]:
task = datasets.load_dataset('ricdomolm/lawma-tasks', 'sc_issuearea')

README.md:   0%|          | 0.00/51.8k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/87.3M [00:00<?, ?B/s]

val-00000-of-00001.parquet:   0%|          | 0.00/12.5M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/25.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6226 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/890 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1773 [00:00<?, ? examples/s]

In [8]:
#custom data format is
#{opinion:"...", question: "...", choices:[...]}

# print example
print('Example court opinion:')
print("----------------")
print(task['train'][0]['opinion'])
print("----------------")
print("\n\nQuestion:", task['train'][0]['question'])
print("----------------")
print("Classes:", task['train'][0]['choices'])

Example court opinion:
----------------
ALABAMA et al. v. PUGH et al.
No. 77-1107.
Decided July 3, 1978
Per Curiam.
Respondents, inmates or former inmates of the Alabama prison system, sued petitioners, who include the State of Alabama and the Alabama Board of Corrections as well as a number of Alabama officials responsible for the administration of its prisons, alleging that conditions in Alabama prisons constituted cruel and unusual punishment in violation of the Eighth and Fourteenth Amendments. The United States District Court agreed and issued an order prescribing measures designed to eradicate cruel and unusual punishment in the Alabama prison system. The Court of Appeals for the Fifth Circuit affirmed but modified some aspects of the order which it believed exceeded the limits of the appropriate exercise of the court's remedial powers. 559 F. 2d 283.
Among the claims raised here by petitioners is that the issuance of a mandatory injunction against the State of Alabama and the Al

In [9]:
# subsample the training and test sets to make training and evaluation faster
train_set = task['train'].shuffle(seed=1).select(range(500))
test_set = task['test'].shuffle(seed=1).select(range(200))

# only include the first 3500 characters of the opinion, to make training faster
train_set = train_set.map(lambda x: {'opinion': x['opinion'][:3500]})
test_set = test_set.map(lambda x: {'opinion': x['opinion'][:3500]})

# get the labels of each example
train_labels = [example['answer'][0] for example in train_set]
test_labels = [example['answer'][0] for example in test_set]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [10]:
train_embeddings = np.stack([model.get_embedding(example['opinion']) for example in tqdm(train_set)])
test_embeddings = np.stack([model.get_embedding(example['opinion']) for example in tqdm(test_set)])

 95%|█████████████████████████████████████████████████████████████████████████████▎   | 477/500 [02:09<00:06,  3.61it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1358 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████| 500/500 [02:15<00:00,  3.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 200/200 [00:53<00:00,  3.73it/s]


In [15]:
majority_class = max(set(train_labels), key=train_labels.count)
accuracy_majority = train_labels.count(majority_class) / len(train_labels)
print(f"Accuracy of the majority class classifier: {accuracy_majority:.3f}")

from sklearn.linear_model import LogisticRegression

def get_performance(train_embeddings, test_embeddings, tol=1e-3):
    clf = LogisticRegression(max_iter=100000, tol=tol)

    clf = clf.fit(train_embeddings, train_labels)
    y_pred = clf.predict(test_embeddings)

    n_trials = len(test_labels)
    n_correct = sum(y_pred == test_labels)

    accuracy = n_correct / n_trials
    lower_ci, upper_ci = clopper_pearson(n_correct, n_trials)
    return accuracy, (lower_ci, upper_ci)

accuracy, (lower_ci, upper_ci) = get_performance(train_embeddings, test_embeddings, tol=1e-4)

### Delete everything above

print(f"Accuracy of our classifier: {accuracy:.3f} ({lower_ci:.3f}, {upper_ci:.3f})")

Accuracy of the majority class classifier: 0.236
Accuracy of our classifier: 0.305 (0.242, 0.374)
