<h3 style="text-align: center;"> Language Models Lab2</h3>
<h5 style="text-align: center;"> Wenjie Hu 343312</h5>



In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from random import randint
from contextlib import suppress
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product


# P1

This task is quite open-ended: you need to ’play the role’ of a researcher and examine how well a chosen language model (approximately 100M-1B in size) handles calculations of relatively simple arithmetic expressions in a few-shot learning scenario.
What constitutes an arithmetic expression, how complex it is, how prompts are constructed, and how examples are chosen is up to you; to earn 5 points, only two requirements must be met:
1. You must find a scenario that shows that the model’s results in the given task are significantly better than random.
2. The findings from your research should be thorough and documented in code/notebook form.
Additional points are awarded at the discretion of the instructors to recognize students who put exceptional effort into this task or achieved particularly interesting results. Up to 5 points are included in the maximum. An interesting question is whether any bias of the model can be observed in this task

In [None]:

def load_model(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    def generate_response(prompt, max_new_tokens=16, **kwargs):
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            attention_mask=inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            **kwargs,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.removeprefix(prompt)

    return generate_response

class GPT2Calculator:
    def __init__(self, model_name="gpt2-medium", device="cpu"):
        self.model = load_model(model_name, device)
        self.prompt_template = """
        You are a calculator designed to solve mathematics problems. Compute the exact answer for the following:
        15 + 27 = 42
        8 + 3 = 11
        12 + 2 = 14
        100 + 25+= 125
        {} + {} =
        """.strip()

    def calculate(self, a, b):
        prompt = self.prompt_template.format(a, b)
        response = self.model(prompt, max_new_tokens=3, do_sample=False, temperature=0.0)
        with suppress(ValueError):
            return int(response.split()[0])
        return None


def evaluate_accuracy(calculator, num_samples=100, max_order=4):
    results = []

    for order, _ in product(range(1, max_order + 1), range(num_samples)):
        num_range = (2**(order - 1), 2**order)
        a, b = randint(*num_range), randint(*num_range)
        truth = a + b
        result = calculator.calculate(a, b)

        is_correct = result == truth
        results.append({
            "order": order,
            "a": a,
            "b": b,
            "truth": truth,
            "prediction": result,
            "is_correct": is_correct,
        })

    return pd.DataFrame(results)

def evaluate_random_baseline(num_samples=100, max_order=4):
    results = []
    for order, _ in product(range(1, max_order + 1), range(num_samples)):
        num_range = (2**(order - 1), 2**order)
        a, b = randint(*num_range), randint(*num_range)
        truth = a + b
        random_guess = randint(num_range[0] * 2, num_range[1] * 2)  # Random guess
        is_correct = random_guess == truth
        results.append({
            "order": order,
            "a": a,
            "b": b,
            "truth": truth,
            "random_guess": random_guess,
            "is_correct": is_correct,
        })

    return pd.DataFrame(results)


def summarize_results(df):
    summary = df.groupby("order").agg(
        accuracy=("is_correct", "mean"),
        total_samples=("is_correct", "size"),
    )
    summary["accuracy"] = summary["accuracy"] * 100
    return summary

if __name__ == "__main__":
    calculator = GPT2Calculator()
    num_samples = 100
    max_order = 10
    results_df = evaluate_accuracy(calculator, num_samples=num_samples, max_order=max_order)
    summary = summarize_results(results_df)
    # print(results_df)
    print("GPT-2 Model:")
    print(summary)
    random_results_df= evaluate_random_baseline(num_samples=num_samples, max_order=max_order)
    random_summary = summarize_results(random_results_df)
    print("=================================================")
    print("Random Result:")
    print(random_summary)




GPT-2 Model:
       accuracy  total_samples
order                         
1          74.0            100
2          67.0            100
3          30.0            100
4           1.0            100
5           1.0            100
6           5.0            100
7           0.0            100
8           0.0            100
9           0.0            100
10          0.0            100
Random Result:
       accuracy  total_samples
order                         
1          26.0            100
2          19.0            100
3          12.0            100
4           5.0            100
5           3.0            100
6           1.0            100
7           2.0            100
8           3.0            100
9           0.0            100
10          1.0            100


a little bit of comments of random guess method:  
  
order is the $n$ power of 2;  
num_range = $[2^{n-1},2^n]$  
a,b = randint(num_range[0]),randint(num_range[1])  
true_result = a+b  
random_guess_result = a random number between num_range[0]*2 and num_range[1]*2  

In [5]:
from random import randint

max_order = 10
num_range = (2**(max_order - 1), 2**max_order)
print("num_range:", num_range)
a, b = randint(*num_range), randint(*num_range)
true_answer = a + b
print(f"True answer: {a} + {b} = {true_answer}")
random_guess = randint(num_range[0] * 2, num_range[1] * 2)
print(f"Random guess: {random_guess}")

num_range: (512, 1024)
True answer: 545 + 710 = 1255
Random guess: 1303


# P2

In this task, you need to modify the token generation procedure in such a way that:
1. It generates exactly one word from a given set of words (this word may consist of more than one token), and
2. It is sufficiently efficient (capable of generating many continuations of a prefix in a reasonably short time).
Naturally, this may require generating a non-zero number of redundant tokens. Optional part:.Then use this procedure to solve the Riddles task1 from the AI Olympiad in the following way: create a prompt consisting of a riddle and some connector, then generate (several times?) possible one-word continuations of such a prompt (these continuations should, of course, come from the set of
possible answers). Provide the accuracy of this solution (for more than one model).
Note: This task will have a continuation, where points will be awarded for the achieved results

https://github.com/OlimpiadaAI/I-OlimpiadaAI/blob/main/first_stage/riddles/zagadki.ipynb

https://www.kaggle.com/datasets/prajwaldongre/riddles-a-synthetic-riddle-dataset-for-nlp?resource=download


In [1]:
!pip install gdown



In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from collections import Counter
import torch.nn.functional as F
import pandas as pd

In [3]:
class TokenTrie(dict):
    #restrict token from predefined wordsets.
    def __init__(self, sequences=None):
        super().__init__()
        self.terminal = False
        if sequences:
            for sequence in sequences:
                self.insert(sequence)

    def insert(self, sequence):
        # inserts a token sequence into the trie.
        if not sequence:
            self.terminal = True
            return self

        head, *tail = sequence
        if head not in self:
            self[head] = TokenTrie()
        self[head].insert(tail)
        return self

In [6]:
class OneWordGenerator:
    def __init__(self, model, tokenizer, words: list[str], device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        tokenized_words = [tokenizer(word, add_special_tokens=False).input_ids for word in words]
        self.trie = TokenTrie(tokenized_words)

    def restricted_sampling(self, input_ids, whitelist):

        with torch.no_grad():
            outputs = self.model(torch.tensor([input_ids], device=self.device))
            next_token_logits = outputs.logits[:, -1, :]

        probs = F.softmax(next_token_logits, dim=-1).squeeze()
        mask = torch.ones_like(probs, device=self.device)
        mask[whitelist] = 0
        probs.masked_fill_(mask.bool(), 0)
        probs /= probs.sum()
        # sample a token from the masked distribution
        return torch.multinomial(probs, num_samples=1).item()

    def __call__(self, prefix: str):
        # generates a word based on the prefix and is constrained by the trie.
        input_ids = self.tokenizer(prefix, add_special_tokens=False)["input_ids"]
        trie = self.trie

        while True:
            whitelist = list(trie.keys())
            if not whitelist:
                break

            sampled_token = self.restricted_sampling(input_ids, whitelist)
            input_ids.append(sampled_token)
            trie = trie[sampled_token]

        generated_sequence = self.tokenizer.decode(input_ids, skip_special_tokens=True)
        return generated_sequence[len(prefix):].strip()


if __name__ == "__main__":
    model_name = "gpt2"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    generator = OneWordGenerator(model, tokenizer, [], device=device)
    test_cases = [
        {"prefix": "The weather tomorrow will be", "wordset": ["sunny", "fish", "cat", "cow", "phone", "snowy"]},
        {"prefix": "My favorite food is", "wordset": ["pizza", "water", "sushi", "bag", "burger"]},
        {"prefix": "The color of the car is", "wordset": ["October", "fan", "green", "cheese", "coffe", "black"]},
        {"prefix": "The zoo's most popular animal is a", "wordset": ["lion", "tiger", "elephant", "giraffe", "panda", "zebra"]},
        {"prefix": "The most sport in my Uni is", "wordset": ["pizza", "beer", "tennis", "cricket", "baseball", "hockey"]},
    ]

    iterations = 100
    for i, case in enumerate(test_cases):
        prefix = case["prefix"]
        wordset = case["wordset"]
        generator = OneWordGenerator(model, tokenizer, wordset, device=device)

        frequencies = Counter()
        for _ in range(iterations):
            result = generator(prefix)
            frequencies[result] += 1
        print(f"Test {i + 1}:")
        print(f"Prefix: {prefix}")
        print(f"Wordset: {wordset}")
        print("Generated Word Frequencies:")
        for word, count in frequencies.items():
            print(f"  {word}: {count} times ({(count / iterations) * 100:.2f}%)")
        print("-" * 50)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Test 1:
Prefix: The weather tomorrow will be
Wordset: ['sunny', 'fish', 'cat', 'cow', 'phone', 'snowy']
Generated Word Frequencies:
  snowy: 88 times (88.00%)
  sunny: 6 times (6.00%)
  cat: 3 times (3.00%)
  fish: 3 times (3.00%)
--------------------------------------------------
Test 2:
Prefix: My favorite food is
Wordset: ['pizza', 'water', 'sushi', 'bag', 'burger']
Generated Word Frequencies:
  burger: 7 times (7.00%)
  pizza: 45 times (45.00%)
  sushi: 41 times (41.00%)
  water: 6 times (6.00%)
  bag: 1 times (1.00%)
--------------------------------------------------
Test 3:
Prefix: The color of the car is
Wordset: ['October', 'fan', 'green', 'cheese', 'coffe', 'black']
Generated Word Frequencies:
  green: 49 times (49.00%)
  black: 24 times (24.00%)
  cheese: 20 times (20.00%)
  coffe: 7 times (7.00%)
--------------------------------------------------
Test 4:
Prefix: The zoo's most popular animal is a
Wordset: ['lion', 'tiger', 'elephant', 'giraffe', 'panda', 'zebra']
Generated W

# P3

In this task, we will work on disambiguating text. We assume that the text with variants is given as follows:
- smart—smarty—small students—elephants—houses from—farm—firm—forum—fermi—fram—free
- many—yemen—mania—man—mine—mean—money countries—cantors—counters—centaurs—contours—centers
- study—stud—steady—suited—studio— at—taiyo—tye—tae—tay—ate—to—auto—yeti
- our—aria—euro university—universe—universal
  
Tokenization is performed using the split method, and (for convenience) the correct word is always in the first position (but you must not use this fact in any way).
Write a program that finds the correct word variant. Your program should use language models and some non-greedy optimization algorithm. This could be, for example, beam search, but other methods are also allowed.

In [37]:
def probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label


def sentence_prob(sentence_txt, model, tokenizer, device):
    input_ids = tokenizer(sentence_txt, return_tensors="pt")["input_ids"].to(device)
    with torch.no_grad():
        output = model(input_ids=input_ids)
        log_probs = probs_from_logits(output.logits[:, :-1, :], input_ids[:, 1:])
        seq_log_probs = torch.sum(log_probs)
    return seq_log_probs.cpu().item()


def beam_search(k, corpus, model, tokenizer, device):
    beam = corpus[0]
    for words in corpus[1:]:
        # expand each current beam prefix with all words
        expanded = [
            f"{prefix} {w}" for prefix in beam for w in words
        ]
        beam = sorted(expanded, key=lambda s: sentence_prob(s, model, tokenizer, device), reverse=True)[:k]
        print("Current Beam:", beam)

    return beam


if __name__ == "__main__":
    model_name = "gpt2"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    input_text = (
        "smart—smarty—small students—elephants—houses "
        "from—farm—firm—forum—fermi—fram—free "
        "many—yemen—mania—man—mine—mean—money "
        "countries—cantors—counters—centaurs—contours—centers "
        "study—stud—steady—suited—studio— "
        "at—taiyo—tye—tae—tay—ate—to—auto—yeti "
        "our—aria—euro university—universe—universal"
    )

    corpus = [group.split("—") for group in input_text.split()]
    k = 3
    best_sequences = beam_search(k, corpus, model, tokenizer, device)
    print("\nBest Sequences:")
    for seq in best_sequences:
        print(seq)


Current Beam: ['small students', 'smart students', 'small houses']
Current Beam: ['small students from', 'smart students from', 'small houses from']
Current Beam: ['small students from many', 'smart students from many', 'small students from mine']
Current Beam: ['smart students from many countries', 'small students from many countries', 'small students from many centers']
Current Beam: ['smart students from many countries study', 'small students from many countries study', 'smart students from many countries ']
Current Beam: ['smart students from many countries study at', 'small students from many countries study at', 'smart students from many countries study to']
Current Beam: ['smart students from many countries study at our', 'small students from many countries study at our', 'smart students from many countries study to our']
Current Beam: ['small students from many countries study at our university', 'smart students from many countries study at our university', 'smart students from

# P4

https://github.com/OlimpiadaAI/I-OlimpiadaAI/blob/main/first_stage/riddles/zagadki.ipynb

In this task, we will generate sentences in natural language that meet an additional property: all words should start with the same letter. The generation should be prompted using randomly chosen word (is up to you how to design this process). Your generations should end with a period (or another punctuation mark ending the sentence).
Your program should:
1. Use both top-k and top-p sampling.
2. Modify the token probability distribution.
3. Generate ¨proper¨texts, meaning words consisting of letters, with spaces as separators, and
properly formatted punctuation (e.g., commas should be attached to the preceding word).
4. Avoid repetitions.
5. Generate multiple variants and select the best one according to a criterion defined by you.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor
import torch

In [None]:
class StrictStartingLetterProcessor(LogitsProcessor):
    def __init__(self, tokenizer, letter):
        self.tokenizer = tokenizer
        self.letter = letter.lower()

    def __call__(self, input_ids, scores):
        #only allow tokens starting with the specified letter.
        for i in range(scores.size(1)):
            token = self.tokenizer.decode([i]).strip().lower()
            if not token.startswith(self.letter):
                scores[0, i] = -float("inf")
        return scores


def generate_sentence_with_custom_logits(
    prefix, max_length=50, top_k=50, top_p=0.9, temperature=0.7, device="cuda"
):
    #all words start with the same letter.
    letter = prefix[0].lower()
    inputs = tokenizer(prefix, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    logits_processor = [StrictStartingLetterProcessor(tokenizer, letter)]

    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=1.8,
        do_sample=True,
        logits_processor=logits_processor,
        pad_token_id=tokenizer.pad_token_id,
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    if not generated_text.endswith(('.', ';', '?', '!')):
        generated_text += "."
    return generated_text.capitalize()

def generate_best_sentence_with_filter(
    prefix, max_length=50, top_k=50, top_p=0.9, temperature=0.7, n_variants=5, device="cuda"
):
    candidates = [
        generate_sentence_with_custom_logits(
            prefix, max_length, top_k, top_p, temperature, device=device
        )
        for _ in range(n_variants)
    ]

    best_sentence = max(candidates, key=len)
    return best_sentence

def test_prefixes(
    prefixes,
    max_length=20,
    top_k=50,
    top_p=0.9,
    temperature=0.7,
    n_variants=3,
    device="cuda"
):
    results = {}
    for prefix in prefixes:
        best_sentence = generate_best_sentence_with_filter(
            prefix,
            max_length=max_length,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            n_variants=n_variants,
            device=device,
        )
        results[prefix] = best_sentence

    return results

if __name__ == "__main__":
    test_prefixes_list = [
        "Awesome",
        "Brilliant",
        "Creative",
        "Wonderful",
        "Quick",
        "Strong",
    ]

    results = test_prefixes(
        prefixes=test_prefixes_list,
        max_length=20,
        top_k=50,
        top_p=0.9,
        temperature=0.7,
        n_variants=3,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    for prefix, sentence in results.items():
        print(f"{prefix}: {sentence}")


Awesome: Awesome and all around awesome app as always are available at a affordable amount also adding an amazing addition anytime.
Brilliant: Brilliant british band bblb began by blasting backbeat beatboards before beginning blisterbeb.
Creative: Creative commons cc click creative commons credential checklist can create creative content creation challenges creatively creating collaborative code coverage.
Wonderful: Wonderful with well written words which will work wonders when working without writing while waiting within window windows where.
Quick: Quick question quickly questions quite quick query queries quqqueryquery query quqsq question quick quizquick.
Strong: Strong said she still saw some signs suggesting someone should step sideways so something similar seemed suspicious somewhere south side.
