<h3 style="text-align: center;"> Language Models Lab 1</h3>
<h5 style="text-align: center;"> Wenjie Hu 343312</h5>



### P1

In [None]:
import torch
import itertools
from itertools import product
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer, GPT2LMHeadModel

In [3]:
model_name = "gpt2" 
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.config.pad_token_id = model.config.eos_token_id if model.config.eos_token_id is not None else 50256

system_prompt = """
Introduction:
The user is contacting the assistant. The assistant is helpful and responds shortly.
Dialogue:"""

print()
while (user_prompt := input().strip()):

    prompt = f"""{system_prompt}
        User: \"{user_prompt}\"
        Assistant: \""""

    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    
    generated_ids = model.generate(
      **model_inputs,
      max_new_tokens=100,
      do_sample=True,
      penalty_alpha=0.6,  
      top_k=10, 
      pad_token_id=model.config.pad_token_id 
    )

    output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print(30 * '-', flush=True)
    print(f"User: \"{user_prompt}\"", flush=True)
    response = output.removeprefix(prompt).strip().replace("\\n", "\n").split("\n")[0].split("\"")[0] + "\""
    system_prompt = prompt + response
    print(f"GPT: \"{response}", flush=True)



------------------------------
User: "hello, how are you?"
GPT: "I'm just sitting here looking at the news."
------------------------------
User: "Do you like Sushi"
GPT: "I don't like Sushi"
------------------------------
User: "What's your favourite sport?"
GPT: "I think I like to swim. I like to go to a beach. I don't like to eat meat. I don't like to do things on my own."
------------------------------
User: "The weather is good today"
GPT: "Well, it's good for me now to be able to go to the beach tomorrow morning."
------------------------------
User: "Do you like the language model course?"
GPT: "I like the language model course. It's really good to know the basics so I don't want to do too many things and I like the environment a lot."


### P2

In [6]:

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

def score_sentence(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    return loss

def sentence_prob(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    return -loss

def format_sentence(words):
    sentence = ' '.join(words)
    return sentence.capitalize() + "."

def generate_permutations(words):
    permutations = list(itertools.permutations(words))
    scored_sentences = []
    for perm in permutations:
        sentence = format_sentence(perm)
        score = score_sentence(sentence, model, tokenizer)
        scored_sentences.append((sentence, score))
    scored_sentences.sort(key=lambda x: x[1])
    return scored_sentences

In [7]:
words = "John likes spinach very much".split()
# words = "Squirrels live in the park".split()
print("=== All Permutations ===")
permutations = generate_permutations(words)
for sentence, score in permutations[:10]:
    print(f"{sentence} (Score: {score})")

=== All Permutations ===
John very much likes spinach. (Score: 5.47634220123291)
John likes spinach very much. (Score: 5.814343452453613)
John likes very much spinach. (Score: 5.995514392852783)
Spinach very much likes john. (Score: 6.055178642272949)
Spinach likes john very much. (Score: 6.245352268218994)
Spinach john likes very much. (Score: 6.416581630706787)
Spinach likes very much john. (Score: 6.430891990661621)
Spinach john very much likes. (Score: 6.470345973968506)
John spinach likes very much. (Score: 7.153365135192871)
Spinach very much john likes. (Score: 7.340554714202881)


In [8]:

def sentence_prob(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    return -loss 

def test_synergy(w, v, wv):
    return sentence_prob(wv) > 0.85 * (sentence_prob(w) + sentence_prob(v))

def synergize(words):
    synergies = filter(lambda args: test_synergy(*args) and args[0] != args[1],
                       ((w, v, f"{w} {v}") for w, v in product(words, words)))
    
    return max(synergies, key=lambda args: sentence_prob(args[-1]), default=None)

def chunk_and_permute(words):
    words = set(words)
    while (s := synergize(words)): 
        w, v, wv = s
        words.remove(w)
        words.remove(v)
        words.add(wv)  # Merge the best pair
    
    final_permutations = list(itertools.permutations(words))
    scored_permutations = []
    for perm in final_permutations:
        sentence = " ".join(perm)
        score = sentence_prob(sentence)
        scored_permutations.append((sentence, score))
    
    scored_permutations.sort(key=lambda x: x[1], reverse=True)
    return scored_permutations[:10]


In [9]:
words = "Last night, I met a wonderful woman".split()
# words = "Last night, I met a wonderful woman who passionately talked about language model".split()
top_sentences = chunk_and_permute(words)
for sentence, score in top_sentences:
    print(f"{sentence} (Score: {score})")


Last wonderful night, I met a woman (Score: -4.553354263305664)
Last wonderful night, a woman I met (Score: -4.964489459991455)
Last wonderful night, met a woman I (Score: -5.478196620941162)
wonderful night, Last I met a woman (Score: -5.555985927581787)
woman Last wonderful night, I met a (Score: -5.566989421844482)
Last a wonderful night, woman I met (Score: -5.596669673919678)
Last a wonderful night, I met woman (Score: -5.679810523986816)
Last woman I met a wonderful night, (Score: -5.73226261138916)
Last I met a woman wonderful night, (Score: -5.784866809844971)
Last wonderful night, a woman met I (Score: -5.78619384765625)


## p3

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F

In [2]:
MODEL_NAME = 'flax-community/papuGaPT2'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

In [3]:
def cal_log_probs(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    log_probs = F.log_softmax(logits, dim=-1)
    label_log_probs = torch.gather(log_probs, 2, labels.unsqueeze(2)).squeeze(-1)
    return label_log_probs

def cal_accuracy(classifier_function) -> float:
    data = []
    with open("reviews_for_task3.txt", "r") as file:
        for line in file:
            label, *review_tokens = line.strip().split()
            review = " ".join(review_tokens)
            prediction = classifier_function(review)
            data.append(label == prediction)
    
    return sum(data) / len(data)

one approach is combine review with the label and get its log probability


In [4]:
def classify_review_1(review: str) -> str:
    labels = ["Opinia jest pozytywna.", "Opinia jest negatywna."]
    probs = {}

    for label in labels:
        input_text = f"{review} {label}"
        input_ids = tokenizer(input_text, return_tensors='pt')['input_ids'].to(DEVICE)
        
        with torch.no_grad():
            output = model(input_ids=input_ids)
            log_probs = cal_log_probs(output.logits[:, :-1, :], input_ids[:, 1:])
            probs[label] = torch.sum(log_probs).item()

    return "GOOD" if probs[labels[0]] > probs[labels[1]] else "BAD"

print(f"Classifier Accuracy: {cal_accuracy(classify_review_1) * 100:.2f}%")


Classifier Accuracy: 78.75%


another appoarch is classify based on the average log probability


In [5]:
# classify based on the average log probability
def classify_review_2(review: str, threshold: float = 0.5) -> str:

    input_ids = tokenizer(review, return_tensors='pt')['input_ids'].to(DEVICE)
    
    with torch.no_grad():
        output = model(input_ids=input_ids)
        log_probs = cal_log_probs(output.logits[:, :-1, :], input_ids[:, 1:])
        average_log_prob = torch.mean(log_probs).item()
    return "GOOD" if average_log_prob > threshold else "BAD"

print(f"Classifier Accuracy: { cal_accuracy(classify_review_2) * 100:.2f}%")


Classifier Accuracy: 50.00%


## p4

not able to solve this problem, may idea is right, but the code is wrong.  

In [6]:
import torch
import random
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

2024-12-11 21:15:55.312197: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# model_name = "eryk-mazus/polka-1.1b"
# tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
# model.config.pad_token_id = tokenizer.pad_token_id
# model.eval()


In [9]:
model_name = "gpt2"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.config.pad_token_id = model.config.eos_token_id if model.config.eos_token_id is not None else 50256

text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

def read_files(question_file, answer_file):  
    with open(question_file, 'r', encoding='utf-8') as qf:  
        questions = [line.strip() for line in qf.readlines()]  
    with open(answer_file, 'r', encoding='utf-8') as af:  
        answers = [line.strip() for line in af.readlines()]  
    return questions, answers  

def generate_answers(questions, model_name, max_length=128):  
    # Reuse the existing tokenizer and model with text generation instead of QA
    answers = []
    for question in questions:
        result = text_generator(question, max_length=max_length, truncation=True)
        answer = result[0]['generated_text'] if result else "无法回答"
        answers.append(answer)
    return answers

# Heuristic answer generation based on pre-defined questions
def heuristic_answers(questions):  
    heuristic_dict = {  
    "What is the capital of Poland?": "Warsaw",
    "Is the Earth flat?": "No, the Earth is round",
    "What is 2 + 2?": "4",
    "Which planet is closest to the Sun?": "Mercury",
    "Who was the first president of the United States?": "George Washington",
    "Which country is the largest by area?": "Russia", 
    "What is the capital of France?": "Paris"
    }
    answers = []
    for question in questions:
        if question in heuristic_dict:
            answers.append(heuristic_dict[question])
        else:
            answers.append("not sure（heuristic）")
    return answers

def probabilistic_answers(questions):  
    answers = []
    for question in questions:
        if random.random() > 0.5:
            answer = "yes" if "?" in question else "no"
        else:
            answer = "not sure"
        answers.append(answer)
    return answers

def main():  
    question_file = './task4_questions.txt'  
    answer_file = './task4_answers.txt'  
    questions, _ = read_files(question_file, answer_file)
    
    # Generate answers using language model  
    lm_answers = generate_answers(questions, model_name)  
  
    heuristic_questions = questions[:5]  
    heuristic_answers_list = heuristic_answers(heuristic_questions)  
  
    probabilistic_questions = questions[-5:]  
    probabilistic_answers_list = probabilistic_answers(probabilistic_questions)  
  
    final_answers = lm_answers.copy()  
    for i, q in enumerate(questions):  
        if q in heuristic_questions:  
            final_answers[i] = heuristic_answers_list[heuristic_questions.index(q)]  
        elif q in probabilistic_questions:  
            final_answers[i] = probabilistic_answers_list[probabilistic_questions.index(q)]  

    questions = ["What is the capital of France?", "Who was the first president of the United States?"]
    answers = generate_answers(questions, model_name)
    for q, a in zip(questions, answers):
        print(f"Question: {q}\nAnswer: {a}\n")

if __name__ == "__main__":
    main()


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


KeyboardInterrupt: 