In [1]:
import pandas as pd
import requests
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import os
import ast

In [2]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

max_seq_length = 2048
lora_rank       = 8

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name            = 'sft_saved_lora_3B_sft',
    max_seq_length        = max_seq_length,
    load_in_4bit          = True,    # 4‑bit base + LoRA
    fast_inference        = True,
    max_lora_rank         = lora_rank,
    gpu_memory_utilization= 0.9,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-22 00:45:24 __init__.py:190] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.7.2.
   \\   /|    Tesla V100-PCIE-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU does not support prefix caching - will disable!
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 89.03%
Unsloth: Your GPU has CUDA compute capability 7.0 with VRAM = 31.73 GB.
Unsl



INFO 04-22 00:45:41 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-22 00:45:43 model_runner.py:1115] Loading model weights took 2.2405 GB
INFO 04-22 00:45:43 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-22 00:45:45 worker.py:267] Memory profiling takes 2.00 seconds
INFO 04-22 00:45:45 worker.py:267] the current vLLM instance can use total_gpu_memory (31.73GiB) x gpu_memory_utilization (0.89) = 28.25GiB
INFO 04-22 00:45:45 worker.py:267] model weights take 2.24GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 1.49GiB; the rest of the memory reserved for KV Cache is 24.45GiB.
INFO 04-22 00:45:45 executor_base.py:110] # CUDA blocks: 14309, # CPU blocks: 3510
INFO 04-22 00:45:45 executor_base.py:115] Maximum concurrency for 2048 tokens per request: 111.79x
INFO 04-22 00:45:49 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:31<00:00,  1.36it/s]

INFO 04-22 00:46:21 model_runner.py:1562] Graph capturing finished in 32 secs, took 0.76 GiB
INFO 04-22 00:46:21 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 37.81 seconds



Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [3]:
# PROMPTING FUNCTION
def llm_output(words):
  prompt = '''You are playing the NY Times Connections game. Your task is to categorize 16 given words into exactly 4 groups of 4 words each, based on shared common themes.

Solve the puzzle using these clear steps:

1. THINK STEP-BY-STEP: Begin by carefully analyzing the words within <thinking> tags. Identify their meanings, relationships, and possible groupings logically.
Example:
<thinking>To solve this, I will first look for obvious groupings like NBA teams, palindrome words, weather-related words, or computer keys. Then, I'll group these words accordingly and ensure each word belongs to exactly one group.</thinking>

2. PROVIDE FINAL ANSWER: After clearly grouping and justifying all four sets, provide ONLY your final solution within <answer> tags. Format your solution exactly as shown below.

Example:
<answer>
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]
</answer>

Important Notes:
- Categories should be specific
- Words cannot appear in more than one group.
- Categories can include compound words, shared prefixes/suffixes, pop culture references, or common phrases.
- DO NOT ADD NEW WORDS THAT ARE NOT MENTIONED IN THE QUESTION. USE ONLY WORDS MENTIONED AND GROUP THEM

Here is an example:

USER: [BUCKS, HAIL, JAZZ, SHIFT, LEVEL, MOM, SNOW, RACECAR, SLEET, TAB, KAYAK, RETURN, OPTION, NETS, RAIN, HEAT]

<answer>
[['HAIL', 'RAIN', 'SLEET', 'SNOW'],
 ['BUCKS', 'HEAT', 'JAZZ', 'NETS'],
 ['OPTION', 'RETURN', 'SHIFT', 'TAB'],
 ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]
</answer>

Explanation:
- WEATHER TERMS: 'HAIL', 'RAIN', 'SLEET', 'SNOW'
- NBA TEAMS: 'BUCKS', 'HEAT', 'JAZZ', 'NETS'
- KEYBOARD KEYS: 'OPTION', 'RETURN', 'SHIFT', 'TAB'
- PALINDROMES: 'KAYAK', 'LEVEL', 'MOM', 'RACECAR'
Give the answer for the following set of words -''' + str(words)

  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

  output_ids = model.generate(**inputs, max_length=1024, temperature=0.7, do_sample=True)

  response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

  # print("\nResponse:", response)
  return response

In [4]:
# EVALUATION FUNCTION
def evaluation_scores(actual_clusters, predicted_clusters):
  if len(actual_clusters) == 0 or len(predicted_clusters) == 0:
    return 0
  score_matrix = np.zeros((len(actual_clusters), len(predicted_clusters)))
  jacc_similarity_score = 0

  # Evauating jaccard matching between each pair of sets
  for actual_idx in range(4):
    for predicted_idx in range(4):
      actual_group = actual_clusters[actual_idx]
      predicted_group = predicted_clusters[predicted_idx]
      score_matrix[actual_idx, predicted_idx] = len(set(actual_group) & set(predicted_group)) / len(set(actual_group) | set(predicted_group))

  # Success Rate
  succ_rate = np.sum(np.array(score_matrix) == 1.0)

  total_words = sum(len(cluster) for cluster in predicted_clusters)  # Should be 16 in this case
  max_overlaps = 0
  for pred_cluster in predicted_clusters:
      overlap_counts = Counter()
      for true_cluster in actual_clusters:
          overlap_counts[tuple(true_cluster)] = len(set(pred_cluster) & set(true_cluster))

      max_overlaps += max(overlap_counts.values())  # Take the largest overlap for each predicted cluster
  # print("SUCCESS RATE: ", succ_rate)
  return succ_rate , max_overlaps / total_words

def evaluation(actual_clusters_set, predicted_clusters_set):
  succ_rate = 0
  purity = 0
  num_samples = 0
  for actual_clusters, predicted_clusters in zip(actual_clusters_set, predicted_clusters_set):
    num_samples += 1
    if len(predicted_clusters) >0:
    # print("ACTUAL CLUSTER:",actual_clusters, " \nPREDICTED CLUSTER: ", predicted_clusters, "\n")
    val1, val2 = evaluation_scores(actual_clusters, predicted_clusters)
    succ_rate += val1
    purity += val2
  print(num_samples)
  return succ_rate/num_samples, purity/num_samples

IndentationError: expected an indented block after 'if' statement on line 35 (812227591.py, line 37)

In [None]:
file_path = "/home1/nageshs/ConnectionsPuzzle/data/connections.csv"
df = pd.read_csv(file_path)

train_df, test_df = train_test_split(df, test_size=0.1, random_state=3407)
df = test_df

df.head()

In [None]:
df["parsed_question"] = df["question"].apply(lambda x: [w.strip() for w in x.split(",")])
df["parsed_answers"] = df["answers"].apply(ast.literal_eval)

In [None]:
from tqdm import tqdm
import ast
import re

def extract_last_answer(response):
    try:
        all_answers = re.findall(r"<answer>(.*?)</answer>", response, re.DOTALL)
        if not all_answers:
            return []
        return ast.literal_eval(all_answers[-1].strip())
    except Exception:
        return []

predicted_clusters = []
actual_clusters = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating"):
    response = llm_output(row["parsed_question"])
    predicted = extract_last_answer(response)

    predicted_clusters.append(predicted)
    actual_clusters.append(row["parsed_answers"])

    print("WORDS:", row['parsed_question'])
    print("ACTUAL:", row['parsed_answers'])
    print("PREDICTED:", predicted)

succ_rate, purity = evaluation(actual_clusters, predicted_clusters)
print("Success Rate:", succ_rate)
print("Purity:", purity)
