In [None]:
# This cell loads the dataset
import json
import numpy as np

# Number of retrieved contexts kept per instance, and the number of instances
# the retrieval dump is expected to contain.
top_k = 40
instances_num = 1000

# Parallel containers; position n in each one describes instance n.
text_prompt_all = []         # instances_num code prompts
codes_ground_truth_all = []  # instances_num ground-truth code strings
codes_retrieved_all = []     # instances_num lists, each holding top_k retrieved code strings
scores_raw_all = np.zeros((instances_num, top_k))  # one row of top_k similarity scores per instance

with open("/home/ubuntu/fancy_retriever/output_ret/train1000_codexglue.json") as f:
    data = json.load(f)

# Layout of the dump (author's notes):
# data has 1000 items,
# for each item "data[i]" i in [0, 999], there are associated "question" (code prompt, str), "answers" (ground truth code, str), and a "ctxs"
# each "ctxs" is a list of dictionaries "data[i]["ctxs"][j]" j in [0, 199], each dictionary has 5 keys:
# ['id' (path, str), 'title' (str), 'text' (retrieved code, str), 'score' (similarity, float), 'has_answer' (bool)]
row = 0
for i in range(len(data)):
    entry = data[i]
    text_prompt_all.append(entry["question"])
    codes_ground_truth_all.append(entry["answers"])
    # Only the first top_k contexts are kept; indexing (rather than slicing)
    # raises if an entry has fewer than top_k contexts.
    top_ctxs = [entry["ctxs"][j] for j in range(top_k)]
    scores_raw_all[row] = np.array([ctx["score"] for ctx in top_ctxs])
    codes_retrieved_all.append([ctx["text"] for ctx in top_ctxs])
    row += 1

# check for json file loading
# print(0 in scores_raw_all) # False
# for i in range(len(codes_retrieved_all)): assert len(codes_retrieved_all[i]) == top_k
loaded_counts = (
    row,
    len(text_prompt_all),
    len(codes_ground_truth_all),
    len(codes_retrieved_all),
)
assert all(count == instances_num for count in loaded_counts)

In [None]:
# text prompt classification into easy, medium, hard
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# Load the tokenizer and model only once per kernel session: re-running this
# cell reuses the objects already present in the notebook's global namespace.
if 'tokenizer' not in globals() or 'model' not in globals():
    print("initializing tokenizer and model...")
    # model_name = "meta-llama/Llama-3.1-8B-Instruct" # Llama
    # model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" # DSR1qwen
    model_name = "Qwen/Qwen2.5-7B-Instruct-1M" # Qwen1M: top5; Qwen1Mlong: top20
    # model_name = "ministral/Ministral-3b-instruct" # mst3b
    hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN") # or insert your own huggingface access token here
    if hf_token is None:
        raise ValueError("Please set your HUGGINGFACE_HUB_TOKEN environment variable.")

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        device_map="auto",
        torch_dtype=torch.float16
    )

# Only the first `instances_num` prompts loaded by the dataset cell are classified.
instances_num = 200

# Retrieval depth associated with each difficulty label. The labels are probed
# in this order, so a reply mentioning several labels resolves to the first one.
difficulty_to_top_k = {"easy": 5, "medium": 20, "hard": 40}
# Used when the reply contains none of the labels. Falling back to the deepest
# retrieval is a judgment call (no retrieved example is withheld downstream);
# change it here if another default is preferred.
fallback_top_k = 40

# Fixed part of the prompt (few-shot examples + answer format). It does not
# depend on the instance, so it is assembled once outside the loop.
# NOTE(review): the stated percentages add up to 105% and the last sentence
# contains the typo "proble". The text is left untouched so the prompt stays
# identical to the one used for earlier runs; confirm before editing it.
prompt_suffix = (
    "When you are doing classification, you should consider that 60% of problems are easy, 35% of problems are medium, and 10% of problems are hard.\n"
    "You may refer to the following examples and their classification levels:\n"
    "easy problem: Determine color-category mapping. If color_column was specified, then map the category names to color values. Otherwise, use the palettable colors to automatically generate a set of colors for the group values.\n"
    "easy problem: Return a df with predictions and confidence interval.\n"
    "medium problem: convert fasta to phylip because RAxML is ridiculous.\n"
    "medium problem: scale table based on the column with the largest sum.\n"
    "hard problem: log processors that structlog executes before final rendering.\n"
    "easy problem: Call method to perform any setup.\n"
    "easy problem: get top hits after sorting by column number.\n"
    "medium problem: the final log processor that structlog requires to render.\n"
    "medium problem: execute jobs in processes using N threads.\n"
    "Your answer should only be the classification made for the given problem. For the given proble, you should only generate one word, which is either 'easy', or 'medium', or 'hard'.\n"
)

# The pipeline wrapper is loop-invariant; build it once instead of per instance.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling is enabled below, so seed torch to make reruns repeatable
# (up to hardware-level nondeterminism).
torch.manual_seed(0)

results = []
for i in tqdm(range(instances_num)):
    text_prompt = text_prompt_all[i]
    augmented_prompt = (
        f"Given the problem '{text_prompt}', please classify this given problem into easy, medium, or hard problem.\n"
        + prompt_suffix
    )

    output = generator(
        augmented_prompt,
        max_new_tokens=16,         # the answer is expected to be a single word
        temperature=0.1,           # lower temperature for less randomness
        do_sample=True,
        num_return_sequences=1,    # generate one sequence
        return_full_text=False,    # return only the completion, not prompt + completion
    )
    generated_text = output[0]["generated_text"]

    # Case-insensitive label lookup, so replies such as "Easy" are recognised.
    answer = generated_text.lower()
    dynamic_k = next(
        (k for label, k in difficulty_to_top_k.items() if label in answer),
        None,
    )
    if dynamic_k is None:
        # Without this branch the first unlabeled reply raised NameError and
        # later ones silently inherited the previous instance's value.
        dynamic_k = fallback_top_k
        tqdm.write(
            f"instance {i}: no difficulty label found in {generated_text!r}; "
            f"falling back to top_k={fallback_top_k}"
        )

    results.append({
        "text_prompt": text_prompt,
        "classification": generated_text,
        "top_k": dynamic_k
    })

# Save the classifications of the first `instances_num` prompts in a JSON file.
with open("/home/ubuntu/fancy_retriever/Generation/classification_Qwen1M.json", "w") as f:
    json.dump(results, f, indent=2)

In [None]:
# read in the classification json file
import json
import numpy as np
import os

def expected_top_k(converge_level):
    """Map a ground-truth level from converge.npy to the top_k it should get.

    Levels <= 1 map to 5, levels in [2, 3] map to 20 and level 4 maps to 40.
    Any other value has no expected top_k and yields None, so it can never be
    counted as a correct prediction.
    """
    if converge_level <= 1:
        return 5
    if 2 <= converge_level <= 3:
        return 20
    if converge_level == 4:
        return 40
    return None


# presumably a 1-D array with one level per instance, in the same order as the
# classification file; verify against the code that produced converge.npy
ground_truth_k = np.load("/home/ubuntu/fancy_retriever/Generation/converge.npy")

# NOTE(review): this reads "classification_Qwen1M_new.json", whereas the
# classification cell of this notebook writes "classification_Qwen1M.json";
# confirm this is the intended file.
with open("/home/ubuntu/fancy_retriever/Generation/classification_Qwen1M_new.json") as f:
    data = json.load(f)

if len(ground_truth_k) < len(data):
    raise ValueError(
        f"converge.npy has {len(ground_truth_k)} entries but the classification "
        f"file has {len(data)}"
    )

# Count the predictions whose top_k matches the bucket of the ground truth.
acc = 0
for i, item in enumerate(data):
    target_k = expected_top_k(ground_truth_k[i])
    if target_k is not None and item["top_k"] == target_k:
        acc += 1

# Normalise by the number of records actually scored. The global
# `instances_num` is reassigned by several cells, so using it as the
# denominator made the result depend on which cell had been run last.
accuracy = acc / len(data) if data else 0.0
print("lite LLM text prompt classification accuracy: ", accuracy)

In [None]:
# SCODE_G code generation re-implementation augmented with dynamic top_k retrieved examples
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# Determine whether to use a dynamic amount of retrieved code instances
Dynamic = True

# Only the first `instances_num` prompts loaded by the dataset cell are used.
instances_num = 200

if Dynamic:
    # Load the per-instance top_k values produced by the classification cell.
    with open("/home/ubuntu/fancy_retriever/Generation/classification_Qwen1M.json") as f:
        data = json.load(f)
    top_k_all = [item["top_k"] for item in data]
    # Fail before any generation happens rather than with an IndexError
    # part-way through the loop.
    if len(top_k_all) < instances_num:
        raise ValueError(
            f"classification file holds {len(top_k_all)} entries, "
            f"but {instances_num} instances are requested"
        )

# Load the tokenizer and model only once per kernel session: re-running this
# cell (or running it after another cell that loaded them) reuses the objects
# already present in the notebook's global namespace.
if 'tokenizer' not in globals() or 'model' not in globals():
    # model_name = "meta-llama/Llama-3.1-8B-Instruct" # Llama
    # model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" # DSR1qwen
    model_name = "Qwen/Qwen2.5-7B-Instruct-1M" # Qwen1M: top5; Qwen1Mlong: top20
    hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN") # or insert your own huggingface access token here
    if hf_token is None:
        raise ValueError("Please set your HUGGINGFACE_HUB_TOKEN environment variable.")

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        device_map="auto",
        torch_dtype=torch.float16
    )

# The separator token must be set up OUTSIDE the loading guard: when the model
# was already loaded by an earlier cell the guard is skipped, which previously
# left `special_token` undefined (NameError in the loop) and unregistered.
special_token = "<|retrieved|>"
if special_token not in tokenizer.additional_special_tokens:
    tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
    # Size the embedding matrix from the tokenizer's actual length rather than
    # assuming exactly one token was added.
    model.resize_token_embeddings(len(tokenizer))

# Loop-invariant prompt pieces.
delimiter = f"\n{special_token}\n"
# NOTE(review): the continuation lines carry four leading spaces and the text
# contains typos ("generated", "documnetation"). It is kept byte-identical so
# the prompt matches the one used for earlier runs; confirm before editing it.
requirements_block = f"""\n{special_token}
    #Requirements:
    1. Please do not repeat the above code snippets
    2. Please directly generated the code without any explanation or documnetation
    3. Please do not add any comments
    4. Please do not add any additional text
    5. Please do not import any libraries
    Here is your turn.\n
    """
# Opening code fence the model is asked to continue; it is also kept at the
# start of every saved generation.
code_fence = "```python \n"

# The pipeline wrapper is loop-invariant; build it once instead of per instance.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling is enabled below, so seed torch to make reruns repeatable
# (up to hardware-level nondeterminism).
torch.manual_seed(0)

# NOTE(review): prompts are not truncated. The former `max_context_length` /
# `allowed_new_tokens` bookkeeping was computed but never passed to the
# generator, so it has been dropped; generation is capped by max_new_tokens only.
results = []
for i in tqdm(range(instances_num)):
    text_prompt = text_prompt_all[i]              # i-th natural language prompt (str)
    retrieved_examples = codes_retrieved_all[i]   # i-th list of retrieved code strings
    if Dynamic:
        # keep only the number of examples chosen for this instance
        dynamic_k = top_k_all[i]
        retrieved_examples = retrieved_examples[:dynamic_k]

    augmented_prompt = (
        f"Write a python function that {text_prompt} based on the following examples:\n"
        + delimiter.join(retrieved_examples)
        + requirements_block
        + code_fence
    )

    output = generator(
        augmented_prompt,
        max_new_tokens=256,        # generation limit
        temperature=0.1,           # lower temperature for less randomness
        do_sample=True,
        num_return_sequences=1,    # generate one sequence
        return_full_text=False,    # return only the completion, not prompt + completion
    )
    # Same saved format as before: the opening fence followed by the completion.
    generated_text = code_fence + output[0]["generated_text"]

    results.append({
        "text_prompt": text_prompt,
        "ground_truth_code": codes_ground_truth_all[i],
        "generated_code": generated_text
    })

# Save the first `instances_num` prompt/ground-truth/generation triples in a JSON file.
with open("/home/ubuntu/fancy_retriever/Generation/cxg200_Qwen1M_dynamic.json", "w") as f:
    json.dump(results, f, indent=2)

In [None]:
# SCODE_G code generation re-implementation augmented with top_k retrieved examples
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# Determine whether to use a dynamic amount of retrieved code instances
Dynamic = False

# Only the first `instances_num` prompts loaded by the dataset cell are used.
instances_num = 200

if Dynamic:
    # Load the per-instance top_k values produced by the classification cell.
    with open("/home/ubuntu/fancy_retriever/Generation/classification_Qwen1M.json") as f:
        data = json.load(f)
    top_k_all = [item["top_k"] for item in data]
    # Fail before any generation happens rather than with an IndexError
    # part-way through the loop.
    if len(top_k_all) < instances_num:
        raise ValueError(
            f"classification file holds {len(top_k_all)} entries, "
            f"but {instances_num} instances are requested"
        )

# Load the tokenizer and model only once per kernel session: re-running this
# cell (or running it after another cell that loaded them) reuses the objects
# already present in the notebook's global namespace.
if 'tokenizer' not in globals() or 'model' not in globals():
    # model_name = "meta-llama/Llama-3.1-8B-Instruct" # Llama
    # model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" # DSR1qwen
    model_name = "Qwen/Qwen2.5-7B-Instruct-1M" # Qwen1M: top5; Qwen1Mlong: top20
    hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN") # or insert your own huggingface access token here
    if hf_token is None:
        raise ValueError("Please set your HUGGINGFACE_HUB_TOKEN environment variable.")

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        device_map="auto",
        torch_dtype=torch.float16
    )

# The separator token must be set up OUTSIDE the loading guard: when the model
# was already loaded by an earlier cell the guard is skipped, which previously
# left `special_token` undefined (NameError in the loop) and unregistered.
special_token = "<|retrieved|>"
if special_token not in tokenizer.additional_special_tokens:
    tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
    # Size the embedding matrix from the tokenizer's actual length rather than
    # assuming exactly one token was added.
    model.resize_token_embeddings(len(tokenizer))

# Loop-invariant prompt pieces.
delimiter = f"\n{special_token}\n"
# NOTE(review): the continuation lines carry four leading spaces and the text
# contains typos ("generated", "documnetation"). It is kept byte-identical so
# the prompt matches the one used for earlier runs; confirm before editing it.
requirements_block = f"""\n{special_token}
    #Requirements:
    1. Please do not repeat the above code snippets
    2. Please directly generated the code without any explanation or documnetation
    3. Please do not add any comments
    4. Please do not add any additional text
    5. Please do not import any libraries
    Here is your turn.\n
    """
# Opening code fence the model is asked to continue; it is also kept at the
# start of every saved generation.
code_fence = "```python \n"

# The pipeline wrapper is loop-invariant; build it once instead of per instance.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling is enabled below, so seed torch to make reruns repeatable
# (up to hardware-level nondeterminism).
torch.manual_seed(0)

# NOTE(review): prompts are not truncated. The former `max_context_length` /
# `allowed_new_tokens` bookkeeping was computed but never passed to the
# generator, so it has been dropped; generation is capped by max_new_tokens only.
results = []
for i in tqdm(range(instances_num)):
    text_prompt = text_prompt_all[i]              # i-th natural language prompt (str)
    retrieved_examples = codes_retrieved_all[i]   # i-th list of retrieved code strings
    if Dynamic:
        # keep only the number of examples chosen for this instance
        dynamic_k = top_k_all[i]
        retrieved_examples = retrieved_examples[:dynamic_k]

    augmented_prompt = (
        f"Write a python function that {text_prompt} based on the following examples:\n"
        + delimiter.join(retrieved_examples)
        + requirements_block
        + code_fence
    )

    output = generator(
        augmented_prompt,
        max_new_tokens=256,        # generation limit
        temperature=0.1,           # lower temperature for less randomness
        do_sample=True,
        num_return_sequences=1,    # generate one sequence
        return_full_text=False,    # return only the completion, not prompt + completion
    )
    # Same saved format as before: the opening fence followed by the completion.
    generated_text = code_fence + output[0]["generated_text"]

    results.append({
        "text_prompt": text_prompt,
        "ground_truth_code": codes_ground_truth_all[i],
        "generated_code": generated_text
    })

# Save the first `instances_num` prompt/ground-truth/generation triples in a JSON file.
with open("/home/ubuntu/fancy_retriever/Generation/cxg200_Qwen1M_top40.json", "w") as f:
    json.dump(results, f, indent=2)

In [None]:
# SCODE_G code generation re-implementation augmented with top_0 retrieved examples
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# Load the tokenizer and model only once per kernel session: re-running this
# cell (or running it after another cell that loaded them) reuses the objects
# already present in the notebook's global namespace.
if 'tokenizer' not in globals() or 'model' not in globals():
    # model_name = "meta-llama/Llama-3.1-8B-Instruct" # Llama
    # model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" # DSR1qwen
    model_name = "Qwen/Qwen2.5-7B-Instruct-1M" # Qwen1M: top5; Qwen1Mlong: top20
    hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN") # or insert your own huggingface access token here
    if hf_token is None:
        raise ValueError("Please set your HUGGINGFACE_HUB_TOKEN environment variable.")

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        device_map="auto",
        torch_dtype=torch.float16
    )

# The separator token is not part of this cell's prompt. It is still
# registered, and registered OUTSIDE the loading guard, so the tokenizer/model
# state is the same as in the retrieval-augmented generation cells no matter
# which cell loaded the model first.
special_token = "<|retrieved|>"
if special_token not in tokenizer.additional_special_tokens:
    tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
    # Size the embedding matrix from the tokenizer's actual length rather than
    # assuming exactly one token was added.
    model.resize_token_embeddings(len(tokenizer))

# Only the first `instances_num` prompts loaded by the dataset cell are used.
instances_num = 200

# Opening code fence the model is asked to continue; it is also kept at the
# start of every saved generation.
code_fence = "```python \n"

# The pipeline wrapper is loop-invariant; build it once instead of per instance.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling is enabled below, so seed torch to make reruns repeatable
# (up to hardware-level nondeterminism).
torch.manual_seed(0)

# Baseline without retrieval: the prompt is just the task description. The
# unused retrieved-example / delimiter variables (the latter raised NameError
# whenever the loading guard was skipped), the disabled prompt kept as a bare
# string literal, and the never-used context-length bookkeeping were removed.
results = []
for i in tqdm(range(instances_num)):
    text_prompt = text_prompt_all[i]   # i-th natural language prompt (str)

    augmented_prompt = f"Write a python function that {text_prompt}\n" + code_fence

    output = generator(
        augmented_prompt,
        max_new_tokens=256,        # generation limit
        temperature=0.1,           # lower temperature for less randomness
        do_sample=True,
        num_return_sequences=1,    # generate one sequence
        return_full_text=False,    # return only the completion, not prompt + completion
    )
    # Same saved format as before: the opening fence followed by the completion.
    generated_text = code_fence + output[0]["generated_text"]

    results.append({
        "text_prompt": text_prompt,
        "ground_truth_code": codes_ground_truth_all[i],
        "generated_code": generated_text
    })

# Save the first `instances_num` prompt/ground-truth/generation triples in a JSON file.
with open("/home/ubuntu/fancy_retriever/Generation/cxg200_Qwen1M_top0.json", "w") as f:
    json.dump(results, f, indent=2)