In [1]:
# This cell loads the Code_RAG_Bench dataset (crb)
import json
import numpy as np

# Loader configuration: how many retrieved contexts to keep per query, and how
# many records the input file is expected to contain.
top_k = 5
instances_num = 500
text_prompt_all = []  # list of instances_num code prompts
codes_ground_truth_all = []  # list of instances_num code strings
scores_raw_all = np.zeros((instances_num, top_k))  # one row of top_k scores per record
codes_retrieved_all = []  # list of instances_num lists, each list has top_k items
row = 0  # index of the next row of scores_raw_all to fill

# Decode explicitly as UTF-8 so loading does not depend on the platform's
# locale default (assumes the file was written as UTF-8/ASCII JSON).
with open("/home/ubuntu/fancy_retriever/output_ret/coderagbench_500.json", encoding="utf-8") as f:
    data = json.load(f)

# Fail up front if the file would overflow the preallocated score matrix;
# otherwise this only surfaces as an IndexError part-way through the load.
if len(data) > instances_num:
    raise ValueError(
        f"file holds {len(data)} records but instances_num is {instances_num}"
    )

for record in data:
    top_ctxs = record["ctxs"][:top_k]
    if len(top_ctxs) < top_k:
        raise ValueError(
            f"record {row} has only {len(top_ctxs)} retrieved contexts, expected at least {top_k}"
        )
    text_prompt_all.append(record["question"])
    codes_ground_truth_all.append(record["answers"])
    scores_raw_all[row] = [ctx["score"] for ctx in top_ctxs]
    codes_retrieved_all.append([ctx["text"] for ctx in top_ctxs])
    row += 1

# Record layout, as described by the original notes for these retrieval files:
# each item has a "question" (code prompt, str), "answers" (ground truth code, str), and a "ctxs" list;
# each "ctxs" entry is a dictionary with 5 keys:
# ['id' (path, str), 'title' (str), 'text' (retrieved code, str), 'score' (similarity, float), 'has_answer' (bool)]
# NOTE(review): the original note said "1000 items" and 200 contexts per item, which looks
# copied from the CodeSearchNet loader; this cell expects instances_num (500) records -- confirm.

# check for json file loading
# print(0 in scores_raw_all) # False
# for i in range(len(codes_retrieved_all)): assert len(codes_retrieved_all[i]) == top_k
assert row == len(text_prompt_all) == len(codes_ground_truth_all) == len(codes_retrieved_all) == instances_num, (
    f"loaded {row} records, expected {instances_num}"
)

In [1]:
# This cell loads the Code_Search_Net subset from the CodeXGLUE dataset (csn)
import json
import numpy as np

# Loader configuration: how many retrieved contexts to keep per query, and how
# many records the chunk files are expected to contain in total.
top_k = 5
instances_num = 23107
text_prompt_all = []  # list of instances_num code prompts
codes_ground_truth_all = []  # list of instances_num code strings
scores_raw_all = np.zeros((instances_num, top_k))  # one row of top_k scores per record
codes_retrieved_all = []  # list of instances_num lists, each list has top_k items
row = 0  # index of the next row of scores_raw_all to fill

# The retrieval output is split over chunk files numbered 1..24.
for file in range(1, 25):
    chunk_path = f"/home/ubuntu/fancy_retriever/output_ret/retrieval_chunk_{file}_top200_result.json"
    # Decode explicitly as UTF-8 so loading does not depend on the platform's
    # locale default (assumes the files were written as UTF-8/ASCII JSON).
    with open(chunk_path, encoding="utf-8") as f:
        data = json.load(f)

    # Fail with a clear message if this chunk would overflow the preallocated
    # score matrix; otherwise it only surfaces as an IndexError mid-load.
    if row + len(data) > instances_num:
        raise ValueError(
            f"{chunk_path} brings the total to {row + len(data)} records "
            f"but instances_num is {instances_num}"
        )

    for record in data:
        top_ctxs = record["ctxs"][:top_k]
        if len(top_ctxs) < top_k:
            raise ValueError(
                f"record {row} has only {len(top_ctxs)} retrieved contexts, expected at least {top_k}"
            )
        text_prompt_all.append(record["question"])
        codes_ground_truth_all.append(record["answers"])
        scores_raw_all[row] = [ctx["score"] for ctx in top_ctxs]
        codes_retrieved_all.append([ctx["text"] for ctx in top_ctxs])
        row += 1

# Record layout, as described by the original notes (counts not verified by this cell):
# each chunk's data has 1000 items,
# for each item "data[i]" i in [0, 999], there are associated "question" (code prompt, str), "answers" (ground truth code, str), and a "ctxs"
# each "ctxs" is a list of dictionaries "data[i]["ctxs"][j]" j in [0, 199], each dictionary has 5 keys:
# ['id' (path, str), 'title' (str), 'text' (retrieved code, str), 'score' (similarity, float), 'has_answer' (bool)]

# check for json file loading
# print(0 in scores_raw_all) # False
# for i in range(len(codes_retrieved_all)): assert len(codes_retrieved_all[i]) == top_k
assert row == len(text_prompt_all) == len(codes_ground_truth_all) == len(codes_retrieved_all) == instances_num, (
    f"loaded {row} records, expected {instances_num}"
)

In [None]:
# SCODE_G code generation re-implementation
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# Settings used by both the one-time model setup and the generation loop.
# They are assigned on every run (outside the load guard below) so the loop
# never depends on names left behind by an earlier execution of this cell.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
special_token = "<|retrieved|>"  # separator placed between retrieved code examples
# NOTE(review): this limit is not applied to the pipeline call below -- the
# prompt is passed as a raw, untruncated string (as it was before; the old
# per-iteration tokenization result was never used). Kept for reference.
max_context_length = 4096
max_new_tokens = 256  # Set a reasonable generation limit
# NOTE(review): the file name says "csn" regardless of which loader cell was run -- confirm.
output_path = "/home/ubuntu/fancy_retriever/Generation/csn_Llama31_8B_Instruct.json"

# Load tokenizer and model only once per kernel; re-running the cell reuses them.
if 'tokenizer' not in globals() or 'model' not in globals():
    hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN") # or insert your own huggingface access token here
    if hf_token is None:
        raise ValueError("Please set your HUGGINGFACE_HUB_TOKEN environment variable.")

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        device_map="auto",
        torch_dtype=torch.float16
    )

# Register the separator token if the tokenizer does not have it yet. The
# membership check makes this step safe to repeat on every run.
if special_token not in tokenizer.additional_special_tokens:
    tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
    # Grow the embedding matrix to cover the newly added token.
    model.resize_token_embeddings(len(tokenizer))

# Build the generation pipeline once; it does not change between prompts.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Prompt pieces that are identical for every instance.
delimiter = f"\n{special_token}\n"
prompt_suffix = f"""\n{special_token}
    #Requirements:
    1. Please do not repeat the above code snippets
    2. Please directly generated the code without any explanation or documnetation
    3. Please do not add any comments
    4. Please do not add any additional text
    5. Please do not import any libraries
    Here is your turn.\n
    """

# Now iterate over your data
results = []

for i in tqdm(range(instances_num)):
    text_prompt = text_prompt_all[i]       # Your i-th natural language prompt (str)
    retrieved_examples = codes_retrieved_all[i]   # The i-th list of code strings

    # Instruction line, then all retrieved examples joined by the delimiter,
    # then the fixed requirements block.
    ori_prompt = (
        f"Write a python function that {text_prompt} based on the following examples:\n"
        + delimiter.join(retrieved_examples)
        + prompt_suffix
    )
    # Open a code fence so the model continues with code.
    augmented_prompt = ori_prompt + "```python \n"

    output = generator(
        augmented_prompt,
        max_new_tokens=max_new_tokens,
        temperature=0.1,             # Lower temperature for less randomness
        do_sample=True,
        num_return_sequences=1,    # Generate one sequence
    )

    full_text = output[0]["generated_text"]
    # Strip the echoed prompt, keeping the opening code fence plus the model's
    # continuation. If the output unexpectedly does not start with the prompt,
    # keep the full text instead of aborting the whole run with an IndexError.
    if full_text.startswith(ori_prompt):
        generated_text = full_text[len(ori_prompt):]
    else:
        generated_text = full_text
    # print("Generated Text:\n", generated_text)
    results.append({
        "text_prompt": text_prompt,
        "generated_text": generated_text
    })

# Save all prompt/generation pairs in a JSON file.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)