In [1]:
from langchain_community.document_loaders import JSONLoader
from tqdm.auto import tqdm
import json
import pickle
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

In [2]:
# Retrieval corpus diffs: the jq expression selects the `diff` field of every
# element of the top-level JSON array.
diff_loader = JSONLoader(
    file_path='../data/chronicle/chronicle_rag_db.json',
    jq_schema='.[].diff',
    text_content=False,
)
diff_data = diff_loader.load()

# Companion data looked up by `seq_num` further down (presumably the commit
# messages belonging to the corpus diffs -- confirm against the pickle's producer).
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("../data/chronicle/rag_msg.pkl", "rb") as msg_file:
    msg_data = pickle.load(msg_file)

In [3]:
# One language-aware splitter per supported language. The position of a
# splitter in `splitters` is the index stored for that language in
# `language_dict` (Python -> 0, JavaScript -> 1).
#
# Earlier five-language configuration, kept for reference:
# languages = [Language.JAVA, Language.CPP, Language.CSHARP, Language.PYTHON, Language.JS]
# language_dict = {'java': 0, 'c++': 1, 'c#': 2, 'python': 3, 'javascript': 4}
languages = [Language.PYTHON, Language.JS]

splitters = [
    RecursiveCharacterTextSplitter.from_language(
        lang,
        chunk_size=500,
        chunk_overlap=50,
    )
    for lang in languages
]

language_dict = {
    'Python': 0,
    'JavaScript': 1,
}

In [4]:
# Identifier of the pre-trained embedding model.
modelPath = "mixedbread-ai/mxbai-embed-large-v1"

# Model options: run on the GPU ('cuda') and enable trust_remote_code.
model_kwargs = dict(device='cuda', trust_remote_code=True)

# Encoding options: embeddings are normalized (normalize_embeddings is True).
encode_kwargs = dict(normalize_embeddings=True)

# Embedding function shared by every vector-store query in this notebook;
# model files are cached under ../models.
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    cache_folder='../models',
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [5]:
def similarity_search_top_4(documents, vector_store=None, top_k=4):
    """Rank corpus entries by their summed relevance over all chunks of a query.

    Every chunk in ``documents`` is searched against the vector store. The
    relevance scores of the hits are added up per ``seq_num`` (read from each
    hit's metadata) and the ids with the largest totals are returned.

    Args:
        documents: Iterable of objects exposing ``page_content`` (the chunks
            of one query diff).
        vector_store: Object providing
            ``similarity_search_with_relevance_scores(query, score_threshold=...)``
            and returning ``(document, score)`` pairs. When omitted, the
            notebook-level ``db`` is used; it is looked up at call time, so it
            must have been created before the call.
        top_k: Maximum number of ids to return. Defaults to 4.

    Returns:
        List of ``seq_num`` values ordered from highest to lowest aggregate
        score. It holds fewer than ``top_k`` entries when fewer distinct
        candidates were found, and is empty when ``documents`` is empty.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # Fall back to the notebook-level store so existing one-argument calls keep working.
    store = db if vector_store is None else vector_store

    # seq_num -> sum of relevance scores over all chunks that retrieved it.
    aggregate_scores = {}
    for document in documents:
        hits = store.similarity_search_with_relevance_scores(
            document.page_content, score_threshold=0.0
        )
        for candidate_doc, score in hits:
            seq_num = candidate_doc.metadata['seq_num']
            aggregate_scores[seq_num] = aggregate_scores.get(seq_num, 0) + score

    # sorted() is stable even with reverse=True, so tied candidates keep
    # first-seen order.
    ranked = sorted(aggregate_scores.items(), key=lambda pair: pair[1], reverse=True)

    return [seq_num for seq_num, _ in ranked[:top_k]]

In [8]:
# Query set for the scenario run: one document per `.diff` entry of the
# top-level JSON array.
# Alternative input used previously:
#   file_path='../data/chronicle/rag_baseline/rag_baseline_python.json'
test_diff_loader = JSONLoader(
    file_path='../data/scenario/scenario_react_chatgpt_zeroshot.json',
    jq_schema='.[].diff',
    text_content=False,
)
test_diff_data = test_diff_loader.load()

# The same file again, this time parsed as plain JSON records.
with open(
    '../data/scenario/scenario_react_chatgpt_zeroshot.json',
    'r',
    encoding='UTF-8',
) as scenario_file:
    test_data = json.load(scenario_file)

In [9]:
# Open the persisted Chroma index queried by similarity_search_top_4.
db = Chroma(
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_js",
    embedding_function=embeddings,
)

# Only the first 100 query diffs are processed in this run.
scenario_docs = test_diff_data[:100]

# For every query diff: split it with the JavaScript splitter (index 1),
# retrieve the best-scoring candidate ids and map them to corpus documents.
# NOTE(review): `candidate_id - 1` assumes seq_num is 1-based and aligned with
# the order of diff_data; an id of 0 would silently select the last entry.
# A candidate id of -1 is treated as "no match" and stored as None.
similar_diff = []
for query_doc in tqdm(scenario_docs, total=len(scenario_docs), desc="Processing documents"):
    chunks = splitters[1].split_documents([query_doc])
    similar_diff.append([
        None if candidate_id == -1 else diff_data[candidate_id - 1]
        for candidate_id in similarity_search_top_4(chunks)
    ])

# Build one output record per query diff: the original diff followed by each
# retrieved diff and its message, keyed by rank.
# NOTE(review): msg_data is indexed with the raw seq_num, while diff_data uses
# seq_num - 1 -- confirm msg_data is keyed by seq_num rather than by position.
data = []
for neighbours, test_diff in zip(similar_diff, scenario_docs):
    item = {'org_diff': test_diff.page_content}
    for index, can_diff in enumerate(neighbours):
        if can_diff is None:
            item[f'sim_diff_{index}'] = None
            item[f'sim_msg_{index}'] = None
            continue
        item[f'sim_diff_{index}'] = can_diff.page_content
        item[f'sim_msg_{index}'] = msg_data[can_diff.metadata['seq_num']]
    data.append(item)

# Persist the records as pretty-printed UTF-8 JSON.
with open(
    '../data/scenario/scenario_react_chatgpt_rag_top4_prompt.json',
    'w',
    encoding='UTF-8',
) as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)

In [6]:

# Query set for the JavaScript baseline: one document per `.org_diff` entry of
# the top-level JSON array.
test_diff_loader = JSONLoader(
    file_path='../data/chronicle/rag_baseline/rag_baseline_js.json',
    jq_schema='.[].org_diff',
    text_content=False,
)
test_diff_data = test_diff_loader.load()

# The same file again, this time parsed as plain JSON records.
with open(
    '../data/chronicle/rag_baseline/rag_baseline_js.json',
    'r',
    encoding='UTF-8',
) as baseline_file:
    test_data = json.load(baseline_file)

In [7]:
# Open the persisted Chroma index queried by similarity_search_top_4.
db = Chroma(
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_js",
    embedding_function=embeddings,
)

# For every query diff: split it with the JavaScript splitter (index 1),
# retrieve the best-scoring candidate ids and map them to corpus documents.
# NOTE(review): `candidate_id - 1` assumes seq_num is 1-based and aligned with
# the order of diff_data; an id of 0 would silently select the last entry.
# A candidate id of -1 is treated as "no match" and stored as None.
similar_diff = []
for query_doc in tqdm(test_diff_data, total=len(test_diff_data), desc="Processing documents"):
    chunks = splitters[1].split_documents([query_doc])
    matches = []
    for candidate_id in similarity_search_top_4(chunks):
        matches.append(None if candidate_id == -1 else diff_data[candidate_id - 1])
    similar_diff.append(matches)

# Build one output record per query diff: the original diff followed by each
# retrieved diff and its message, keyed by rank.
# NOTE(review): msg_data is indexed with the raw seq_num, while diff_data uses
# seq_num - 1 -- confirm msg_data is keyed by seq_num rather than by position.
# NOTE(review): this cell duplicates the scenario cell apart from the input
# slice and output path; a shared helper function would remove the copy.
data = []
for matches, test_diff in zip(similar_diff, test_diff_data):
    item = {'org_diff': test_diff.page_content}
    for index, can_diff in enumerate(matches):
        found = can_diff is not None
        item[f'sim_diff_{index}'] = can_diff.page_content if found else None
        item[f'sim_msg_{index}'] = msg_data[can_diff.metadata['seq_num']] if found else None
    data.append(item)

# Persist the records as pretty-printed UTF-8 JSON.
with open(
    '../data/chronicle/rag_baseline/top4_rag/rag_baseline_top4_js.json',
    'w',
    encoding='UTF-8',
) as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)