In [1]:
from langchain_community.document_loaders import JSONLoader
from tqdm.auto import tqdm
import json
import pickle
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import glob
import os

## JS Part

In [5]:
# Directory that holds the per-subset JSON files; only names ending in "db.json" are picked up.
folder_path = '../data/angular_filtered/subsets/type_db'
# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes the
# per-file loop below visit the files in the same order on every machine, so the
# `db` / `diff_data` values it leaves behind are reproducible.
files = sorted(glob.glob(os.path.join(folder_path, '*db.json')))

'build'

In [2]:
# Embedding model used by the vector stores built in the following cells.
# Other checkpoints that were tried here:
#   "mixedbread-ai/mxbai-embed-large-v1"
#   "mchochlov/codebert-base-cd-ft"
#   "microsoft/unixcoder-base"
#   "codecompletedeployment/st-codesearch-distilroberta-base"
#   "sentence-transformers/all-MiniLM-L6-v2"
#   "../models/models-e5-v2-finetuned-10w-epoch20"
modelPath = "intfloat/e5-small-v2"

# Model options: run on the CUDA device with trust_remote_code enabled.
model_kwargs = dict(device='cuda', trust_remote_code=True)

# Encoding options: embeddings ARE normalized (normalize_embeddings=True).
encode_kwargs = dict(normalize_embeddings=True)

# Wrap the checkpoint as a LangChain embedding function; '../models' is passed
# as the cache folder.
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    cache_folder='../models',
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [3]:
# Build a single Chroma store over the diffs of the combined corpus file.
file = '../data/angular_filtered/subsets/db_data.json'
splitter = RecursiveCharacterTextSplitter.from_language(Language.JS, chunk_size=1000, chunk_overlap=200)

# Diffs and messages are read from the same file with the same `.[]` iteration.
# The retrieval cell later indexes both lists with a retrieved chunk's
# `seq_num`, so they must stay in the same order.
diff_loader = JSONLoader(
    file_path=file,
    jq_schema='.[].diff',
    text_content=False)
diff_data = diff_loader.load()

msg_loader = JSONLoader(
    file_path=file,
    jq_schema='.[].msg',
    text_content=False)
msg_data = msg_loader.load()

# Split every diff into overlapping chunks; all chunks go into one flat list.
diff_split = []
for doc in tqdm(diff_data, total=len(diff_data), desc="Processing documents"):
    diff_split.extend(splitter.split_documents([doc]))

# Embed the chunks and persist the store. The directory name is fixed for this
# corpus, so no subset name is derived from the file path here.
db = Chroma.from_documents(diff_split, embeddings, persist_directory="../data/angular_filtered/subsets/type_db/rag_all_types_db_e5")

Processing documents:   0%|          | 0/54598 [00:00<?, ?it/s]

In [13]:
# Build one Chroma store per file in `files`, each persisted under
# rag_<name>_db_e5 where <name> comes from the file name.
#
# NOTE(review): every iteration overwrites `diff_data` and `db`, but `msg_data`
# is not reloaded here. Cells that index `msg_data` and `diff_data` with the
# same `seq_num` will be misaligned if they run after this loop -- confirm the
# intended execution order.
splitter = RecursiveCharacterTextSplitter.from_language(Language.JS, chunk_size=1000, chunk_overlap=200)
for file in files:
    diff_loader = JSONLoader(
        file_path=file,
        jq_schema='.[].diff',
        text_content=False)
    diff_data = diff_loader.load()

    # Split every diff into overlapping chunks; all chunks go into one flat list.
    diff_split = []
    for doc in tqdm(diff_data, total=len(diff_data), desc="Processing documents"):
        diff_split.extend(splitter.split_documents([doc]))

    # The subset name is the second "_"-separated token of the FILE NAME.
    # os.path.basename handles the platform's path separator; splitting on "\\"
    # only worked on Windows and, elsewhere, tokenised the whole path (picking up
    # pieces of directory names such as "angular_filtered").
    type_name = os.path.basename(file).split('_')[1]
    db = Chroma.from_documents(diff_split, embeddings, persist_directory=f"../data/angular_filtered/subsets/type_db/rag_{type_name}_db_e5")

Processing documents:   0%|          | 0/1784 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/9000 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/1618 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/9000 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/9000 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/9000 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/513 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/7811 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/1157 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/5715 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/11000 [00:00<?, ?it/s]

In [16]:
# from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"], base_url="https://api.chatanywhere.cn/v1", model="text-embedding-3-small")

In [4]:
# Load the document, split it into chunks, embed each chunk and load it into the vector store.

# db = Chroma.from_documents(diff_data, embeddings, persist_directory="./final_js_rag_single_diff_db_300_distilroberta")

In [17]:
# db = Chroma(persist_directory="../data/angular_filtered/rag/rag_db_e5", embedding_function=embeddings)

In [4]:
# Evaluation file. It is read twice -- once through JSONLoader (diff documents)
# and once as raw JSON (full records) -- so the path is defined once to keep the
# two reads pointing at the same file.
test_file = '../data/angular_filtered/subsets/test_dev.json'

test_diff_loader = JSONLoader(
    file_path=test_file,
    jq_schema='.[].diff',
    text_content=False)
test_diff_data = test_diff_loader.load()

with open(test_file, 'r', encoding='UTF-8') as f:
    test_data = json.load(f)

# Later cells zip the retrieval results for `test_diff_data` with `test_data`;
# zip would silently truncate on a length mismatch, so fail loudly here instead.
assert len(test_diff_data) == len(test_data), (
    f"loaded {len(test_diff_data)} diff documents but {len(test_data)} records from {test_file}"
)

In [8]:
import re
def process_diff(diff_text):
    """Rewrite a tokenised diff string into a more compact form.

    Steps, applied in this order:
      1. every ``<nl>`` marker becomes a newline;
      2. whitespace (newlines included) on both sides of a punctuation
         character is dropped when the punctuation has non-space text on both
         sides;
      3. whitespace that directly follows a newline is removed;
      4. ``mmm`` is replaced by ``diff --git``;
      5. each occurrence of newline + ``ppp `` is deleted, which joins that
         line onto the end of the preceding one.

    Args:
        diff_text: the raw diff string.

    Returns:
        The rewritten string.
    """
    text = diff_text.replace('<nl>', '\n')
    # Collapse padding around a single non-word, non-space character.
    text = re.sub(r'(?<=\S)\s*([^\w\s])\s*(?=\S)', r'\1', text)
    # Strip indentation (and blank lines) after each newline.
    text = re.sub(r'\n\s+', r'\n', text)
    return text.replace('mmm', 'diff --git').replace('\nppp ', '')

In [7]:
def similarity_search(documents, db):
    """Pick the stored item whose chunks best match a group of query chunks.

    Each query chunk is searched in ``db`` with
    ``similarity_search_with_relevance_scores`` (score threshold 0.0). The
    relevance scores of all hits are summed per ``metadata['seq_num']`` across
    every query chunk, and the ``seq_num`` with the largest total wins.

    Args:
        documents: iterable of objects exposing ``page_content``.
        db: vector store exposing ``similarity_search_with_relevance_scores``.

    Returns:
        The winning ``seq_num``, or ``-1`` when no query chunk produced a hit.
    """
    totals = {}
    for chunk in documents:
        hits = db.similarity_search_with_relevance_scores(chunk.page_content, score_threshold=0.0)
        for match, relevance in hits:
            seq_num = match.metadata['seq_num']
            totals[seq_num] = totals.get(seq_num, 0) + relevance

    # `default` covers the "nothing retrieved" case with the -1 sentinel.
    return max(totals, key=totals.get, default=-1)

In [8]:
# For every test diff, find the best-matching corpus item: split the diff into
# chunks and let similarity_search aggregate the per-chunk hits.
similar_diff_id = []
for diff_doc in tqdm(test_diff_data, total=len(test_diff_data), desc="Processing documents"):
    documents = splitter.split_documents([diff_doc])
    similar_diff_id.append(similarity_search(documents, db))

# Pair each test record with the message/diff of its retrieved neighbour.
data = []
for sim_diff_id, test_diff in zip(similar_diff_id, test_data):
    if sim_diff_id == -1:
        # similarity_search returns -1 when nothing was retrieved. Indexing with
        # -1 - 1 would silently pick the second-to-last corpus item, so emit
        # empty neighbour fields instead.
        matched_msg, matched_diff = '', ''
    else:
        # The stored seq_num is treated as 1-based here, hence the -1 offset.
        matched_msg = msg_data[sim_diff_id - 1].page_content
        matched_diff = diff_data[sim_diff_id - 1].page_content
    data.append({'diff': test_diff['diff'],
                 'msg': test_diff['msg'],
                 'sim_msg': matched_msg,
                 'sim_diff': matched_diff})

# Write the data to a JSON file
with open('../data/angular_filtered/subsets/generation/rag_prompt/dev_test_rag_prompt.json', 'w', encoding='UTF-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

Processing documents:   0%|          | 0/603 [00:00<?, ?it/s]

In [29]:
# Load the reference retrieval results that the hit-rate check compares against.
# Alternative reference file used previously:
#   "../data/final_preprocessed_data/js_baseline/js_baseline_test_data_300_golden.json"
with open("../data/final_preprocessed_data/discriminator/dis_train_data_retrieve_openai_embedding.json", 'r', encoding='UTF-8') as f:
    golden_data = json.loads(f.read())

In [33]:
# Count how many test diffs have their golden `sim_diff` among the top-100
# documents returned by the current store.
retriever = db.as_retriever(search_kwargs={"k": 100})
sim_count = 0
paired = zip(test_diff_data, golden_data)
for diff_doc, golden in tqdm(paired, total=len(test_diff_data), desc="Processing documents"):
    retrieved = retriever.invoke(diff_doc.page_content)
    # any() stops at the first exact text match, so each test diff counts once.
    if any(golden['sim_diff'] == res.page_content for res in retrieved):
        sim_count += 1

print(sim_count)

Processing documents:   0%|          | 0/500 [00:00<?, ?it/s]

10


In [16]:
# Display the `sim_diff` text of the first golden record to inspect its format.
golden_data[0]['sim_diff']

"diff --git a/test/custom-pages.js b/test/custom-pages.js @@ -31,6 +31,8 @@ describe('custom-pages', function() {\n}\n}\n});\n+\n+ assert(apos && apos.__meta.name === 'apostrophe');\n});\nit('should fire a dispatch route for its homepage', async function() {\n"

In [19]:

# NOTE(review): `similar_diff` is not assigned in any visible cell (only
# `similar_diff_id` is). It looks like a list of retrieved documents, one per
# test record -- confirm where it comes from before running on a fresh kernel.
#
# Each output record pairs a test diff/message with the message and diff found
# at the retrieved document's `seq_num` (offset by one for list indexing).
data = [
    {
        'org_diff': test_diff['diff'],
        'org_msg': test_diff['msg'],
        'sim_msg': msg_data[sim_diff.metadata['seq_num'] - 1].page_content,
        'sim_diff': diff_data[sim_diff.metadata['seq_num'] - 1].page_content,
    }
    for sim_diff, test_diff in zip(similar_diff, test_data)
]

# Write the data to a JSON file
with open('../data/final_preprocessed_data/discriminator/dis_train_data_retrieve_openai_embedding.json', 'w', encoding='UTF-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

## JS Part

In [3]:
# Load the diffs of the JS-300 corpus, one document per record.
diff_loader = JSONLoader(
    text_content=False,
    jq_schema='.[].diff',
    file_path='../data/final_preprocessed_data/js_rag_db_data_300.json')
diff_data = diff_loader.load()

# The diffs are embedded whole (no chunking step in this cell) and the store is
# persisted next to the notebook.
db = Chroma.from_documents(
    documents=diff_data,
    embedding=embeddings,
    persist_directory="./final_js_rag_single_diff_db_300",
)

In [4]:
# Display the first loaded diff document as a quick check of the loader output.
diff_data[0]

# SIM_MSG

In [2]:
# Load the commit messages of the Python corpus, one document per record.
msg_loader = JSONLoader(
    text_content=False,
    jq_schema='.[].msg',
    file_path='../data/final_preprocessed_data/python_rag_db_data.json')
msg_data = msg_loader.load()

# Embedding checkpoint used for the message stores.
modelPath = "mixedbread-ai/mxbai-embed-large-v1"

# Model options: run on the CUDA device (not the CPU) with trust_remote_code enabled.
model_kwargs = dict(device='cuda', trust_remote_code=True)

# Encoding options: embeddings ARE normalized (normalize_embeddings=True).
encode_kwargs = dict(normalize_embeddings=True)

# Wrap the checkpoint as a LangChain embedding function; '../models' is passed
# as the cache folder.
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    cache_folder='../models',
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [3]:
# Embed the loaded commit messages as-is (no chunking here) and persist them as
# the Python message store.
db = Chroma.from_documents(
    documents=msg_data,
    embedding=embeddings,
    persist_directory="./final_python_rag_single_diff_sim_msg_db",
)

In [5]:
# Load the commit messages of the JS corpus, one document per record.
msg_loader = JSONLoader(
    text_content=False,
    jq_schema='.[].msg',
    file_path='../data/final_preprocessed_data/js_rag_db_data.json')
msg_data = msg_loader.load()

# Embed the messages as-is (no chunking here) and persist them as the JS
# message store.
db = Chroma.from_documents(
    documents=msg_data,
    embedding=embeddings,
    persist_directory="./final_js_rag_single_diff_sim_msg_db",
)