In [13]:
from langchain_community.document_loaders import JSONLoader
from tqdm.auto import tqdm
import json
import pickle
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import glob
import os
from collections import defaultdict

## JS Part

In [14]:
# Directory that is scanned for the JSON database files.
folder_path = '../data/vdo_filtered/type_db'

# Every file in that directory whose name ends with "db.json".
db_file_pattern = os.path.join(folder_path, '*db.json')
files = glob.glob(db_file_pattern)

In [15]:
# Checkpoint used to embed every diff in this notebook.
# Other candidate checkpoints, kept for reference:
#   "mixedbread-ai/mxbai-embed-large-v1"
#   "mchochlov/codebert-base-cd-ft"
#   "microsoft/unixcoder-base"
#   "codecompletedeployment/st-codesearch-distilroberta-base"
#   "sentence-transformers/all-MiniLM-L6-v2"
#   "../models/models-e5-v2-finetuned-10w-epoch20"
modelPath = "intfloat/e5-small-v2"

# Model options: request the 'cuda' device and pass trust_remote_code=True
# through to the model loader.
model_kwargs = dict(device='cuda', trust_remote_code=True)

# Encoding options: normalize_embeddings is switched ON.
encode_kwargs = dict(normalize_embeddings=True)

# Embedding wrapper shared by all vector stores below; checkpoints are cached
# under ../models.
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    cache_folder='../models',
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [14]:
def similarity_search(documents, db):
    """Return the ``seq_num`` whose hits score highest across all documents.

    Each document's ``page_content`` is queried against ``db`` with
    ``similarity_search_with_relevance_scores`` (``score_threshold=0.0``).
    The relevance scores of all hits are summed per ``metadata['seq_num']``.

    Args:
        documents: iterable of objects exposing ``page_content``.
        db: object exposing ``similarity_search_with_relevance_scores`` that
            returns ``(document, score)`` pairs.

    Returns:
        The ``seq_num`` with the largest summed score, or ``-1`` when no query
        produced any hit.
    """
    totals = {}

    for query_doc in documents:
        hits = db.similarity_search_with_relevance_scores(query_doc.page_content, score_threshold=0.0)
        for matched_doc, relevance in hits:
            seq_num = matched_doc.metadata['seq_num']
            totals[seq_num] = totals.get(seq_num, 0) + relevance

    # Nothing was retrieved for any query.
    if not totals:
        return -1

    return max(totals, key=totals.get)

In [19]:
# The classified test file is read twice below: once through JSONLoader and
# once as plain JSON.
classified_test_path = '../data/vdo_filtered/classification/dev_test_with_classification.json'

# One document per record, built from that record's `.diff` field.
test_diff_loader = JSONLoader(
    file_path=classified_test_path,
    jq_schema='.[].diff',
    text_content=False,
)
test_diff_data = test_diff_loader.load()

# The full records, so the remaining fields stay available.
with open(classified_test_path, 'r', encoding='UTF-8') as f:
    test_data = json.load(f)

## Retrieve the most similar diff by type

In [26]:
# Bucket the test records by their 'classifier_type' so each bucket can be
# matched against the database built from the file of the same type.
grouped_data = defaultdict(list)
for entry in test_data:
    grouped_data[entry['classifier_type']].append(entry)

for file in files:
    # Two parallel views of the same file: the diffs (which get embedded) and
    # the messages (looked up by position once a diff has been retrieved).
    diff_loader = JSONLoader(
        file_path=file,
        jq_schema='.[].diff',
        text_content=False)
    diff_data = diff_loader.load()

    msg_loader = JSONLoader(
        file_path=file,
        jq_schema='.[].msg',
        text_content=False)
    msg_data = msg_loader.load()

    # The type is the part of the file name before the first underscore.
    # os.path.basename is used instead of splitting on '\\' so the file name
    # is isolated correctly on POSIX as well as on Windows.
    type_name = os.path.basename(file).split('_')[0]

    # Reuse the persisted vector store when it exists; otherwise embed the
    # diffs and persist them.
    persist_dir = f"../data/vdo_filtered/type_db/rag_{type_name}_db_e5"
    if os.path.exists(persist_dir):
        db = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
    else:
        db = Chroma.from_documents(diff_data, embeddings, persist_directory=persist_dir)

    retriever = db.as_retriever()
    type_entries = grouped_data[type_name]

    # NOTE(review): each record's own 'seq_num' is used directly as an index
    # into test_diff_data, whereas the retrieved documents' metadata 'seq_num'
    # is shifted by one below -- confirm the record field is 0-based.
    indexs = [item['seq_num'] for item in type_entries]

    # Keep only the top-ranked hit for every test diff of this type.
    similar_diff = []
    for index in tqdm(indexs, total=len(indexs), desc="Processing documents"):
        similar_diff.append(retriever.invoke(test_diff_data[index].page_content)[0])

    # Attach the retrieved diff and its message to the test record.
    # metadata['seq_num'] - 1 is used as the position in diff_data / msg_data
    # (presumably the loader numbers documents from 1 -- verify).
    for sim_diff, test_diff in zip(similar_diff, type_entries):
        source_idx = sim_diff.metadata['seq_num'] - 1
        test_diff['sim_msg'] = msg_data[source_idx].page_content
        test_diff['sim_diff'] = diff_data[source_idx].page_content

Processing documents:   0%|          | 0/74 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/52 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/58 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/26 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/15 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/80 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/5 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/76 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/33 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/24 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/7 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/43 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/56 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/17 [00:00<?, ?it/s]

In [27]:
# Flatten the per-type buckets back into a single list of records ...
values_list = [
    record
    for bucket in grouped_data.values()
    for record in bucket
]
# ... and order the records by their 'seq_num'.
sorted_items = sorted(values_list, key=lambda record: record['seq_num'])

In [28]:

# Persist the records (now carrying 'sim_msg' / 'sim_diff') as indented,
# non-ASCII-escaped JSON.
with open(
    '../data/vdo_filtered/generation/rag/dev_test_model_classified_rag_prompt.json',
    'w',
    encoding='UTF-8',
) as f:
    json.dump(sorted_items, f, indent=4, ensure_ascii=False)

In [21]:
version = 'e5'
generation_json_path = f'../data/angular_filtered/subsets/generation/dev_test_gpt35_model_classified_rag_{version}.json'
generation_txt_path = f'../data/angular_filtered/subsets/generation/dev_test_gpt35_model_classified_rag_{version}.txt'

# Load the generated records.
with open(generation_json_path, 'r', encoding='UTF-8') as f:
    model_data = json.load(f)

# Order them by 'seq_num' and overwrite the JSON file with the sorted list.
model_data_sorted = sorted(model_data, key=lambda record: record['seq_num'])
with open(generation_json_path, 'w', encoding='UTF-8') as f:
    json.dump(model_data_sorted, f, indent=4, ensure_ascii=False)

# Also write one 'chatgpt_rag' value per line; line breaks inside a value are
# written as the two-character sequences \n and \r so each record stays on
# its own line.
with open(generation_txt_path, 'w', encoding='UTF-8') as f:
    for item in model_data_sorted:
        escaped = item['chatgpt_rag'].replace('\n', '\\n').replace('\r', '\\r')
        f.write(escaped + '\n')

In [20]:
from collections import defaultdict

# Example defaultdict: every key maps to a list of records.
example_dict = defaultdict(list, {
    'type1': [{'seq_num': 2, 'data': 'a'}, {'seq_num': 1, 'data': 'b'}],
    'type2': [{'seq_num': 4, 'data': 'c'}, {'seq_num': 3, 'data': 'd'}],
    # data for other types ...
})

# Collects the records of every key, one key after another.
merged_data = []

# Sort each key's records by seq_num and append them to merged_data.
# The sort is per key, not global: the result is only fully ordered when the
# keys' seq_num ranges do not interleave, as in this example.
# No intermediate name is bound here, so this scratch cell cannot overwrite
# the notebook-level `sorted_items` that the export cell writes to disk.
for items in example_dict.values():
    merged_data.extend(sorted(items, key=lambda x: x['seq_num']))

# Show the result.
print(merged_data)


[{'seq_num': 1, 'data': 'b'}, {'seq_num': 2, 'data': 'a'}, {'seq_num': 3, 'data': 'd'}, {'seq_num': 4, 'data': 'c'}]


In [None]:
file = '../data/angular_filtered/subsets/db_data.json'

# Splitter configured for JS: 1000-character chunks with a 200-character overlap.
splitter = RecursiveCharacterTextSplitter.from_language(Language.JS, chunk_size=1000, chunk_overlap=200)

# One document per record, built from the `.diff` field ...
diff_loader = JSONLoader(
    file_path=file,
    jq_schema='.[].diff',
    text_content=False)
diff_data = diff_loader.load()

# ... and, in parallel, one per record from the `.msg` field.
msg_loader = JSONLoader(
    file_path=file,
    jq_schema='.[].msg',
    text_content=False)
msg_data = msg_loader.load()

# Split every diff into chunks; documents are fed one at a time so the
# progress bar advances per document.
diff_split = []
for doc in tqdm(diff_data, total=len(diff_data), desc="Processing documents"):
    diff_split += splitter.split_documents([doc])

# Embed the chunks and persist them as a single all-types vector store.
# (The former `type = file.split('\\')...` line was removed: its value was
# never used, it shadowed the builtin `type`, and it only isolated the file
# name on Windows.)
db = Chroma.from_documents(diff_split, embeddings, persist_directory="../data/angular_filtered/subsets/type_db/rag_all_types_db_e5")

In [4]:
# Load the document, split it into chunks, embed each chunk and load it into the vector store.

# db = Chroma.from_documents(diff_data, embeddings, persist_directory="./final_js_rag_single_diff_db_300_distilroberta")

In [17]:
# db = Chroma(persist_directory="../data/angular_filtered/rag/rag_db_e5", embedding_function=embeddings)

In [8]:
import re
def process_diff(diff_text):
    """Rewrite a tokenised diff string into a more compact form.

    Steps, applied in this order:
      1. every ``<nl>`` token becomes a newline;
      2. whitespace around a single non-word, non-space character is removed
         when that character sits between two non-space characters;
      3. any whitespace run following a newline is dropped (this also
         collapses blank lines);
      4. ``mmm`` is replaced by ``diff --git``;
      5. every ``"\\nppp "`` sequence is deleted.

    Args:
        diff_text: the diff as a string.

    Returns:
        The rewritten string.
    """
    text = diff_text.replace('<nl>', '\n')

    # Regex passes run before the literal replacements, so the text inserted
    # by those replacements is never touched by the punctuation squeeze.
    regex_steps = (
        (r'(?<=\S)\s*([^\w\s])\s*(?=\S)', r'\1'),
        (r'\n\s+', r'\n'),
    )
    for pattern, replacement in regex_steps:
        text = re.sub(pattern, replacement, text)

    literal_steps = (
        ('mmm', 'diff --git'),
        ('\nppp ', ''),
    )
    for marker, replacement in literal_steps:
        text = text.replace(marker, replacement)

    return text

## All-types part

In [4]:
# The same file is loaded twice so that diffs and messages end up in two
# parallel lists.
db_data_path = '../data/vdo_filtered/db_data.json'

# Documents built from each record's `.diff` field.
diff_loader = JSONLoader(
    file_path=db_data_path,
    jq_schema='.[].diff',
    text_content=False,
)
diff_data = diff_loader.load()

# Documents built from each record's `.msg` field.
msg_loader = JSONLoader(
    file_path=db_data_path,
    jq_schema='.[].msg',
    text_content=False,
)
msg_data = msg_loader.load()

In [7]:
# The test file is read twice below: once through JSONLoader and once as
# plain JSON.
dev_test_path = '../data/vdo_filtered/dev_test_data.json'

# One document per record, built from that record's `.diff` field.
test_diff_loader = JSONLoader(
    file_path=dev_test_path,
    jq_schema='.[].diff',
    text_content=False,
)
test_diff_data = test_diff_loader.load()

# The full records, so the remaining fields stay available.
with open(dev_test_path, 'r', encoding='UTF-8') as f:
    test_data = json.load(f)

In [5]:
# All-types vector store; the diffs are indexed whole (no chunking).
persist_dir = "../data/vdo_filtered/type_db/rag_all_types_db_e5_nochunk"

# Reuse the persisted store when it already exists, as the per-type loop
# does. Calling from_documents unconditionally would embed and add every
# document again each time this cell is re-run.
if os.path.exists(persist_dir):
    db = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
else:
    db = Chroma.from_documents(diff_data, embeddings, persist_directory=persist_dir)

In [8]:
retriever = db.as_retriever()

# For every test diff keep only the first document the retriever returns.
similar_diff = [
    retriever.invoke(query_doc.page_content)[0]
    for query_doc in tqdm(test_diff_data, total=len(test_diff_data), desc="Processing documents")
]

Processing documents:   0%|          | 0/567 [00:00<?, ?it/s]

In [10]:
# Pair each retrieved document with its test record and copy the matching
# message and diff text onto the record. metadata['seq_num'] - 1 is used as
# the position in msg_data / diff_data.
for sim_diff, test_item in zip(similar_diff, test_data):
    source_idx = sim_diff.metadata['seq_num'] - 1
    test_item.update(
        sim_msg=msg_data[source_idx].page_content,
        sim_diff=diff_data[source_idx].page_content,
    )

In [11]:
# Persist the test records (now carrying 'sim_msg' / 'sim_diff') as indented,
# non-ASCII-escaped JSON.
with open(
    '../data/vdo_filtered/generation/rag/dev_test_rag_prompt.json',
    'w',
    encoding='UTF-8',
) as f:
    json.dump(test_data, f, indent=4, ensure_ascii=False)

In [12]:
# Fraction of test records whose message starts with the same word as the
# message of the retrieved record.
count = 0
for item in test_data:
    msg = item['msg']
    # split() yields no words for whitespace-only text, so check the word
    # list itself rather than the raw string before taking element 0.
    msg_words = msg.split() if msg else []
    first_word = msg_words[0] if msg_words else ''

    sim_msg = item['sim_msg']
    sim_words = sim_msg.split() if sim_msg else []
    sim_first_word = sim_words[0] if sim_words else ''

    if first_word == sim_first_word:
        count += 1

print(count / len(test_data))

0.2698412698412698
