In [1]:
from langchain_community.document_loaders import JSONLoader
from tqdm.auto import tqdm
import json
import pickle
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

In [2]:
# The three loaders below read the same JSON file and differ only in the jq
# expression that picks one field out of every record.
def _chronicle_loader(jq_schema):
    """Return a JSONLoader over the chronicle RAG file for the given jq expression."""
    return JSONLoader(
        file_path='../data/chronicle/chronicle_rag_db.json',
        jq_schema=jq_schema,
        text_content=False,
    )


# One Document per record, holding the record's `diff` field.
diff_loader = _chronicle_loader('.[].diff')
diff_data = diff_loader.load()

# One Document per record, holding the record's `msg` field.
msg_loader = _chronicle_loader('.[].msg')
msg_data = msg_loader.load()

# One Document per record, holding the record's `lang` field.
language_loader = _chronicle_loader('.[].lang')
language_data = language_loader.load()

In [3]:
# Hugging Face identifier of the pre-trained embedding model to use
modelPath = "mixedbread-ai/mxbai-embed-large-v1"

# Model configuration options: computations run on the 'cuda' device (GPU, not CPU),
# and 'trust_remote_code' is enabled for the model loader
model_kwargs = {'device':'cuda', 'trust_remote_code': True}

# Encoding options: 'normalize_embeddings' is set to True
encode_kwargs = {'normalize_embeddings': True}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    cache_folder = '../models',  # Directory passed as the model cache location
    
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

# Python part

In [3]:
# Keep only the message documents whose record at the same position in
# `language_data` is labelled 'Python'.
msg_python = [
    doc
    for idx, doc in tqdm(enumerate(msg_data), total=len(msg_data), desc="Processing documents")
    if language_data[idx].page_content == 'Python'
]

In [6]:
# Embed the Python commit messages and store them in a Chroma collection
# persisted under the given directory.
db = Chroma.from_documents(
    documents=msg_python,
    embedding=embeddings,
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_msg_python",
)

In [7]:
# Re-open a persisted Chroma store and rebind `db` to it for the retrieval that follows.
# NOTE(review): this opens the "..._msg_js" directory, whereas the store built
# just above in this section is persisted under "..._msg_python" — confirm
# which store this section is meant to query. On a fresh run the "_js"
# directory is only populated by the JavaScript section further down.
db = Chroma(persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_msg_js", embedding_function=embeddings)

In [8]:
# The scenario file is read twice: once through JSONLoader (one Document per
# record, holding the `chatgpt` field) and once as plain JSON so the other
# fields of each record stay accessible.
scenario_path = '../data/scenario/scenario_react_chatgpt_zeroshot.json'

test_diff_loader = JSONLoader(
    file_path=scenario_path,
    jq_schema='.[].chatgpt',
    text_content=False,
)
test_diff_data = test_diff_loader.load()

with open(scenario_path, 'r', encoding='UTF-8') as scenario_file:
    test_data = json.load(scenario_file)

In [9]:
# For each generated message, look up the closest stored commit message and
# write the paired (original diff, generated message, similar message,
# similar diff) records to a JSON file.
# `db` must already be bound to the Chroma store to query; it is whatever the
# most recently executed `db = ...` cell opened.

# Only the first MAX_QUERIES scenario entries are processed.
MAX_QUERIES = 100
query_docs = test_diff_data[:MAX_QUERIES]

retriever = db.as_retriever()

similar_msg = []
for query_doc in tqdm(query_docs, desc="Processing documents"):
    hits = retriever.invoke(query_doc.page_content)
    if not hits:
        # Fail with an actionable message instead of a bare
        # "list index out of range" when the store has nothing to return.
        raise IndexError(
            "Retriever returned no documents for scenario entry "
            f"seq_num={query_doc.metadata.get('seq_num')}; is the vector store empty?"
        )
    # Keep only the top-ranked hit.
    similar_msg.append(hits[0])

# `seq_num` is the record number JSONLoader stores in the metadata; the code
# treats it as 1-based, hence the `- 1` when indexing back into the lists.
data = [
    {
        'org_diff': test_data[query_doc.metadata['seq_num'] - 1]['diff'],
        'org_msg': query_doc.page_content,
        'sim_msg': sim_msg.page_content,
        'sim_diff': diff_data[sim_msg.metadata['seq_num'] - 1].page_content,
    }
    for sim_msg, query_doc in zip(similar_msg, query_docs)
]

# Write the data to a JSON file
with open('../data/scenario/scenario_react_chatgpt_rag_sim_msg_prompt.json', 'w', encoding='UTF-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

# JavaScript part

In [10]:
# Keep only the message documents whose record at the same position in
# `language_data` is labelled 'JavaScript'.
msg_js = [
    doc
    for idx, doc in tqdm(enumerate(msg_data), total=len(msg_data), desc="Processing documents")
    if language_data[idx].page_content == 'JavaScript'
]

In [11]:
# Embed the JavaScript commit messages and store them in a Chroma collection
# persisted under the given directory.
db = Chroma.from_documents(
    documents=msg_js,
    embedding=embeddings,
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_msg_js",
)

In [8]:
# Re-open the persisted JavaScript message store and bind it to `db`.
db = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_msg_js",
)

In [9]:
# The baseline file is read twice: once through JSONLoader (one Document per
# record, holding the `chatgpt` field) and once as plain JSON so the other
# fields of each record stay accessible.
baseline_path = '../data/chronicle/rag_baseline/zeroshot/rag_baseline_js_chatgpt.json'

test_diff_loader = JSONLoader(
    file_path=baseline_path,
    jq_schema='.[].chatgpt',
    text_content=False,
)
test_diff_data = test_diff_loader.load()

with open(baseline_path, 'r', encoding='UTF-8') as baseline_file:
    test_data = json.load(baseline_file)

In [10]:
# For each generated message, look up the closest stored commit message and
# write the paired (original diff, generated message, similar message,
# similar diff) records to a JSON file.
# `db` must already be bound to the Chroma store to query; it is whatever the
# most recently executed `db = ...` cell opened.
retriever = db.as_retriever()

similar_msg = []
for query_doc in tqdm(test_diff_data, desc="Processing documents"):
    hits = retriever.invoke(query_doc.page_content)
    if not hits:
        # Fail with an actionable message instead of a bare
        # "list index out of range" when the store has nothing to return.
        raise IndexError(
            "Retriever returned no documents for baseline entry "
            f"seq_num={query_doc.metadata.get('seq_num')}; is the vector store empty?"
        )
    # Keep only the top-ranked hit.
    similar_msg.append(hits[0])

# `seq_num` is the record number JSONLoader stores in the metadata; the code
# treats it as 1-based, hence the `- 1` when indexing back into the lists.
data = [
    {
        'org_diff': test_data[query_doc.metadata['seq_num'] - 1]['org_diff'],
        'org_msg': query_doc.page_content,
        'sim_msg': sim_msg.page_content,
        'sim_diff': diff_data[sim_msg.metadata['seq_num'] - 1].page_content,
    }
    for sim_msg, query_doc in zip(similar_msg, test_diff_data)
]

# Write the data to a JSON file
with open('../data/chronicle/rag_baseline/sim_msg_rag/rag_baseline_sim_msg_js.json', 'w', encoding='UTF-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)