In [1]:
from langchain_community.document_loaders import JSONLoader
from tqdm.auto import tqdm
import json
import pickle
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


## Python Part

In [2]:
# Read the Python dataset and pull the `diff` field out of every top-level
# record (jq expression `.[].diff`); text_content=False lets the loader accept
# values that are not plain strings.
diff_loader = JSONLoader(
    file_path='../data/final_preprocessed_data/python_rag_db_data.json',
    jq_schema='.[].diff',
    text_content=False,
)
diff_data = diff_loader.load()

In [2]:
# Only Python and JavaScript splitters are built in this notebook; a wider
# Java / C++ / C# / Python / JavaScript configuration is not enabled.
languages = [Language.PYTHON, Language.JS]

# One language-aware splitter per entry of `languages`, in the same order,
# each configured with chunk_size=500 and chunk_overlap=50.
splitters = [
    RecursiveCharacterTextSplitter.from_language(
        language=lang,
        chunk_size=500,
        chunk_overlap=50,
    )
    for lang in languages
]

# Language name -> index of the matching splitter in `splitters`.
language_dict = dict(Python=0, JavaScript=1)

In [4]:
# Split every loaded Python diff document into chunks with the Python splitter.
# NOTE(review): the vector stores in this notebook are built from `diff_data`,
# not from `diff_split` — confirm whether the chunked list is still needed.

# The splitter lookup does not change between documents, so resolve it once.
python_splitter = splitters[language_dict['Python']]

diff_split = []
# Iterating the list directly (no enumerate: the index was never used) lets
# tqdm read the total from len(diff_data) by itself.
for doc in tqdm(diff_data, desc="Processing documents"):
    diff_split.extend(python_splitter.split_documents([doc]))

In [6]:
# Display how many chunks the splitting loop produced
len(diff_split)

In [2]:
# Name of the pre-trained embedding model to load
modelPath = "mixedbread-ai/mxbai-embed-large-v1"

# Model options: place the model on the GPU ('cuda') and permit the model's
# remote code to be executed (trust_remote_code=True)
model_kwargs = dict(device='cuda', trust_remote_code=True)

# Encoding options: embedding normalization is switched ON
encode_kwargs = dict(normalize_embeddings=True)

# Build the HuggingFaceEmbeddings wrapper from the settings above;
# cache_folder points at ../models
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    cache_folder='../models',
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [4]:
# Embed the Python diff documents exactly as loaded (`diff_data`, i.e. without
# chunking) and build a Chroma store using the given persist directory.
db = Chroma.from_documents(
    documents=diff_data,
    embedding=embeddings,
    persist_directory="./final_python_rag_single_diff_nochunk_db",
)

## JS Part

In [3]:
# Pull the `diff` field out of every record of the JavaScript dataset file
# js_rag_db_data_300.json.
diff_loader = JSONLoader(
    file_path='../data/final_preprocessed_data/js_rag_db_data_300.json',
    jq_schema='.[].diff',
    text_content=False,
)
diff_data = diff_loader.load()

# No chunking is applied here: the documents are embedded as loaded and a
# Chroma store is built using the given persist directory.
db = Chroma.from_documents(
    documents=diff_data,
    embedding=embeddings,
    persist_directory="./final_js_rag_single_diff_db_300",
)

In [4]:
# Display the first loaded document as a quick sanity check
diff_data[0]

# SIM_MSG: vector stores built from the `msg` field

In [2]:
# Pull the `msg` field out of every record of the Python dataset.
msg_loader = JSONLoader(
    file_path='../data/final_preprocessed_data/python_rag_db_data.json',
    jq_schema='.[].msg',
    text_content=False,
)
msg_data = msg_loader.load()

# Name of the pre-trained embedding model to load
modelPath = "mixedbread-ai/mxbai-embed-large-v1"

# Model options: place the model on the GPU ('cuda') and permit the model's
# remote code to be executed (trust_remote_code=True)
model_kwargs = dict(device='cuda', trust_remote_code=True)

# Encoding options: embedding normalization is switched ON
encode_kwargs = dict(normalize_embeddings=True)

# Build the HuggingFaceEmbeddings wrapper from the settings above;
# cache_folder points at ../models
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    cache_folder='../models',
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [3]:
# Embed the Python `msg` documents as loaded (no chunking step) and build a
# Chroma store using the given persist directory.
db = Chroma.from_documents(
    documents=msg_data,
    embedding=embeddings,
    persist_directory="./final_python_rag_single_diff_sim_msg_db",
)

In [5]:
# Pull the `msg` field out of every record of the JavaScript dataset.
msg_loader = JSONLoader(
    file_path='../data/final_preprocessed_data/js_rag_db_data.json',
    jq_schema='.[].msg',
    text_content=False,
)
msg_data = msg_loader.load()

# Embed the JavaScript `msg` documents as loaded (no chunking step) and build
# a Chroma store using the given persist directory.
db = Chroma.from_documents(
    documents=msg_data,
    embedding=embeddings,
    persist_directory="./final_js_rag_single_diff_sim_msg_db",
)