In [5]:
import json

import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS

In [6]:
# Retrieval corpus: the jq expression selects the `diff` field of each element of the
# top-level JSON array, and the loader turns each selected value into a document.
rag_db_json = '../data/final_preprocessed_data/js_rag_db_data_300.json'
diff_loader = JSONLoader(
    file_path=rag_db_json,
    jq_schema='.[].diff',
    text_content=False,
)
diff_data = diff_loader.load()

In [3]:
# Hugging Face model id of the pre-trained sentence-embedding model.
modelPath = "mixedbread-ai/mxbai-embed-large-v1"

# Model options. Run on the GPU when CUDA is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
# NOTE: trust_remote_code=True allows code shipped with the model repository to be
# executed when the model is loaded; keep it only for models you trust.
model_kwargs = {
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'trust_remote_code': True,
}

# Encoding options: normalize the embeddings (normalize_embeddings is True).
encode_kwargs = {'normalize_embeddings': True}

# Embedding wrapper shared by the index build, the index reload and the queries.
embeddings = HuggingFaceEmbeddings(
    cache_folder='../models',     # Local directory passed as the model cache folder
    model_name=modelPath,         # Pre-trained model id
    model_kwargs=model_kwargs,    # Model configuration options
    encode_kwargs=encode_kwargs,  # Encoding options
)

In [7]:
# Embed the corpus documents and build a FAISS vector store from them.
db = FAISS.from_documents(documents=diff_data, embedding=embeddings)

In [8]:
# Write the vector store to a local folder; a later cell reloads it from this path.
db.save_local(folder_path='./faiss_final_js_rag_single_diff_db_300')

In [10]:
# Reload the vector store from the folder it was saved to, using the same embedding model.
db = FAISS.load_local(
    folder_path='./faiss_final_js_rag_single_diff_db_300',
    embeddings=embeddings,
    # SECURITY: this opt-in enables pickle deserialization of the stored data.
    # Only load index folders produced by this notebook or another trusted source.
    allow_dangerous_deserialization=True,
)

In [13]:
# Test data: read the same JSON file twice, once as documents built from each `diff`
# field and once as the raw records, keeping the same number of leading entries in both.
test_json = '../data/final_preprocessed_data/js_baseline_test_data_300.json'
max_test_items = 1000

test_diff_loader = JSONLoader(
    file_path=test_json,
    jq_schema='.[].diff',
    text_content=False,
)
test_diff_data = test_diff_loader.load()[:max_test_items]

with open(test_json, 'r', encoding='UTF-8') as f:
    test_data = json.load(f)[:max_test_items]

In [15]:
# Query the vector store with the text of the first test diff; the cell displays the
# returned (document, relevance score) pairs.
query_diff = test_diff_data[0].page_content
db.similarity_search_with_relevance_scores(query_diff)

[(Document(page_content='diff --git a/protocols/delegate/test/Delegate-unit.js b/protocols/delegate/test/Delegate-unit.js @@ -181,7 +181,8 @@ contract(\'Delegate Unit Tests\', async accounts => {\nPRICE_COEF,\nEXP,\n{ from: notOwner }\n- )\n+ ),\n+ "Ownable: caller is not the owner"\n)\n})\n', metadata={'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\final_preprocessed_data\\js_rag_db_data_300.json', 'seq_num': 1960}),
  0.8124383897847589),
 (Document(page_content="diff --git a/tests/unit/utils.test.js b/tests/unit/utils.test.js @@ -66,7 +66,7 @@ suite('utils', function() {\nvar owner = 'test';\nvar entity = {\ncomponents: {\n- 'networked-remote' : { data: { owner: owner}}\n+ 'networked' : { data: { owner: owner}}\n}\n};\n", metadata={'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\final_preprocessed_data\\js_rag_db_data_300.json', 'seq_num': 8875}),
  0.8062819574759241),
 (Document(page_content="diff --git a/examples/peer/test/Peer.js b/examples/peer/test/Pee