In [1]:
!pip install langchain
!pip install jq
!pip install sentence-transformers

In [1]:
import json
import pickle

import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import Chroma
from tqdm.auto import tqdm


In [3]:
# Load `msg_data` from the pickled file under ../data/chronicle.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
# NOTE(review): a later cell rebinds `msg_data` from a JSONLoader; confirm which
# of the two layouts each downstream cell expects.
with open("../data/chronicle/rag_msg.pkl", mode="rb") as pkl_file:
    msg_data = pickle.load(pkl_file)

In [2]:
# Load the `diff` field of every record in the corpus JSON file.
diff_loader = JSONLoader(
    jq_schema=".[].diff",
    file_path="../data/final_preprocessed_data/js_rag_db_data_300.json",
    text_content=False,
)
diff_data = diff_loader.load()

In [3]:
# Load the `msg` field of every record from the same corpus JSON file that the
# diffs are read from, so `msg_data` and `diff_data` come from the same records.
msg_loader = JSONLoader(
    jq_schema=".[].msg",
    file_path="../data/final_preprocessed_data/js_rag_db_data_300.json",
    text_content=False,
)
msg_data = msg_loader.load()

In [4]:

# Load the `lang` field of every record in the chronicle multi-diff JSON file.
# NOTE(review): this file differs from the one `diff_data` is read from, yet later
# cells index `language_data` and `diff_data` in parallel — confirm they align.
language_loader = JSONLoader(
    jq_schema=".[].lang",
    file_path="../data/chronicle/chronicle_multi_diff_db_data.json",
    text_content=False,
)
language_data = language_loader.load()

In [16]:
# One language-aware splitter per supported language, all with the same chunk
# size and overlap. `language_dict` maps the language label stored in the data to
# the position of the matching splitter in `splitters`.
# NOTE(review): the previous comment described fixed 1000-record ranges per
# language (java, cpp, csharp, python, javascript); the code selects splitters by
# label instead — confirm that description is obsolete. The earlier five-language
# variants are kept commented out for reference.

# languages = [Language.JAVA, Language.CPP, Language.CSHARP, Language.PYTHON, Language.JS]
languages = [Language.PYTHON, Language.JS]

splitters = []
for language in languages:
    splitters.append(
        RecursiveCharacterTextSplitter.from_language(language, chunk_size=500, chunk_overlap=50)
    )

# language_dict = {'java': 0, 'c++': 1, 'c#': 2, 'python': 3, 'javascript': 4}
language_dict = {'Python': 0, 'JavaScript': 1}

In [4]:
def similarity_search(documents, db):
    """Return the `seq_num` whose retrieved chunks score highest in aggregate.

    Each entry of `documents` is used as a query (via its `page_content`) against
    `db.similarity_search_with_relevance_scores` with `score_threshold=0.0`. The
    relevance scores of all hits are summed per `metadata['seq_num']`.

    Args:
        documents: iterable of objects exposing `page_content`.
        db: object exposing `similarity_search_with_relevance_scores`, returning
            `(document, score)` pairs.

    Returns:
        The `seq_num` with the largest summed score (the first one seen wins a
        tie), or -1 when no query produced any hit.
    """
    totals = {}

    for chunk in documents:
        hits = db.similarity_search_with_relevance_scores(chunk.page_content, score_threshold=0.0)
        for hit_doc, relevance in hits:
            seq_num = hit_doc.metadata['seq_num']
            totals[seq_num] = totals.get(seq_num, 0) + relevance

    # Nothing retrieved at all: signal it with the -1 sentinel.
    if not totals:
        return -1

    return max(totals, key=totals.get)

In [21]:
# Collect the splitter chunks of every corpus diff whose language label is 'Python'.
# `language_data` is read at the same position as the diff being processed.
diff_split = []
progress = tqdm(enumerate(diff_data), total=len(diff_data), desc="Processing documents")
for i, doc in progress:
    lang_label = language_data[i].page_content
    if lang_label != 'Python':
        continue
    chunker = splitters[language_dict[lang_label]]
    diff_split.extend(chunker.split_documents([doc]))

In [2]:
# Identifier of the pre-trained embedding model.
modelPath = "mixedbread-ai/mxbai-embed-large-v1"

# Model options: run on the GPU when CUDA is available and fall back to the CPU
# otherwise, so the cell does not fail on machines without a GPU.
# `trust_remote_code` is passed through unchanged.
model_kwargs = {
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'trust_remote_code': True,
}

# Encoding options: embeddings are normalized (`normalize_embeddings` is True).
encode_kwargs = {'normalize_embeddings': True}

# Embedding function shared by every vector store in this notebook; model files
# are cached under ../models.
embeddings = HuggingFaceEmbeddings(
    cache_folder='../models',
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [35]:
# Embed the text of the first corpus diff.
# NOTE(review): `res` is not used by any other visible cell — presumably a smoke test.
first_diff_text = diff_data[0].page_content
res = embeddings.embed_query(first_diff_text)

In [24]:
# Embed every chunk currently in `diff_split` and store the vectors in a Chroma
# collection persisted under the given directory.
# NOTE(review): re-running this against an existing directory may add the same
# chunks again — confirm before re-executing.
db = Chroma.from_documents(
    documents=diff_split,
    embedding=embeddings,
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_python_multi_diff",
)

In [25]:
# Collect the splitter chunks of every corpus diff labelled 'JavaScript', then
# embed them into a Chroma collection persisted under the JS multi-diff directory.
diff_split = []
progress = tqdm(enumerate(diff_data), total=len(diff_data), desc="Processing documents")
for i, doc in progress:
    lang_label = language_data[i].page_content
    if lang_label != 'JavaScript':
        continue
    chunker = splitters[language_dict[lang_label]]
    diff_split.extend(chunker.split_documents([doc]))

db = Chroma.from_documents(
    documents=diff_split,
    embedding=embeddings,
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_js_multi_diff",
)

In [8]:
# Read the filtered django multi-diff scenario records into `test_data`.
with open(
    '../data/scenario/filtered_data/project_based_django_multi_diff_filtered.json',
    mode='r',
    encoding='UTF-8',
) as scenario_file:
    test_data = json.load(scenario_file)

In [24]:
# Write `test_data` back as indented JSON, keeping non-ASCII characters unescaped.
# NOTE(review): this overwrites the same file the previous cell reads — confirm
# that rewriting the input in place is intended.
with open(
    '../data/scenario/filtered_data/project_based_django_multi_diff_filtered.json',
    mode='w',
    encoding='UTF-8',
) as scenario_file:
    json.dump(test_data, scenario_file, ensure_ascii=False, indent=4)

In [3]:
# Test split, limited to the first 1000 entries in two parallel forms read from
# the same file: `test_diff_data` holds the loaded `diff` fields and `test_data`
# holds the raw JSON records.
test_diff_loader = JSONLoader(
    jq_schema=".[].diff",
    file_path="../data/final_preprocessed_data/js_baseline_test_data_300.json",
    text_content=False,
)
test_diff_data = test_diff_loader.load()[:1000]

with open(
    '../data/final_preprocessed_data/js_baseline_test_data_300.json',
    mode='r',
    encoding='UTF-8',
) as test_file:
    test_data = json.load(test_file)
test_data = test_data[:1000]

# Python part

In [4]:
# Open the Chroma store persisted under ./final_js_rag_single_diff_db_300,
# using `embeddings` as its embedding function.
db = Chroma(
    embedding_function=embeddings,
    persist_directory="./final_js_rag_single_diff_db_300",
)

In [8]:
# Display the text of the first test diff.
test_diff_data[0].page_content

"diff --git a/test/Ownable.js b/test/Ownable.js @@ -38,6 +38,7 @@ contract('Ownable', function(accounts) {\nlet originalOwner = await ownable.owner();\ntry {\nawait ownable.transferOwnership(null, {from: originalOwner});\n+ assert.fail()\n} catch(error) {\nassertJump(error);\n}\n"

In [7]:
# Query the store with the text of the third test diff and keep the scored matches.
probe_text = test_diff_data[2].page_content
result = db.similarity_search_with_relevance_scores(probe_text)

In [8]:
# Display the (document, relevance score) pairs stored in `result`.
result

[(Document(page_content='diff --git a/scripts/release.js b/scripts/release.js @@ -45,6 +45,7 @@ program\nwriteChangelog(program.changelog, repos)\n.then(() => repos);\n}\n+ return repos;\n})\n.then((repos) => {\nif (program.release && repos.length) {\n', metadata={'seq_num': 995, 'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\final_preprocessed_data\\js_rag_db_data_300.json'}),
  0.8419218804857668),
 (Document(page_content='diff --git a/build/build.js b/build/build.js @@ -5,7 +5,7 @@ const fs = require("fs");\nconst path = require("path");\nconst pkg = require("../package.json");\nconst cwd = process.cwd();\n-const ENV_RE = /__ENV__/g;\n+const ENV_RE = /process\\.env\\.MOON_ENV/g;\nconst comment = `/**\n* Moon v${pkg.version}\n', metadata={'seq_num': 10483, 'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\final_preprocessed_data\\js_rag_db_data_300.json'}),
  0.8402498728584911),
 (Document(page_content='diff --git a/scripts/release/release.js b/scripts/release

In [12]:
# For every test diff, keep the first document returned by the store's default retriever.
retriever = db.as_retriever()
similar_diff = []
for query_doc in tqdm(test_diff_data, total=len(test_diff_data), desc="Processing documents"):
    first_hit = retriever.invoke(query_doc.page_content)[0]
    similar_diff.append(first_hit)

Processing documents:   0%|          | 0/1000 [00:00<?, ?it/s]

In [14]:
# Display the first retrieved document.
similar_diff[0]

Document(page_content='diff --git a/protocols/delegate/test/Delegate-unit.js b/protocols/delegate/test/Delegate-unit.js @@ -181,7 +181,8 @@ contract(\'Delegate Unit Tests\', async accounts => {\nPRICE_COEF,\nEXP,\n{ from: notOwner }\n- )\n+ ),\n+ "Ownable: caller is not the owner"\n)\n})\n', metadata={'seq_num': 1960, 'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\final_preprocessed_data\\js_rag_db_data_300.json'})

In [15]:

# Pair every test record with the message and diff of its retrieved neighbour.
# The neighbour's `seq_num` minus one is used as the position in both `msg_data`
# and `diff_data` (seq_num appears to be 1-based — TODO confirm against JSONLoader).
data = []
for sim_diff, test_diff in zip(similar_diff, test_data):
    corpus_idx = sim_diff.metadata['seq_num'] - 1
    data.append({
        'org_diff': test_diff['diff'],
        'org_msg': test_diff['msg'],
        'sim_msg': msg_data[corpus_idx].page_content,
        'sim_diff': diff_data[corpus_idx].page_content,
    })

# Save the prompt records as indented JSON with non-ASCII characters unescaped.
with open(
    '../data/final_preprocessed_data/js_baseline/js_baseline_rag_300_prompt.json',
    mode='w',
    encoding='UTF-8',
) as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)

In [25]:
# Retrieve, for each of the first 100 test diffs, the corpus entry whose chunks
# score highest in aggregate against the JS multi-diff store, then write the
# paired records to disk.
db = Chroma(persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_js_multi_diff", embedding_function=embeddings)
similar_diff = []
for diff_doc in tqdm(test_diff_data[:100], total=len(test_diff_data[:100]), desc="Processing documents"):
    # Split the query diff with the JavaScript splitter (index 1 in `splitters`).
    documents = splitters[1].split_documents([diff_doc])
    similar_diff.append(similarity_search(documents, db))

data = []

# NOTE(review): queries come from test_diff_data[:100] but are paired with
# test_data[1000:2000], and the record key here is 'diffs' rather than 'diff' —
# confirm which test set this cell is meant to run against.
# NOTE(review): similarity_search returns -1 when nothing is retrieved; that
# value is not handled here and would index from the end of the lists.
for sim_diff, test_diff in zip(similar_diff, test_data[1000:2000]):
    # `sim_diff` is a seq_num. Message and diff must be read at the same
    # position; previously the message was read one entry further
    # (msg_data[sim_diff]) than the diff (diff_data[sim_diff-1]).
    corpus_idx = sim_diff - 1
    item = {'org_diff': test_diff['diffs'],
            'org_msg': test_diff['msg'],
            'sim_msg': msg_data[corpus_idx].page_content,
            'sim_diff': diff_data[corpus_idx].page_content}
    data.append(item)

# Write the data to a JSON file
with open('../data/chronicle/multi-diff/multi_diff_rag_js_prompt.json', 'w', encoding='UTF-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# Build the Python-only store from a freshly filtered split. Previously this cell
# embedded whatever `diff_split` last held, which in notebook order is the
# JavaScript split, so the "python" store did not depend on Python diffs at all.
python_diff_split = []
for i, doc in tqdm(enumerate(diff_data), total=len(diff_data), desc="Processing documents"):
    if language_data[i].page_content == 'Python':
        python_diff_split += splitters[language_dict['Python']].split_documents([doc])

# Embed each chunk and persist the collection under the Python store directory.
db = Chroma.from_documents(python_diff_split, embeddings, persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_python")

In [23]:
# Select the test records labelled 'Python' and remember their positions in `test_data`.
python_indices = [idx for idx, record in enumerate(test_data) if record['lang'] == 'Python']
python_test_data = [test_data[idx] for idx in python_indices]

In [24]:
# For each Python test diff: split it with the first splitter, query the Python
# store chunk by chunk, and keep the id with the best aggregate score.
db = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_python",
)
python_splitter = splitters[0]
similar_diff = []
for index in tqdm(python_indices, total=len(python_indices), desc="Processing documents"):
    documents = python_splitter.split_documents([test_diff_data[index]])
    similar_diff.append(similarity_search(documents, db))

# Pair each Python test record with the retrieved message and diff.
# NOTE(review): `msg_data` is indexed with the raw id and stored as-is, while
# `diff_data` uses id - 1 and `.page_content` — confirm which `msg_data` layout
# (pickle or JSONLoader) this cell expects.
data = [
    {
        'org_diff': test_diff['diff'],
        'org_msg': test_diff['msg'],
        'sim_msg': msg_data[sim_diff],
        'sim_diff': diff_data[sim_diff-1].page_content,
    }
    for sim_diff, test_diff in zip(similar_diff, python_test_data)
]

# Save the records as indented JSON with non-ASCII characters unescaped.
with open('../data/chronicle/rag_baseline/rag_baseline_python.json', mode='w', encoding='UTF-8') as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)

# JavaScript part

In [25]:
# Select the test records labelled 'JavaScript' and remember their positions in `test_data`.
js_indices = [idx for idx, record in enumerate(test_data) if record['lang'] == 'JavaScript']
js_test_data = [test_data[idx] for idx in js_indices]

In [1]:
# For each JavaScript test diff: split it with the second splitter, query the JS
# store chunk by chunk, and keep the id with the best aggregate score.
db = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_js",
)
js_splitter = splitters[1]
similar_diff = []
for index in tqdm(js_indices, total=len(js_indices), desc="Processing documents"):
    documents = js_splitter.split_documents([test_diff_data[index]])
    similar_diff.append(similarity_search(documents, db))

# Pair each JavaScript test record with the retrieved message and diff.
# NOTE(review): `msg_data` is indexed with the raw id and stored as-is, while
# `diff_data` uses id - 1 and `.page_content` — confirm which `msg_data` layout
# (pickle or JSONLoader) this cell expects.
data = [
    {
        'org_diff': test_diff['diff'],
        'org_msg': test_diff['msg'],
        'sim_msg': msg_data[sim_diff],
        'sim_diff': diff_data[sim_diff-1].page_content,
    }
    for sim_diff, test_diff in zip(similar_diff, js_test_data)
]

# Save the records as indented JSON with non-ASCII characters unescaped.
with open('../data/chronicle/rag_baseline/rag_baseline_js.json', mode='w', encoding='UTF-8') as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)

In [16]:
# Collect the splitter chunks of every corpus diff labelled 'JavaScript', then
# embed them into a Chroma collection persisted under the JS store directory.
diff_split = []
progress = tqdm(enumerate(diff_data), total=len(diff_data), desc="Processing documents")
for i, doc in progress:
    lang_label = language_data[i].page_content
    if lang_label != 'JavaScript':
        continue
    chunker = splitters[language_dict[lang_label]]
    diff_split.extend(chunker.split_documents([doc]))

db = Chroma.from_documents(
    documents=diff_split,
    embedding=embeddings,
    persist_directory="./chroma_chronicle_db_mxbai_500chunk_normalized_js",
)

In [56]:
# For each of the first 100 test diffs, look up the corpus diff behind the first
# document returned by `retriever`.
similar_diff = []
# The loop variable used to be named `test_data`, which overwrote the global list
# of test records with a single document; it also carried an unused enumerate index.
for query_doc in tqdm(test_diff_data[:100], total=len(test_diff_data[:100]), desc="Processing documents"):
    query = query_doc.page_content
    similar_diff.append(diff_data[retriever.get_relevant_documents(query)[0].metadata['seq_num']-1])

data = []

# NOTE(review): `msg_data` is indexed with seq_num while `diff_data` above uses
# seq_num - 1, and the entry is stored as-is — confirm the `msg_data` layout this
# cell expects and that its entries are JSON-serializable.
for sim_diff, test_diff in zip(similar_diff, test_diff_data[:100]):
    item = {
        'sim_msg': msg_data[sim_diff.metadata['seq_num']],
        'sim_diff': sim_diff.page_content,
        'org_diff': test_diff.page_content
    }
    data.append(item)

# Write the data to a JSON file
with open('../data/chronicle/data_with_similar_diff_codebert.json', 'w', encoding='UTF-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [57]:
# For each of the first 100 test diffs, look up the corpus diff behind the first
# document returned by `retriever_2`.
# NOTE(review): `retriever_2` is not defined in the visible part of the notebook —
# confirm where it is created.
similar_diff = []
# The loop variable used to be named `test_data`, which overwrote the global list
# of test records with a single document; it also carried an unused enumerate index.
for query_doc in tqdm(test_diff_data[:100], total=len(test_diff_data[:100]), desc="Processing documents"):
    query = query_doc.page_content
    similar_diff.append(diff_data[retriever_2.get_relevant_documents(query)[0].metadata['seq_num']-1])

data = []

# NOTE(review): `msg_data` is indexed with seq_num while `diff_data` above uses
# seq_num - 1, and the entry is stored as-is — confirm the `msg_data` layout this
# cell expects and that its entries are JSON-serializable.
for sim_diff, test_diff in zip(similar_diff, test_diff_data[:100]):
    item = {
        'sim_msg': msg_data[sim_diff.metadata['seq_num']],
        'sim_diff': sim_diff.page_content,
        'org_diff': test_diff.page_content
    }
    data.append(item)

# Write the data to a JSON file
with open('../data/chronicle/data_with_similar_diff_mxbai.json', 'w', encoding='UTF-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [48]:
# Pair every retrieved diff with the test diff it was retrieved for.
# NOTE(review): `msg_data` is indexed with the retrieved document's seq_num and
# stored as-is — confirm the `msg_data` layout this cell expects.
data = [
    {
        'sim_msg': msg_data[sim_diff.metadata['seq_num']],
        'sim_diff': sim_diff.page_content,
        'org_diff': test_diff.page_content,
    }
    for sim_diff, test_diff in zip(similar_diff, test_diff_data)
]

In [49]:
# Save `data` as indented JSON, keeping non-ASCII characters unescaped.
# NOTE(review): an earlier cell writes to this same path — confirm which output should win.
with open('../data/chronicle/data_with_similar_diff_codebert.json', mode='w', encoding='UTF-8') as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)