In [None]:
!pip install langchain
!pip install jq
!pip install sentence-transformers
!pip install chromadb

In [1]:
from langchain_community.document_loaders import JSONLoader

In [2]:
# Retrieval corpus: one JSON file, from which three fields are loaded separately.
RAG_DATA_PATH = '../data/rag_data.json'


def _field_loader(field):
    """Build a JSONLoader that extracts `field` from every element of the top-level array."""
    return JSONLoader(
        file_path=RAG_DATA_PATH,
        jq_schema=f'.[].{field}',
        text_content=False)


# The three lists are later indexed with the same position / seq_num, so they are
# presumably aligned record-by-record — TODO confirm every record has all three fields.
diff_loader = _field_loader('diff')
diff_data = diff_loader.load()

msg_loader = _field_loader('msg')
msg_data = msg_loader.load()

language_loader = _field_loader('language')
language_data = language_loader.load()

In [3]:
# Samples are grouped by language in blocks of 1000: samples 1-1000 use the Java splitter, 1001-2000 the C++ splitter, 2001-3000 the C# splitter, 3001-4000 the Python splitter, and 4001-5000 the JavaScript splitter.
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Position i in `languages` and `splitters` refers to the same language;
# `language_dict` maps a language tag to that position.
languages = [
    Language.JAVA,
    Language.CPP,
    Language.CSHARP,
    Language.PYTHON,
    Language.JS,
]
splitters = [
    RecursiveCharacterTextSplitter.from_language(lang, chunk_size=300, chunk_overlap=0)
    for lang in languages
]

language_dict = {
    tag: position
    for position, tag in enumerate(('java', 'cpp', 'csharp', 'python', 'javascript'))
}

In [4]:
from tqdm import tqdm

# Chunk every corpus diff with the splitter that matches its language tag.
diff_split = []
for i, doc in tqdm(enumerate(diff_data), total=len(diff_data), desc="Processing documents"):
    lang_tag = language_data[i].page_content
    splitter = splitters[language_dict[lang_tag]]
    diff_split.extend(splitter.split_documents([doc]))

Processing documents: 100%|██████████| 500000/500000 [01:08<00:00, 7344.55it/s]


In [5]:
# Total number of chunks produced by the language-aware splitting loop.
len(diff_split)

2774108

In [10]:
# Inspect the first chunk (content plus the source / seq_num metadata set by the loader).
diff_split[0]

Document(page_content='diff --git a/butterknife-runtime/src/main/java/butterknife/internal/Utils.java \nppp b/butterknife-runtime/src/main/java/butterknife/internal/Utils.java', metadata={'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\rag_data.json', 'seq_num': 1})

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings

In [5]:
# Sentence-transformers checkpoint used to embed the diffs
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Model options: run the encoder on the GPU ('cuda')
model_kwargs = dict(device='cuda')

# Encoding options: leave the embedding vectors un-normalized
encode_kwargs = dict(normalize_embeddings=False)

# Embedding wrapper that is handed to the vector store
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [6]:
from langchain_community.vectorstores import Chroma

In [7]:
# Open the Chroma store persisted under ./chroma_db, using `embeddings` as its embedding function.
db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

In [9]:

# Embed the unsplit diffs (`diff_data`, one entry per corpus record) and store them in Chroma.
# Re-running this against an already populated ./chroma_db would embed the whole corpus
# again and append duplicate entries, so the index is only built when the store is empty.
if not db.get(limit=1)['ids']:
    db = Chroma.from_documents(diff_data, embeddings, persist_directory="./chroma_db")

In [12]:
# Ask the store to write itself to its persist directory.
# NOTE(review): newer Chroma releases persist automatically when `persist_directory`
# is set — confirm whether this call is still required for the installed version.
db.persist()

In [8]:
# Diffs of the test set: the `diff` field of every element in the JSON array.
test_diff_loader = JSONLoader(
    text_content=False,
    jq_schema='.[].diff',
    file_path='../data/msg_nngen_nmt_codebert_chatgpt.json',
)
test_diff_data = test_diff_loader.load()

In [31]:
def similarity_search(documents, vector_store=None):
    """Return the 0-based index of the indexed diff that best matches a chunked query.

    Every chunk in `documents` is searched against the vector store. The relevance
    scores of all hits are summed per candidate `seq_num`, and the candidate with the
    highest total wins.

    Args:
        documents: Iterable of objects exposing `page_content` (the query chunks).
        vector_store: Optional store exposing `similarity_search_with_relevance_scores`;
            defaults to the notebook-level `db`.

    Returns:
        int: The winning candidate's `seq_num` minus one.

    Raises:
        ValueError: If no chunk produced any search hit (including empty `documents`).
    """
    store = db if vector_store is None else vector_store

    # seq_num -> sum of relevance scores over all query chunks
    aggregate_scores = {}
    for document in documents:
        results = store.similarity_search_with_relevance_scores(document.page_content, score_threshold=0.0)
        for candidate, score in results:
            seq_num = candidate.metadata['seq_num']
            aggregate_scores[seq_num] = aggregate_scores.get(seq_num, 0) + score

    if not aggregate_scores:
        # max() on an empty dict raises a ValueError with no context; keep the type, add the reason.
        raise ValueError("similarity_search: no candidates were retrieved for the given documents")

    best_seq_num = max(aggregate_scores, key=aggregate_scores.get)
    # The loader's seq_num starts at 1 (the first record shows seq_num 1), hence the shift to a list index.
    return best_seq_num - 1

In [32]:
from tqdm import tqdm

In [33]:
# For every test diff: chunk it, then keep the corpus diff with the best aggregate score.
# The splitter is chosen from the sample position (blocks of 1000 per language, in the
# order of `languages`) — assumes the test file is sorted that way; TODO confirm.
similar_diff = []
for i, test_data in tqdm(enumerate(test_diff_data), total=len(test_diff_data), desc="Processing documents"):
    query = test_data.page_content
    documents = splitters[i//1000].create_documents([query])
    best_index = similarity_search(documents)
    similar_diff.append(diff_data[best_index])

Processing documents: 100%|██████████| 5000/5000 [05:02<00:00, 16.53it/s]


In [34]:
# One retrieved corpus diff per test sample.
len(similar_diff)

5000

In [40]:
# Commit message at the same corpus position as the diff retrieved for the first test sample
# (seq_num - 1 converts the loader's sequence number into a list index).
msg_data[similar_diff[0].metadata['seq_num']-1].page_content

'Use an actual Epsilon here'

In [45]:
import json

# One record per test sample: the retrieved diff, the commit message at the same
# corpus position, and the original (query) diff.
data = [
    {
        'sim_msg': msg_data[sim_diff.metadata['seq_num']-1].page_content,
        'sim_diff': sim_diff.page_content,
        'org_diff': test_diff.page_content,
    }
    for sim_diff, test_diff in zip(similar_diff, test_diff_data)
]

In [47]:
# Save the retrieval results as indented JSON, keeping non-ASCII characters unescaped.
serialized = json.dumps(data, ensure_ascii=False, indent=4)
with open('../data/test_data_with_rag.json', 'w', encoding='UTF-8') as f:
    f.write(serialized)

In [48]:
# Wrap the vector store in a retriever with default search settings.
retriever = db.as_retriever()

langchain_core.vectorstores.VectorStoreRetriever

In [50]:
# Manual sanity check: chunk the first test diff with the first splitter (Java).
query =test_diff_data[0].page_content
documents = splitters[0].create_documents([query])

In [51]:
# Retrieve corpus documents for the first chunk of that query.
# NOTE(review): recent LangChain releases deprecate `get_relevant_documents` in favour of
# `invoke` — confirm against the installed version.
docs = retriever.get_relevant_documents(documents[0].page_content)

In [56]:
# Second document returned by the retriever.
docs[1]

Document(page_content='diff --git a/MPChartLib/src/main/java/com/github/mikephil/charting/renderer/BubbleChartRenderer.java \nppp b/MPChartLib/src/main/java/com/github/mikephil/charting/renderer/BubbleChartRenderer.java \nprotected void drawDataSet(Canvas c,IBubbleDataSet dataSet){ \nif(! mViewPortHandler.isInBoundsRight(pointBuffer[0]- shapeHalf)) \nbreak;- final int color=dataSet.getColor(( int)entry.getX());+final int color=dataSet.getColor(j); \nmRenderPaint.setColor(color); \nc.drawCircle(pointBuffer[0], pointBuffer[1], shapeHalf,mRenderPaint); \n', metadata={'seq_num': 27766, 'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\rag_data.json'})

In [57]:
# Same chunk through the scored search API; element [1] is a (Document, relevance score) pair.
db.similarity_search_with_relevance_scores(documents[0].page_content)[1]

(Document(page_content='diff --git a/MPChartLib/src/main/java/com/github/mikephil/charting/renderer/BubbleChartRenderer.java \nppp b/MPChartLib/src/main/java/com/github/mikephil/charting/renderer/BubbleChartRenderer.java \nprotected void drawDataSet(Canvas c,IBubbleDataSet dataSet){ \nif(! mViewPortHandler.isInBoundsRight(pointBuffer[0]- shapeHalf)) \nbreak;- final int color=dataSet.getColor(( int)entry.getX());+final int color=dataSet.getColor(j); \nmRenderPaint.setColor(color); \nc.drawCircle(pointBuffer[0], pointBuffer[1], shapeHalf,mRenderPaint); \n', metadata={'seq_num': 27766, 'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\rag_data.json'}),
 0.736459085862412)

In [72]:
from langchain import hub

# Fetch the prompt template from the LangChain Hub; later cells invoke it with
# "context" and "question" inputs. Requires network access.
prompt = hub.pull("tyfann/llm4commit-rag")

In [73]:

# NOTE(review): `example_messages` is not assigned anywhere in the visible notebook, so on a
# fresh kernel this cell would raise NameError — restore the cell that built it or drop this one.
example_messages

[HumanMessage(content='You are an assistant for commit message generation tasks. Use the following pieces of retrieved-context, learn how humans write commit messages and generate the commit message for the question. Keep the answer within 30 words.\nQuestion: diff --git a/sharding-jdbc-orchestration/src/main/java/io/shardingjdbc/orchestration/yaml/YamlOrchestrationConfiguration.java  b/sharding-jdbc-orchestration/src/main/java/io/shardingjdbc/orchestration/yaml/YamlOrchestrationConfiguration.java*@ return orchestration master-slave rule configuration from yaml*/ \npublic OrchestrationConfiguration getOrchestrationConfiguration(){+ if(null!= etcd&& null!= zookeeper){+throw new RuntimeException(" Can\'t config both zookeeper and etcd as registry center!");+} \nreturn new OrchestrationConfiguration(getName(),null!= etcd?etcd:zookeeper,overwrite);}} \n\nContext: diff --git a/sharding-jdbc-orchestration/src/main/java/io/shardingsphere/jdbc/orchestration/internal/OrchestrationProxyConfigura

In [81]:
import os

from openai import OpenAI

# Credentials must not be stored in the notebook: the key is read from the
# OPENAI_API_KEY environment variable, and a missing variable fails here (KeyError)
# rather than at the first request.
# NOTE(review): any key that was ever committed with this notebook should be revoked.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url="https://api.chatanywhere.tech/v1"
    # base_url="https://api.chatanywhere.cn/v1"
)

In [83]:
def gpt_35_api(messages: list, model: str = "gpt-3.5-turbo-0125", temperature: float = 0.5):
    """Create a new reply for the provided conversation messages.

    Args:
        messages (list): The complete conversation, as a list of chat message dicts
            (e.g. ``{'role': 'user', 'content': ...}``).
        model (str): Chat model identifier forwarded to the API.
        temperature (float): Sampling temperature forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    completion = client.chat.completions.create(model=model, messages=messages, temperature=temperature)
    return completion.choices[0].message.content

In [84]:
# Generate one commit message per test diff, using the retrieved diff and the commit
# message at the same corpus position as the in-context example.
gpt_msg = []
for sim_diff, test_diff in tqdm(zip(similar_diff, test_diff_data), total=len(test_diff_data), desc="Processing documents"):
    sim_msg = msg_data[sim_diff.metadata['seq_num']-1].page_content
    context = sim_diff.page_content + "\nCommit message:" + sim_msg
    messages = prompt.invoke({"context": context, "question": test_diff.page_content}).to_messages()
    # Only the first rendered message is sent, as a single user turn.
    example_prompt = [{'role': 'user', 'content': messages[0].content}]
    gpt_msg.append(gpt_35_api(example_prompt))

Processing documents: 100%|██████████| 5000/5000 [1:46:31<00:00,  1.28s/it]  


In [87]:
# Attach the RAG-generated messages to the original test records and save them to a new file.
with open('../data/msg_nngen_nmt_codebert_chatgpt.json', 'r', encoding='UTF-8') as f:
    org_data = json.load(f)

# zip() silently stops at the shorter input, which would leave trailing records without
# a 'chatgpt_rag' field; fail loudly on a mismatch instead.
if len(org_data) != len(gpt_msg):
    raise ValueError(
        f"expected one generated message per record, got {len(gpt_msg)} messages for {len(org_data)} records")

for item, msg in zip(org_data, gpt_msg):
    item['chatgpt_rag'] = msg

output_file = '../data/msg_nngen_nmt_codebert_chatgpt_rag.json'
with open(output_file, 'w', encoding='UTF-8') as f:
    json.dump(org_data, f, ensure_ascii=False, indent=4)

In [88]:
# Number of generated messages (one per processed test diff).
len(gpt_msg)

5000