# Angular format data

## Build vanilla RAG system

In [5]:
from langchain_community.document_loaders import JSONLoader
from tqdm.auto import tqdm
import json
import pickle
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import glob
import os
from collections import defaultdict

In [6]:
# Embedding model used for retrieval. The commented-out assignments are
# alternative checkpoints kept for reference; only one `modelPath` is active.
# modelPath = "mixedbread-ai/mxbai-embed-large-v1"
# modelPath = "mchochlov/codebert-base-cd-ft"
# modelPath= "microsoft/unixcoder-base"
# modelPath ="codecompletedeployment/st-codesearch-distilroberta-base"
modelPath = "intfloat/e5-small-v2"
# modelPath = "sentence-transformers/all-MiniLM-L6-v2"
# modelPath = "../models/models-e5-v2-finetuned-10w-epoch20"

# for modelPath in ["mixedbread-ai/mxbai-embed-large-v1", "intfloat/e5-small-v2", "sentence-transformers/all-MiniLM-L6-v2"]:
# Model options: target the 'cuda' device and enable `trust_remote_code`.
# NOTE(review): this cell presumably fails on a machine without a GPU - confirm.
model_kwargs = {'device':'cuda', 'trust_remote_code': True}

# Encoding options: 'normalize_embeddings' is set to True.
encode_kwargs = {'normalize_embeddings': True}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    cache_folder = '../models',   # Folder passed as the model cache location
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [3]:
# Both loaders read the same retrieval-database file; they differ only in the
# field extracted from each entry ('.diff' vs '.msg').
db_json_path = '../data/angular_filtered/subsets/db_data.json'

diff_loader = JSONLoader(file_path=db_json_path, jq_schema='.[].diff', text_content=False)
msg_loader = JSONLoader(file_path=db_json_path, jq_schema='.[].msg', text_content=False)

# Load the diffs first, then the messages, as two separate document lists.
diff_data = diff_loader.load()
msg_data = msg_loader.load()

In [4]:
# Build the vector store only once. Calling `Chroma.from_documents` against an
# already-populated persist directory embeds and adds every document again, so
# re-running this cell would be slow and would duplicate entries. Reuse the
# persisted store when it exists (same guard as the per-type stores below).
# Delete the directory to force a rebuild after db_data.json changes.
vanilla_db_dir = "../data/angular_filtered/subsets/type_db/rag_all_types_db_e5_nochunk"
if os.path.exists(vanilla_db_dir):
    db = Chroma(persist_directory=vanilla_db_dir, embedding_function=embeddings)
else:
    db = Chroma.from_documents(diff_data, embeddings, persist_directory=vanilla_db_dir)

In [5]:
# The test split is read twice from the same file: once as raw JSON records
# (`test_data`) and once as documents holding only the '.diff' field
# (`test_diff_data`).
test_json_path = '../data/angular_filtered/subsets/test_data.json'

with open(test_json_path, 'r', encoding='UTF-8') as f:
    test_data = json.load(f)

test_diff_loader = JSONLoader(file_path=test_json_path, jq_schema='.[].diff', text_content=False)
test_diff_data = test_diff_loader.load()

In [6]:
# For every test diff keep only the first document returned by the retriever.
retriever = db.as_retriever()
similar_diff = [
    retriever.invoke(diff_doc.page_content)[0]
    for diff_doc in tqdm(test_diff_data, total=len(test_diff_data), desc="Processing documents")
]

# The retrieved document's 'seq_num' metadata is used as a 1-based position
# into msg_data / diff_data, hence the -1 when indexing.
for sim_diff, test_item in zip(similar_diff, test_data):
    hit_position = sim_diff.metadata['seq_num'] - 1
    test_item['sim_msg'] = msg_data[hit_position].page_content
    test_item['sim_diff'] = diff_data[hit_position].page_content

# Persist the test records, now carrying 'sim_msg' and 'sim_diff'.
rag_prompt_path = '../data/angular_filtered/subsets/generation/rag/test_rag_prompt.json'
with open(rag_prompt_path, 'w', encoding='UTF-8') as f:
    json.dump(test_data, f, ensure_ascii=False, indent=4)

Processing documents:   0%|          | 0/1831 [00:00<?, ?it/s]

In [7]:
from openai import OpenAI

# Never commit credentials: the API key is read from the OPENAI_API_KEY
# environment variable (a KeyError here means it is not set). The key that was
# previously hardcoded in this cell must be treated as leaked and revoked.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url="https://api.chatanywhere.cn/v1"
)

def gpt_35_api(messages: list):
    """Send ``messages`` to the chat-completions endpoint and return the reply text.

    Uses the module-level ``client`` with model "gpt-3.5-turbo-0125" and
    temperature 0, and returns the content of the first choice.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=messages,
        temperature=0,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

import json
# NOTE: this rebinds `tqdm`, which the first cell imported from `tqdm.auto`.
from tqdm import tqdm
from langchain import hub
# Fetch the prompt template from the LangChain hub; the ':b843ef0b' suffix
# presumably pins a specific revision of the prompt - TODO confirm.
prompt = hub.pull("tyfann/llm4commit-rag:b843ef0b")


In [8]:
# Load the test records enriched with the retrieved example ('sim_diff'/'sim_msg').
with open('../data/angular_filtered/subsets/generation/rag/test_rag_prompt.json', 'r', encoding='UTF-8') as f:
    org_data = json.load(f)

gpt_msg = []
for index, data in tqdm(enumerate(org_data), total=len(org_data), desc="Processing documents"):
    # Fill the prompt template with the retrieved example and the diff to describe.
    messages = prompt.invoke(
        {"context": data['sim_diff'], "msg": data['sim_msg'], "diff": data['diff']}
    ).to_messages()
    example_prompt = [{'role': 'user', 'content': messages[0].content}]
    try:
        # The API may return no content; store '' so the later .replace() is safe.
        gpt_msg.append(gpt_35_api(example_prompt) or '')
    except Exception as exc:
        # Best effort: record an empty message and keep going, but report why.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
        print(f'{index}: {exc!r}')
        gpt_msg.append('')

for item, msg in zip(org_data, gpt_msg):
    item['chatgpt_rag'] = msg

# Full records as JSON.
output_file = '../data/angular_filtered/subsets/generation/test_gpt35_rag.json'
with open(output_file, 'w', encoding='UTF-8') as f:
    json.dump(org_data, f, ensure_ascii=False, indent=4)

# One generated message per line; embedded newlines are escaped so that
# line N always corresponds to test item N.
output_file = '../data/angular_filtered/subsets/generation/test_gpt35_rag.txt'
with open(output_file, 'w', encoding='UTF-8') as f:
    for item in org_data:
        f.write(item['chatgpt_rag'].replace('\n', '\\n').replace('\r', '\\r') + '\n')

Processing documents: 100%|██████████| 1831/1831 [23:27<00:00,  1.30it/s]


## Build classified RAG system

In [9]:
# Commit types; a type's position in this list is its integer class label.
types = [
    'build', 'ci', 'docs', 'feat', 'fix',
    'perf', 'refactor', 'style', 'test', 'chore'
]
type_label_mapping = dict(zip(types, range(len(types))))
output_dir = '../data/angular_filtered/subsets/classification'

# Write the mapping as one "<type>: <label>" line per type.
mapping_path = os.path.join(output_dir, 'type_label_mapping.txt')
with open(mapping_path, 'w') as mapping_file:
    mapping_file.writelines(
        f'{type_name}: {label}\n' for type_name, label in type_label_mapping.items()
    )

In [10]:
import json
import random

# Read the mapping.txt file
def read_mapping(file_path):
    """Parse a mapping file into a ``{type_name: label}`` dict.

    Each non-empty line must look like ``"<type_name>: <int label>"``.
    Blank lines (e.g. a trailing newline added by an editor) are skipped
    instead of raising an unpacking error. The file is read as UTF-8.

    Raises:
        ValueError: if a non-empty line is malformed or the label is not an int.
    """
    mapping = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            key, value = line.split(': ')
            mapping[key] = int(value)
    return mapping

# Read the JSON file
def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded explicitly as UTF-8 (the encoding the rest of the
    notebook uses for these data files) rather than the platform default,
    which is not UTF-8 on every system.
    """
    with open(file_path, 'r', encoding='UTF-8') as f:
        data = json.load(f)
    return data

# Write the JSONL file
def write_jsonl(file_path, data):
    """Write every item of ``data`` to ``file_path`` as one JSON document per line."""
    with open(file_path, 'w') as out:
        for record in data:
            out.write(json.dumps(record))
            out.write('\n')

# Main function
def main(mapping_file, json_file, output_file):
    """Convert a JSON list of commits into classifier input in JSONL form.

    Reads the type -> label mapping and the JSON records, then writes one
    ``{"code": <diff>, "label": <int>}`` object per record to ``output_file``.
    Record order is preserved (no shuffling).
    """
    label_of = read_mapping(mapping_file)
    records = read_json(json_file)

    # Keep only the diff text and the integer label for each record.
    converted = [
        {"code": record["diff"], "label": label_of[record["type"]]}
        for record in records
    ]

    write_jsonl(output_file, converted)

# Invoke the main function: convert the test split into the JSONL file
# written under the CodeBERT-classification dataset folder.
mapping_file = '../data/angular_filtered/subsets/classification/type_label_mapping.txt'
json_file = '../data/angular_filtered/subsets/test_data.json'
output_file = '../../CodeBERT-classification/dataset/test_angular.jsonl'
main(mapping_file, json_file, output_file)

In [11]:
import json
import glob
import os
from sklearn.model_selection import train_test_split
import random
random.seed(42)

# Directory that receives the train/valid JSONL files
output_dir = '../../CodeBERT-classification/dataset'

# List of types and their corresponding labels
types = [
    'build', 'ci', 'docs', 'feat', 'fix', 
    'perf', 'refactor', 'style', 'test', 'chore'
]
type_label_mapping = {type_name: idx for idx, type_name in enumerate(types)}

train_items = []
valid_items = []

# Read the per-type database files (JavaScript_<type>_db.json) and split each
# type separately, so every type appears in both splits in the same proportion.
for type_name in types:
    test_file = os.path.join('../data/angular_filtered/subsets/type_db', f'JavaScript_{type_name}_db.json')
    if not os.path.exists(test_file):
        # Make a missing type visible instead of silently dropping a class.
        print(f"Skipping type '{type_name}': {test_file} not found")
        continue
    # Decode explicitly as UTF-8 rather than relying on the platform default.
    with open(test_file, 'r', encoding='UTF-8') as f:
        items = json.load(f)
    data = [
        {
            "code": item['diff'],
            "label": type_label_mapping[type_name]
        }
        for item in items
    ]
    # Split into 90% train / 10% validation (test_size=1/10)
    train_split, valid_split = train_test_split(data, test_size=1/10, random_state=42)
    valid_items.extend(valid_split)
    train_items.extend(train_split)

# Shuffle the items to randomize their order
random.shuffle(valid_items)
random.shuffle(train_items)

# Save validation items to valid_angular.jsonl
with open(os.path.join(output_dir, 'valid_angular.jsonl'), 'w') as valid_file:
    for item in valid_items:
        valid_file.write(json.dumps(item) + '\n')

# Save training items to train_angular.jsonl
with open(os.path.join(output_dir, 'train_angular.jsonl'), 'w') as train_file:
    for item in train_items:
        train_file.write(json.dumps(item) + '\n')

print("Data splitting and saving to JSONL files completed.")


Data splitting and saving to JSONL files completed.


Calculate the vanilla RAG accuracy

In [1]:
import re
import json
def get_commit_type(msg, has_space=True):
    """Return the commit-type prefix of ``msg``, or ``None`` if it has none.

    The message must start with one of the known types, optionally followed
    by a parenthesised scope, then a colon and some text. With
    ``has_space=True`` whitespace is tolerated between the type and the
    scope/colon; with ``has_space=False`` it is not, and the text after the
    colon must contain a non-whitespace character followed by at least one
    more character.
    """
    types = '((build)|(ci)|(docs)|(feat)|(fix)|(perf)|(refactor)|(style)|(test)|(chore))'
    pattern = (
        f'^{types}\\s*(\\((\\s|\\S)+\\))?:\\s*(\\s|\\S)+'
        if has_space
        else f'^{types}(\\((\\s|\\S)+\\))?:\\s*\\S+(\\s|\\S)+'
    )
    found = re.match(pattern, msg)
    # Group 1 is the outer alternation, i.e. the matched type name.
    return found.group(1) if found else None

# Load the test records that carry the retrieved example message ('sim_msg').
# A dedicated name is used so the global `prompt` (the hub prompt template
# needed by the generation cell) is not overwritten by this list.
with open('../data/angular_filtered/subsets/generation/rag/test_rag_prompt.json', 'r', encoding='UTF-8') as f:
    rag_prompt_items = json.load(f)

# Count how often the retrieved message has the same commit type as the
# test item's ground-truth 'type'.
count = 0
for item in rag_prompt_items:
    sim_type = get_commit_type(item['sim_msg'])
    if sim_type is None:
        # The retrieved message has no recognisable type prefix.
        print('fail')
    elif sim_type == item['type']:
        count += 1

# Guard against an empty file instead of raising ZeroDivisionError.
print(count / len(rag_prompt_items) if rag_prompt_items else 0.0)

Write the classification results to JSON

In [3]:
import json

# Read the mapping.txt file
def read_mapping(file_path):
    """Parse a mapping file into a ``{label: type_name}`` dict.

    Each non-empty line must look like ``"<type_name>: <int label>"``; the
    integer label becomes the key and the type name the value. Blank lines
    are skipped instead of raising an unpacking error.

    Raises:
        ValueError: if a non-empty line is malformed or the label is not an int.
    """
    mapping = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            key, value = line.split(': ')
            mapping[int(value)] = key  # int label -> type name
    return mapping

# Read the JSON file
def read_json(file_path):
    """Return the parsed JSON content of the UTF-8 file at ``file_path``."""
    with open(file_path, 'r', encoding='UTF-8') as f:
        return json.load(f)

# Read the pred.txt file
def read_labels(file_path):
    """Return the integer labels stored one per line in ``file_path``."""
    labels = []
    with open(file_path, 'r', encoding='UTF-8') as f:
        for raw_line in f:
            labels.append(int(raw_line.strip()))
    return labels

# Write the JSON file
def write_json(file_path, data):
    """Serialise ``data`` as JSON (4-space indent) into the UTF-8 file ``file_path``."""
    with open(file_path, 'w', encoding='UTF-8') as out:
        out.write(json.dumps(data, indent=4))

# Main function
def main(mapping_file, json_file, pred_file, output_file):
    """Attach the classifier's predicted type to each record and save the result.

    Reads the label -> type mapping, the JSON records and the predicted
    labels, pairs records with predictions positionally, stores the mapped
    type name under "classifier_type", and writes the records to
    ``output_file``.
    """
    label_to_type = read_mapping(mapping_file)
    records = read_json(json_file)
    predictions = read_labels(pred_file)

    # Pairing is positional; zip stops at the shorter of the two sequences.
    for record, predicted_label in zip(records, predictions):
        record["classifier_type"] = label_to_type[predicted_label]

    write_json(output_file, records)

# Invoke the main function: combine the test split with the classifier's
# predictions and write the result to test_with_classification.json.
mapping_file = '../data/angular_filtered/subsets/classification/type_label_mapping.txt'
json_file = '../data/angular_filtered/subsets/test_data.json'
pred_file = '../data/angular_filtered/subsets/classification/angular_test_predictions.txt'
output_file = '../data/angular_filtered/subsets/classification/test_with_classification.json'
main(mapping_file, json_file, pred_file, output_file)

In [8]:
# Stamp every classified test record with its 0-based position in the file
# ('seq_num') and write the file back in place.
classified_path = '../data/angular_filtered/subsets/classification/test_with_classification.json'

with open(classified_path, 'r', encoding='UTF-8') as f:
    test_data = json.load(f)

for position, record in enumerate(test_data):
    record['seq_num'] = position

with open(classified_path, 'w', encoding='UTF-8') as f:
    json.dump(test_data, f, indent=4)

In [9]:
# Collect every '*db.json' file in the per-type database folder.
folder_path = '../data/angular_filtered/subsets/type_db'
db_file_pattern = os.path.join(folder_path, '*db.json')
files = glob.glob(db_file_pattern)

# The classified test split is read twice from the same file: as raw JSON
# records (`test_data`) and as documents holding only the '.diff' field.
classified_test_path = '../data/angular_filtered/subsets/classification/test_with_classification.json'

with open(classified_test_path, 'r', encoding='UTF-8') as f:
    test_data = json.load(f)

test_diff_loader = JSONLoader(file_path=classified_test_path, jq_schema='.[].diff', text_content=False)
test_diff_data = test_diff_loader.load()

In [None]:
# Group the test records by the type the classifier predicted for them. The
# lists hold references to the entries of `test_data`, so the fields added
# below ('sim_msg', 'sim_diff') appear on `test_data` as well.
grouped_data = defaultdict(list)
for entry in test_data:
    grouped_data[entry['classifier_type']].append(entry)

for file in files:
    diff_loader = JSONLoader(
        file_path=file,
        jq_schema='.[].diff',
        text_content=False)

    diff_data = diff_loader.load()

    msg_loader = JSONLoader(
        file_path=file,
        jq_schema='.[].msg',
        text_content=False)

    msg_data = msg_loader.load()

    # The file name is expected to look like "JavaScript_<type>_db.json".
    # os.path.basename strips the directory with whichever separator the
    # platform uses; splitting on '\\' only worked for Windows-style paths.
    # `commit_type` also avoids shadowing the builtin `type`.
    commit_type = os.path.basename(file).split('_')[1]

    # Reuse the persisted per-type vector store when present, else build it.
    persist_dir = f"../data/angular_filtered/subsets/type_db/rag_{commit_type}_db_e5"
    if os.path.exists(persist_dir):
        db = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
    else:
        db = Chroma.from_documents(diff_data, embeddings, persist_directory=persist_dir)

    similar_diff = []
    retriever = db.as_retriever()
    # 'seq_num' is the record's 0-based position in the classified test file,
    # which is also its position in `test_diff_data`.
    indexs = [item['seq_num'] for item in grouped_data[commit_type]]
    for index in tqdm(indexs, total=len(indexs), desc="Processing documents"):
        similar_diff.append(retriever.invoke(test_diff_data[index].page_content)[0])

    # The retrieved document's 'seq_num' metadata is used as a 1-based
    # position into this file's msg_data / diff_data, hence the -1.
    for sim_diff, test_diff in zip(similar_diff, grouped_data[commit_type]):
        test_diff['sim_msg'] = msg_data[sim_diff.metadata['seq_num']-1].page_content
        test_diff['sim_diff'] = diff_data[sim_diff.metadata['seq_num']-1].page_content

In [12]:
# Rebind `test_data` to the records in the 'subsets-v1' copy of the test split
# (a different directory from the 'subsets' paths used in the cells above).
with open('../data/angular_filtered/subsets-v1/test_data.json', 'r', encoding='UTF-8') as f:
    test_data = json.load(f)

In [13]:
# Display how many records the currently loaded `test_data` holds.
len(test_data)

5636