# Build angular format dataset

## Total_data

In [1]:
import pyarrow.parquet as pq
import glob
import os
from tqdm.auto import tqdm
import re

def check_angular_convention(msg, has_space=True):
    """Return True when ``msg`` starts with an Angular-style commit header.

    Two header layouts are recognised, selected by ``has_space``:

    * ``has_space=True``  -- ``type : subject`` or ``type (scope) : subject``
      (a whitespace character after the type and after the scope).
    * ``has_space=False`` -- ``type: subject`` or ``type(scope): subject``;
      this layout needs at least two characters after ``": "``, the first of
      which is not whitespace.

    The pattern is applied with ``re.match``, so it is anchored at the start
    of ``msg`` only and the type is matched case-sensitively.
    """
    types = r'((build)|(ci)|(docs)|(feat)|(fix)|(perf)|(refactor)|(style)|(test)|(chore))'
    spaced_tail = r'\s(\((\s|\S)+\)\s)?:\s(\s|\S)+'
    compact_tail = r'(\((\s|\S)+\))?:\s\S+(\s|\S)+'
    pattern = types + (spaced_tail if has_space else compact_tail)
    matched = re.match(pattern, msg)
    return matched is not None

# Commit types recognised by the Angular convention, in bucketing/reporting order.
angular_types = ['build', 'ci', 'docs', 'feat', 'fix', 'perf', 'refactor', 'style', 'test', 'chore']

# Selection limits, named once instead of being repeated as literals below.
MAX_PER_TYPE = 10000    # commits kept per (language, type) bucket
MAX_MSG_WORDS = 30      # longest accepted message, in whitespace-separated words
MAX_DIFF_CHARS = 5000   # longest accepted rendered diff, in characters


def _is_single_small_modify(mods):
    """Return True for a commit that MODIFYs exactly one file with a short diff.

    The length test comes first so that ``mods[0]`` is never evaluated for a
    commit that has no file modifications.
    """
    if len(mods) != 1:
        return False
    mod = mods[0]
    if mod['change_type'] != 'MODIFY':
        return False
    rendered = f"diff --git a/{mod['old_path']} b/{mod['new_path']} {mod['diff']}"
    return len(rendered) <= MAX_DIFF_CHARS


# One list of collected items per language and commit type.
languages = ['JavaScript']
dfs = {lang: {atype: [] for atype in angular_types} for lang in languages}

# One progress bar per language; its total is the bucket cap times the number of types.
bars = {lang: tqdm(total=MAX_PER_TYPE * len(angular_types), desc=lang) for lang in languages}

folder_path = '../rag/datasets--JetBrains-Research--commit-chronicle/snapshots/5fd076e67b812a9f3d1999e5e40f71715f84bb51/data'  # path of the dataset folder

# The collection reads the train and validation parquet shards.
files = glob.glob(os.path.join(folder_path, 'train*')) + glob.glob(os.path.join(folder_path, 'validation*'))

# Languages that still have at least one bucket below MAX_PER_TYPE.  A separate
# list is used so that `languages` itself stays intact for later cells.
pending_languages = list(languages)

for file in files:
    df = pq.read_table(file).to_pandas()

    # Iterate over a copy because finished languages are removed inside the loop.
    for lang in pending_languages[:]:
        # Keep commits of this language with a short message and exactly one
        # small MODIFY file change.
        # NOTE: an earlier variant additionally restricted both paths to '.js'
        # (excluding '.json'); that restriction is not applied here.
        lang_df = df[
            (df['language'] == lang)
            & (df['message'].apply(lambda text: len(text.split())) <= MAX_MSG_WORDS)
            & df['mods'].apply(_is_single_small_modify).astype(bool)
        ]
        buckets = dfs[lang]

        for _, row in lang_df.iterrows():
            msg = row['message']
            if not (check_angular_convention(msg, has_space=True)
                    or check_angular_convention(msg, has_space=False)):
                continue

            # The bucket is the commit type the message starts with.
            atype = next((t for t in angular_types if msg.startswith(t)), None)
            if atype is None or len(buckets[atype]) >= MAX_PER_TYPE:
                continue

            diff = row['mods'][0]
            old_path = 'a/' + diff['old_path']
            new_path = 'b/' + diff['new_path']
            diff_content = diff['diff']
            buckets[atype].append({
                'msg': msg,
                'diff': f"diff --git {old_path} {new_path} {diff_content}",
                'date': row['date'],
                'repo': row['repo']
            })
            bars[lang].update(1)

            # Bucket sizes only change on an append, so completion is checked here
            # rather than on every row.
            if all(len(buckets[t]) >= MAX_PER_TYPE for t in angular_types):
                print(f"Reached {MAX_PER_TYPE} rows for all types in {lang}")
                pending_languages.remove(lang)  # no further processing for this language
                break

    # Stop reading files once every language is complete.
    if not pending_languages:
        break

JavaScript:   0%|          | 0/100000 [00:00<?, ?it/s]

In [3]:
import json

# Flatten every per-type bucket into one list, tagging each item with its type.
# NOTE: the tag is written onto the dicts stored in `dfs`, so the buckets carry it too.
db_data = []
for lang, types in dfs.items():
    for atype, items in types.items():
        for item in items:
            item['type'] = atype
        db_data.extend(items)

output_dir = '../data/angular_filtered/subsets'
# Create the target folder here so this cell does not depend on a later one.
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'db_data.json'), 'w') as db_file:
    json.dump(db_data, db_file, indent=4)

In [4]:
import pandas as pd
import json
# Write one JSON file per (language, commit type) bucket under <output_dir>/type_db/.
output_dir = '../data/angular_filtered/subsets'
type_db_dir = os.path.join(output_dir, 'type_db')
# The files are written into the `type_db` sub-folder, so that is the folder that
# has to exist; creating it also creates `output_dir`.
os.makedirs(type_db_dir, exist_ok=True)

for lang, types in dfs.items():
    for atype, items in types.items():
        if items:  # empty buckets produce no file
            with open(os.path.join(type_db_dir, f'{lang}_{atype}_db.json'), 'w') as db_file:
                json.dump(items, db_file, indent=4)

print("Data splitting and saving completed.")

Data splitting and saving completed.


In [13]:
# Number of items gathered in `db_data`.
len(db_data)

53855

In [2]:
# Count the collected items per language and commit type.  The counts are derived
# from `dfs` itself, so every collected language is reported even if `languages`
# was shortened by the collection loop.
type_counts = {
    lang: {atype: len(items) for atype, items in types.items()}
    for lang, types in dfs.items()
}

# Print the counts
for lang, counts in type_counts.items():
    print(f"Language: {lang}")
    for atype, count in counts.items():
        print(f"  Type: {atype}, Count: {count}")

Language: JavaScript
  Type: build, Count: 1076
  Type: ci, Count: 1287
  Type: docs, Count: 9672
  Type: feat, Count: 10000
  Type: fix, Count: 10000
  Type: perf, Count: 472
  Type: refactor, Count: 6574
  Type: style, Count: 883
  Type: test, Count: 3891
  Type: chore, Count: 10000


In [None]:
# from sklearn.model_selection import train_test_split
# import pandas as pd
# import json
# # Split the data and save to files
# output_dir = '../data/angular_filtered/subsets'
# os.makedirs(output_dir, exist_ok=True)
# 
# test_data = []
# db_data = []
# 
# for lang, types in dfs.items():
#     for atype, items in types.items():
#         if items:  # Ensure there are items to split
#             # Split the data into 1:9 ratio for test and db
#             for item in items:
#                 item['type'] = atype
#             db_items, test_items = train_test_split(items, test_size=0.1, random_state=42)
#             test_data.extend(test_items)
#             db_data.extend(db_items)
#             # Save to files
#             # Save to JSON files
#             with open(os.path.join(output_dir, f'type_test/{lang}_{atype}_test.json'), 'w') as test_file:
#                 json.dump(test_items, test_file, indent=4)
#             
#             with open(os.path.join(output_dir, f'type_db/{lang}_{atype}_db.json'), 'w') as db_file:
#                 json.dump(db_items, db_file, indent=4)
# 
# print("Data splitting and saving completed.")
# output_dir = '../data/angular_filtered/subsets'
# with open(os.path.join(output_dir, f'db_data.json'), 'w') as db_file:
#     json.dump(db_data, db_file, indent=4)
#     
# with open(os.path.join(output_dir, f'test_data.json'), 'w') as db_file:
#     json.dump(test_data, db_file, indent=4)

## Getting the final test data (about 2000)

In [4]:
import json
from sklearn.model_selection import train_test_split

# Load the JSON file
with open('../data/angular_filtered/subsets/test_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Group the items by their 'type' field
type_dict = {}
for item in data:
    type_dict.setdefault(item['type'], []).append(item)

# Split every group with the same ratio and keep only the held-out 25%
test_data = []
for items in type_dict.values():
    _, test = train_test_split(items, test_size=0.25, random_state=42)
    test_data.extend(test)

# Save the held-out items as a JSON file
with open('../data/angular_filtered/subsets/dev_test.json', 'w', encoding='utf-8') as file:
    json.dump(test_data, file, ensure_ascii=False, indent=4)

## Test_data

In [5]:
import pyarrow.parquet as pq
import glob
import os
from tqdm.auto import tqdm
import re

def check_angular_convention(msg, has_space=True, strict=True):
    """Return True when ``msg`` starts with an Angular-style commit header.

    Two header layouts are recognised, selected by ``has_space``:

    * ``has_space=True``  -- ``type : subject`` or ``type (scope) : subject``
      (a whitespace character after the type and after the scope).
    * ``has_space=False`` -- ``type: subject`` or ``type(scope): subject``;
      this layout needs at least two characters after ``": "``, the first of
      which is not whitespace.

    ``strict`` is accepted but has no effect on the result.  The pattern is
    applied with ``re.match``, so it is anchored at the start of ``msg`` only.
    """
    types = r'((build)|(ci)|(docs)|(feat)|(fix)|(perf)|(refactor)|(style)|(test)|(chore))'
    spaced_tail = r'\s(\((\s|\S)+\)\s)?:\s(\s|\S)+'
    compact_tail = r'(\((\s|\S)+\))?:\s\S+(\s|\S)+'
    pattern = types + (spaced_tail if has_space else compact_tail)
    matched = re.match(pattern, msg)
    return matched is not None

# Commit types recognised by the Angular convention, in bucketing/reporting order.
angular_types = ['build', 'ci', 'docs', 'feat', 'fix', 'perf', 'refactor', 'style', 'test', 'chore']

# Per-type item counts used as sampling weights for the test set.
counts = {
    'build': 1076,
    'ci': 1287,
    'docs': 9672,
    'feat': 10000,
    'fix': 10000,
    'perf': 472,
    'refactor': 6574,
    'style': 883,
    'test': 3891,
    'chore': 10000
}

# Calculate the total count
total_count = sum(counts.values())

# Define the total number of samples you want to collect
total_samples = 2000

# Per-type quota, proportional to `counts`.  int() truncates every share, so the
# quotas add up to slightly less than `total_samples`.
target_counts = {atype: int((count / total_count) * total_samples) for atype, count in counts.items()}
target_total = sum(target_counts.values())

# Row filters, named once instead of being repeated as literals below.
MAX_MSG_WORDS = 30      # longest accepted message, in whitespace-separated words
MAX_DIFF_CHARS = 5000   # longest accepted rendered diff, in characters


def _is_single_small_modify(mods):
    """Return True for a commit that MODIFYs exactly one file with a short diff.

    The length test comes first so that ``mods[0]`` is never evaluated for a
    commit that has no file modifications.
    """
    if len(mods) != 1:
        return False
    mod = mods[0]
    if mod['change_type'] != 'MODIFY':
        return False
    rendered = f"diff --git a/{mod['old_path']} b/{mod['new_path']} {mod['diff']}"
    return len(rendered) <= MAX_DIFF_CHARS


# One list of collected items, and one running counter, per language and type.
languages = ['JavaScript']
dfs = {lang: {atype: [] for atype in angular_types} for lang in languages}
counters = {lang: {atype: 0 for atype in angular_types} for lang in languages}

# The bar total is the sum of the quotas (not `total_samples`), so the bar can
# actually reach 100% when every quota is filled.
bars = {lang: tqdm(total=target_total, desc=lang) for lang in languages}

folder_path = '../rag/datasets--JetBrains-Research--commit-chronicle/snapshots/5fd076e67b812a9f3d1999e5e40f71715f84bb51/data'  # path of the dataset folder

# The test set is sampled from the test parquet shards only.
files = glob.glob(os.path.join(folder_path, 'test*'))

# Languages that still have an unfilled quota.  A separate list is used so that
# `languages` itself stays intact for later cells.
pending_languages = list(languages)

for file in files:
    df = pq.read_table(file).to_pandas()

    # Iterate over a copy because finished languages are removed inside the loop.
    for lang in pending_languages[:]:
        # Keep commits of this language with a short message and exactly one
        # small MODIFY file change.
        lang_df = df[
            (df['language'] == lang)
            & (df['message'].apply(lambda text: len(text.split())) <= MAX_MSG_WORDS)
            & df['mods'].apply(_is_single_small_modify).astype(bool)
        ]

        for _, row in lang_df.iterrows():
            msg = row['message']
            if not (check_angular_convention(msg, has_space=True)
                    or check_angular_convention(msg, has_space=False)):
                continue

            # The bucket is the commit type the message starts with.
            atype = next((t for t in angular_types if msg.startswith(t)), None)
            if atype is None or counters[lang][atype] >= target_counts[atype]:
                continue

            diff = row['mods'][0]
            old_path = 'a/' + diff['old_path']
            new_path = 'b/' + diff['new_path']
            diff_content = diff['diff']
            dfs[lang][atype].append({
                'msg': msg,
                'diff': f"diff --git {old_path} {new_path} {diff_content}",
                'date': row['date'],
                'repo': row['repo']
            })
            counters[lang][atype] += 1
            bars[lang].update(1)

            # Counters only change on an append, so completion is checked here
            # rather than on every row.
            if all(counters[lang][t] >= target_counts[t] for t in angular_types):
                print(f"Reached target counts for all types in {lang}")
                pending_languages.remove(lang)  # no further processing for this language
                break

    # Stop reading files once every language is complete.
    if not pending_languages:
        break

JavaScript:   0%|          | 0/2000 [00:00<?, ?it/s]

Reached target counts for all types in JavaScript


In [11]:
# Per-type sample quota derived from `counts` and `total_samples`.
target_counts

{'build': 39,
 'ci': 47,
 'docs': 359,
 'feat': 371,
 'fix': 371,
 'perf': 17,
 'refactor': 244,
 'style': 32,
 'test': 144,
 'chore': 371}

In [12]:
# Report how many items each type bucket holds.
for lang, buckets in dfs.items():
    for atype, items in buckets.items():
        print(f"Type: {atype}, Count: {len(items)}")

Type: build, Count: 39
Type: ci, Count: 47
Type: docs, Count: 359
Type: feat, Count: 371
Type: fix, Count: 371
Type: perf, Count: 17
Type: refactor, Count: 244
Type: style, Count: 32
Type: test, Count: 144
Type: chore, Count: 371


In [14]:
import json

# Flatten every per-type bucket into one list, tagging each item with its type.
# NOTE: the tag is written onto the dicts stored in `dfs`, so the buckets carry it too.
test_data = []
for lang, types in dfs.items():
    for atype, items in types.items():
        for item in items:
            item['type'] = atype
        test_data.extend(items)

output_dir = '../data/angular_filtered/subsets'
# Create the target folder here so this cell can run on its own.
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'test_data.json'), 'w') as test_file:
    json.dump(test_data, test_file, indent=4)

# Model generation

## RACE

In [3]:
import json
from tqdm.auto import tqdm
# Load the test items from the JSON file into `prompt_data`.
with open('../data/angular_filtered/subsets/test_data.json') as f:
    prompt_data = json.load(f)
    
import os
# NOTE(review): HF_HOME is assigned before `transformers` is imported, presumably so
# that the library keeps its model files under ../models/ -- confirm.
os.environ['HF_HOME'] = '../models/'

from transformers import pipeline
# text2text-generation pipeline for the RACE commit-message model.
# NOTE(review): device=0 presumably selects the first GPU -- confirm.
pipe = pipeline("text2text-generation", model="JetBrains-Research/cmg-race-without-history", device=0)

In [4]:

# Collect the diff text of every test item, in file order.
diffs = [commit['diff'] for commit in prompt_data]

# The pipeline is fed the raw diff; no instruction prompt is wrapped around it,
# so none is built here.
# NOTE(review): the run log reports inputs longer than the model's 512-token
# maximum; decide whether such diffs should be truncated before generation.
generated_commit_messages = []
for diff in tqdm(diffs, total=len(diffs), desc='Generating commit messages'):
    generated_commit_messages.append(pipe(diff)[0]['generated_text'])

Generating commit messages:   0%|          | 0/1995 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1485 > 512). Running this sequence through the model will result in indexing errors
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\tyfann\anaconda3\envs\llm4commit\lib\logging\__init__.py", line 1100, in emit
    msg = self.format(record)
  File "C:\Users\tyfann\anaconda3\envs\llm4commit\lib\logging\__init__.py", line 943, in format
    return fmt.format(record)
  File "C:\Users\tyfann\anaconda3\envs\llm4commit\lib\logging\__init__.py", line 678, in format
    record.message = record.getMessage()
  File "C:\Users\tyfann\anaconda3\envs\llm4commit\lib\logging\__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "C:\Users\tyfann\anaconda3\envs\llm4commit\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\tyfann\a

In [5]:

# Attach each generated message to its source item under the 'race' key.
for item, msg in zip(prompt_data, generated_commit_messages):
    item['race'] = msg

output_file = '../data/angular_filtered/subsets/generation/test_race_v1.json'
# Make sure the generation folder exists before writing into it.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='UTF-8') as f:
    json.dump(prompt_data, f, ensure_ascii=False, indent=4)

# Save the messages to a text file, one per line; line breaks inside a message
# are escaped so that every message stays on a single line.
with open("../data/angular_filtered/subsets/generation/test_race_v1.txt", 'w', encoding='UTF-8') as file:
    for item in prompt_data:
        file.write(item['race'].replace('\n', '\\n').replace('\r', '\\r') + '\n')

## ChatGPT

In [1]:
from langchain import hub
import json
import os
from tqdm import tqdm
# Zero-shot prompt template pulled from the LangChain hub.
prompt = hub.pull("tyfann/llm4commit-zeroshot")

from openai import OpenAI

# The key is read from the environment so it never has to be stored in the
# notebook; when OPENAI_API_KEY is unset this falls back to the empty string.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
    base_url="https://api.chatanywhere.cn/v1"
)

def gpt_35_api(messages: list):
    """Send ``messages`` to gpt-3.5-turbo-0125 and return the first reply's text.

    The request goes through the module-level ``client`` with ``temperature=0``.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=messages,
        temperature=0,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

with open('../data/angular_filtered/subsets/test_data.json', 'r', encoding='UTF-8') as f:
    org_data = json.load(f)

# Ask the model for one message per test item.  A failed request is recorded as
# an empty string so that `gpt_msg` stays aligned with `org_data`.
gpt_msg = []
for index, data in tqdm(enumerate(org_data), total=len(org_data), desc="Processing documents"):
    messages = prompt.invoke(
        {"DIFF": data['diff']}
    ).to_messages()
    example_prompt = [{'role': 'user', 'content': messages[0].content}]
    try:
        # `or ""` turns a missing reply text into an empty string, which the
        # `.replace` calls below can handle.
        gpt_msg.append(gpt_35_api(example_prompt) or "")
    except Exception as exc:
        # `Exception` rather than a bare except, so an interrupt still stops the loop.
        print(index, repr(exc))
        gpt_msg.append("")

for item, msg in zip(org_data, gpt_msg):
    item['chatgpt_zeroshot'] = msg

import os
output_file = '../data/angular_filtered/subsets/generation/test_gpt35_zeroshot.json'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='UTF-8') as f:
    json.dump(org_data, f, ensure_ascii=False, indent=4)

# One message per line; line breaks inside a message are escaped so that every
# message stays on a single line.
output_file = '../data/angular_filtered/subsets/generation/test_gpt35_zeroshot.txt'
with open(output_file, 'w', encoding='UTF-8') as f:
    for item in org_data:
        f.write(item['chatgpt_zeroshot'].replace('\n', '\\n').replace('\r', '\\r') + '\n')

Processing documents: 100%|██████████| 1995/1995 [24:50<00:00,  1.34it/s]


In [2]:
# Reference messages, one per line, with line breaks inside a message escaped.
output_file = '../data/angular_filtered/subsets/generation/test_ref.txt'
with open(output_file, 'w', encoding='UTF-8') as f:
    f.writelines(
        item['msg'].replace('\n', '\\n').replace('\r', '\\r') + '\n'
        for item in org_data
    )

## NNGen

In [1]:
import re

def convert_diff(diff_output):
    """Turn a git diff into the space-separated token line used for NNGen.

    Steps, in order:

    1. A ``diff --git a/<old> b/<rest>`` line becomes ``mmm a/<old>`` followed
       by ``ppp b/<old>`` on the next line; whatever followed `` b/`` on that
       line is not carried over.
    2. Every newline is replaced by the ``<nl>`` marker.
    3. The text is split around each non-word character, the pieces are joined
       with spaces, and runs of whitespace are collapsed to one space.
    4. ``< nl >`` (the marker after tokenisation) is restored to ``<nl>``.
    """
    with_header = re.sub(r'diff --git a/(.*) b/(.*)', r'mmm a/\1\nppp b/\1', diff_output)
    single_line = with_header.replace('\n', '<nl>')
    tokens = re.split(r'(\W)', single_line)
    spaced = re.sub(r'\s+', ' ', ' '.join(tokens))
    return spaced.replace('< nl >', '<nl>')

In [2]:
import json
import os

# Folder that receives the NNGen input files; created here so the writes below
# do not depend on it already existing.
nngen_dir = '../data/angular_filtered/subsets/generation/nngen'
os.makedirs(nngen_dir, exist_ok=True)

with open('../data/angular_filtered/subsets/db_data.json', encoding='UTF-8') as f:
    db_data = json.load(f)

for item in db_data:
    item['diff'] = convert_diff(item['diff'])

# Save the converted diffs, one per line.  The encoding is given explicitly so
# the files do not depend on the platform default.
with open(f'{nngen_dir}/train.diff', 'w', encoding='UTF-8') as f:
    for item in db_data:
        f.write(item['diff'] + '\n')

# Save the matching messages, one per line, with line breaks escaped.
with open(f'{nngen_dir}/train.msg', 'w', encoding='UTF-8') as f:
    for item in db_data:
        escaped_string = item['msg'].replace("\n", "\\n").replace("\r", "\\r")
        f.write(escaped_string + "\n")

with open('../data/angular_filtered/subsets/test_data.json', encoding='UTF-8') as f:
    test_data = json.load(f)

for item in test_data:
    item['diff'] = convert_diff(item['diff'])

# Save the converted test diffs, one per line.
with open(f'{nngen_dir}/test.diff', 'w', encoding='UTF-8') as f:
    for item in test_data:
        f.write(item['diff'] + '\n')

In [3]:
import json
with open('../data/angular_filtered/subsets/test_data.json', 'r', encoding='UTF-8') as f:
    org_data = json.load(f)

# NNGen predictions, one per line; line i is attached to test item i.
output_file = '../data/angular_filtered/subsets/generation/test_nngen.txt'
with open(output_file, 'r', encoding='UTF-8') as f:
    lines = f.readlines()

# Fail with a clear message, before touching any item, when predictions are missing.
if len(lines) < len(org_data):
    raise ValueError(
        f"{output_file} has {len(lines)} lines but there are {len(org_data)} test items"
    )

# Extra trailing lines, if any, are ignored.
for item, line in zip(org_data, lines):
    item['nngen'] = line.strip()

with open('../data/angular_filtered/subsets/generation/test_nngen.json', 'w', encoding='UTF-8') as f:
    json.dump(org_data, f, indent=4)