In [5]:
from stanfordcorenlp import StanfordCoreNLP
from nltk.tokenize import sent_tokenize
import json
from utils import is_vdo_pattern, is_merge_rollback, tokenize_summary, remove_brackets, replace_issue_id

In [6]:
# CoreNLP client pointed at http://localhost, port 9000.
# NOTE(review): presumably a CoreNLP server must already be listening there — confirm.
nlp = StanfordCoreNLP('http://localhost', port=9000)

In [7]:
def commit_processer(msg, nlp, max_len=30):
    """Reduce a commit message to its first sentence and decide whether to keep it.

    The message is stripped, its newlines are turned into sentence breaks and
    only the first sentence is considered. That sentence is rejected when
    ``is_merge_rollback`` flags it, when it is empty after ``replace_issue_id``,
    when it has more than ``max_len`` whitespace-separated tokens after
    ``tokenize_summary``, or when ``is_vdo_pattern`` returns a falsy value.

    Args:
        msg: Raw commit message. ``None`` and blank strings are rejected.
        nlp: Object forwarded unchanged to ``is_vdo_pattern``.
        max_len: Maximum number of tokens allowed in the summary (default 30).

    Returns:
        ``(summary, 1)`` when the message is kept, ``('', 0)`` otherwise.
    """
    # Guard before any string method is called: previously the None check ran
    # only after msg.strip(), so a None message raised AttributeError.
    if msg is None or not msg.strip():
        return '', 0

    sentences = sent_tokenize(msg.strip().replace('\n', '. '))
    if not sentences:
        return '', 0

    first_sent = sentences[0]
    if is_merge_rollback(first_sent):
        return '', 0

    first_sent = replace_issue_id(first_sent)
    if first_sent is None or first_sent == '':
        return '', 0

    first_sent = tokenize_summary(first_sent)
    if len(first_sent.split()) > max_len or not is_vdo_pattern(first_sent, nlp):
        return '', 0
    return first_sent, 1

In [8]:
def to_lemma(msg, nlp):
    """Replace the first whitespace-separated word of ``msg`` with its lemma.

    The whole message is sent to ``nlp.annotate`` with the ``lemma`` annotator
    and the lemma of the first token of the first sentence replaces the first
    word. Because the message is rebuilt with ``split``/``join``, every run of
    whitespace (including newlines) collapses to a single space.

    Args:
        msg: Commit message text.
        nlp: Client exposing ``annotate(text, properties=...)`` that returns a
            JSON string.

    Returns:
        The rebuilt message. ``''`` for an empty/blank message; the message
        with its first word untouched when the annotation has no lemma.
    """
    msg_list = msg.split() if msg else []
    if not msg_list:
        # Nothing to lemmatise; indexing msg_list[0] used to raise IndexError.
        return ''

    props = {'annotators': 'lemma', 'outputFormat': 'json', 'timeout': 1000}
    annot_doc = nlp.annotate(msg, properties=props)
    parsed_dict = json.loads(annot_doc)

    # The annotation may contain no sentence or no token at all.
    sentences = parsed_dict.get('sentences') or []
    tokens = sentences[0].get('tokens', []) if sentences else []
    lemma_list = [tok['lemma'] for tok in tokens if 'lemma' in tok]

    if lemma_list:
        msg_list[0] = lemma_list[0]
    return ' '.join(msg_list)

In [16]:
import pyarrow.parquet as pq
import glob
import os
from tqdm.auto import tqdm
# Languages to collect and the leading verbs ("types") that define the buckets.
languages = ['JavaScript']
verb_groups = ['add', 'fix', 'remove', 'update', 'use', 'move', 'prepare', 'improve', 'ignore', 'handle', 'rename', 'allow', 'set', 'revert', 'replace']

MAX_PER_TYPE = 4000    # cap on collected commits per (language, verb) bucket
MAX_DIFF_CHARS = 5000  # cap on the length of the rendered single-file diff text

dfs = {lang: {vtype: [] for vtype in verb_groups} for lang in languages}

# One progress bar per language; the total is every bucket filled to its cap
# (15 verbs * 4000 = 60000 with the values above).
bars = {lang: tqdm(total=len(verb_groups) * MAX_PER_TYPE, desc=lang) for lang in languages}

folder_path = '../rag/datasets--JetBrains-Research--commit-chronicle/snapshots/5fd076e67b812a9f3d1999e5e40f71715f84bb51/data'  # path of the dataset folder


files = glob.glob(os.path.join(folder_path, 'train*')) + glob.glob(os.path.join(folder_path, 'validation*')) + glob.glob(os.path.join(folder_path, 'test*'))


def _is_small_single_modify(mods):
    """Return True for a commit with exactly one MODIFY change and a short diff.

    The checks short-circuit, so ``mods[0]`` is never read for a commit whose
    ``mods`` is missing or does not hold exactly one entry.
    """
    if mods is None or len(mods) != 1:
        return False
    mod = mods[0]
    if mod['change_type'] != 'MODIFY':
        return False
    rendered = f"diff --git a/{mod['old_path']} b/{mod['new_path']} {mod['diff']}"
    return len(rendered) <= MAX_DIFF_CHARS


for file in files:
    df = pq.read_table(file).to_pandas()

    # The modification filter does not depend on the language: compute it once per file.
    mods_ok = df['mods'].apply(_is_small_single_modify).astype(bool)

    # Iterate over a copy because finished languages are removed from the list.
    for lang in languages[:]:
        lang_df = df[(df['language'] == lang) & mods_ok]

        for index, row in lang_df.iterrows():
            raw_msg = row['message']
            # Blank or non-string messages have no leading verb; skip them
            # instead of failing on split()[0].
            if isinstance(raw_msg, str) and raw_msg.split():
                msg = to_lemma(raw_msg, nlp)
                words = msg.split()
                # The bucket is selected by the lemmatised first word.
                bucket = dfs[lang].get(words[0]) if words else None
                # Check the quota first so a full bucket skips the commit_processer call.
                if bucket is not None and len(bucket) < MAX_PER_TYPE and commit_processer(msg, nlp)[1] == 1:
                    diff = row['mods'][0]
                    old_path = 'a/' + diff['old_path']
                    new_path = 'b/' + diff['new_path']
                    diff_content = diff['diff']
                    bucket.append({
                        'msg': msg,
                        'diff': f"diff --git {old_path} {new_path} {diff_content}",
                        'date': row['date'],
                        'repo': row['repo']
                    })
                    bars[lang].update(1)

            if all(len(dfs[lang][vtype]) >= MAX_PER_TYPE for vtype in verb_groups):
                print(f"Reached {MAX_PER_TYPE} rows for all types in {lang}")
                languages.remove(lang)  # Remove language from list to avoid further processing
                break

    # Stop reading files once every language has filled all of its buckets.
    if not languages:
        break

JavaScript:   0%|          | 0/60000 [00:00<?, ?it/s]

In [19]:
# Report how many commits were collected per verb, grouped by language.
for lang, per_verb in dfs.items():
    print(f"Language: {lang}")
    for vtype, collected in per_verb.items():
        print(f"  {vtype}: {len(collected)}")


Language: JavaScript
  add: 4000
  fix: 4000
  remove: 4000
  update: 4000
  use: 2025
  move: 591
  prepare: 299
  improve: 3899
  ignore: 727
  handle: 1526
  rename: 1597
  allow: 2117
  set: 2330
  revert: 824
  replace: 1786


In [17]:
# Flatten every (language, verb) bucket into one list, keeping bucket order.
data = []
for lang, types in dfs.items():
    for items in types.values():
        data.extend(items)

In [18]:
import json

# Persist the flattened records as pretty-printed JSON.
output_path = '../data/vdo_filtered/lemma_data_js_new.json'
with open(output_path, mode='w', encoding='UTF-8') as f:
    json.dump(data, f, indent=4)

In [72]:
# Inspect the first collected 'add' record. `lang` is not set in this cell:
# it is whatever value an earlier loop over the languages left behind.
dfs[lang]['add'][0]

{'msg': 'Adds an config sample',
 'diff': 'diff --git a/test/stubs/.eleventyignore b/test/stubs/.eleventyignore ignoredFolder\n-ignoredFolder/ignored.md\n\\ No newline at end of file\n+./ignoredFolder/ignored.md\n\\ No newline at end of file\n',
 'date': '10.12.2017 12:41:51',
 'repo': '11ty/eleventy'}

In [47]:
import json
from collections import Counter

# Load the JSON test split.
with open('../data/vdo_filtered/test_data_js.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Lower-cased leading word of every record that carries a 'msg' key.
first_words = []
for item in data:
    if 'msg' in item:
        first_words.append(item['msg'].split()[0].lower())

word_counts = Counter(first_words)

# Show the ten most frequent leading words with their counts.
top_10_words = word_counts.most_common(10)
for word, count in top_10_words:
    print(f'{word} {count}')

add 3227
remove 952
added 592
update 394
fix 215
adding 182
updated 172
removed 170
improve 156
replace 117


In [59]:
from tqdm.auto import tqdm
# Lemmatise the leading word of every message in the test split.
# NOTE: the result only lives in `data`; this cell writes nothing back to disk.
with open('../data/vdo_filtered/test_data_js.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
for item in tqdm(data, total=len(data)):
    # Reuse the shared helper instead of an inline copy of its body.
    item['msg'] = to_lemma(item['msg'], nlp)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [1]:
import json
import random

def extract_random_items(input_file, output_file, num_items_per_type=10, seed=None):
    """Write a random sample of at most ``num_items_per_type`` items per type.

    Items are grouped by the first whitespace-separated word of their ``msg``
    field; items whose message is empty, blank or ``None`` share the ``''``
    group. Groups larger than ``num_items_per_type`` are sampled without
    replacement, smaller groups are copied in full.

    Args:
        input_file: Path of a JSON file holding a list of dicts with a ``msg`` key.
        output_file: Path of the JSON file to write the selection to.
        num_items_per_type: Maximum number of items kept per group.
        seed: Optional seed for a private random generator, making the sample
            reproducible. ``None`` keeps using the module-level ``random`` state.
    """
    # Read the input JSON file.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Group the items by leading word. split() on a whitespace-only message
    # yields no word, which used to raise IndexError.
    item_dict = {}
    for item in data:
        words = (item['msg'] or '').split()
        first_word = words[0] if words else ''
        item_dict.setdefault(first_word, []).append(item)

    # Randomly pick the items of each group.
    rng = random if seed is None else random.Random(seed)
    selected_items = []
    for group in item_dict.values():
        if len(group) > num_items_per_type:
            selected_items.extend(rng.sample(group, num_items_per_type))
        else:
            selected_items.extend(group)

    # Write the selection to the output JSON file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(selected_items, f, ensure_ascii=False, indent=4)

# Call the function with the default of 10 items per type.
extract_random_items('../data/vdo_filtered/lemma_data_js.json', '../data/vdo_filtered/lemma_dev_test_150.json')

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from collections import defaultdict
# split data into train, dev, test according to the type of commit message(8:1:1)

import json
# Load the lemmatised records.
with open('../data/vdo_filtered/lemma_data_js_new.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Bucket the records by the leading word of their message ('' when it is empty).
data_dict = defaultdict(list)
for item in data:
    msg = item['msg']
    if msg:
        first_word = msg.split()[0]
    else:
        first_word = ''
    data_dict[first_word].append(item)

In [4]:
# Sanity check: print the size of the first bucket only.
first_bucket = next(iter(data_dict.values()), None)
if first_bucket is not None:
    print(len(first_bucket))

4000


In [6]:
# Split the data and save to files
import os
output_dir = '../data/vdo_filtered/classification'
os.makedirs(output_dir, exist_ok=True)

train_all, dev_all, test_all = [], [], []
for key, items in data_dict.items():
    # 80% of each bucket goes to train; the remaining 20% is halved into dev and test.
    train_data, dev_test_data = train_test_split(items, test_size=0.2, random_state=42)
    dev_data, test_data = train_test_split(dev_test_data, test_size=0.5, random_state=42)

    train_all += train_data
    dev_all += dev_data
    test_all += test_data

# Save each split to its own JSON file.
for filename, split in (
    ('train_data.json', train_all),
    ('dev_data.json', dev_all),
    ('test_data.json', test_all),
):
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
        json.dump(split, file, ensure_ascii=False, indent=4)

In [7]:
# Sizes of the train, dev and test splits, in that order.
len(train_all), len(dev_all), len(test_all)

(26972, 3373, 3376)

In [12]:
# The database file holds the train split followed by the dev split.
db_data = [*train_all, *dev_all]
db_path = os.path.join('../data/vdo_filtered', 'db_data.json')
with open(db_path, 'w', encoding='utf-8') as file:
    json.dump(db_data, file, ensure_ascii=False, indent=4)

In [13]:
# load the test split from the classification folder
import json
from collections import Counter

# Read the JSON file holding the test split.
with open('../data/vdo_filtered/classification/test_data.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)


In [14]:
# Group the test split by the leading word of each message ('' when it is empty).
data_dict = defaultdict(list)
for item in test_data:
    msg = item['msg']
    data_dict[msg.split()[0] if msg else ''].append(item)

# Keep the held-out part (test_size=1/6) of every group as the dev-test sample.
dev_test = []
for items in data_dict.values():
    held_out = train_test_split(items, test_size=1/6, random_state=42)[1]
    dev_test.extend(held_out)

with open('../data/vdo_filtered/dev_test_data.json', 'w', encoding='utf-8') as file:
    json.dump(dev_test, file, ensure_ascii=False, indent=4)

In [16]:

# Directory holding the classification splits.
output_dir = '../data/vdo_filtered/classification'

# Commit types; the position of a type in this list is its integer label.
types = [
    'add', 'fix', 'remove', 'update', 'use',
    'move', 'prepare', 'improve', 'ignore', 'handle',
    'rename', 'allow', 'set', 'revert', 'replace'
]
type_label_mapping = dict(zip(types, range(len(types))))

# Write the mapping as one "name: label" line per type.
mapping_lines = [f'{type_name}: {label}\n' for type_name, label in type_label_mapping.items()]
with open(os.path.join(output_dir, 'type_label_mapping.txt'), 'w') as mapping_file:
    mapping_file.writelines(mapping_lines)

In [23]:
# assign type label to train, dev, test data
import json
import os

# Only dev_test_data.json is processed here; the file is rewritten in place.
with open('../data/vdo_filtered/dev_test_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

for item in data:
    # The type is the leading word of the message. Blank messages get '' (the
    # same fallback the grouping cells use) instead of failing on split()[0].
    words = (item['msg'] or '').split()
    item['type'] = words[0] if words else ''

with open('../data/vdo_filtered/dev_test_data.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

In [1]:
import os
import json

# Read the JSON database file.
input_file = '../data/vdo_filtered/db_data.json'
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Create the folder that stores one file per type.
output_folder = '../data/vdo_filtered/type_db'
os.makedirs(output_folder, exist_ok=True)

# Group the records by their type.
type_data = {}
for item in data:
    item_type = item.get('type')
    if item_type is None:
        # Records without a 'type' key used to raise KeyError. Derive the type
        # from the leading word of the message and store it on the record.
        words = (item.get('msg') or '').split()
        item_type = words[0] if words else ''
        item['type'] = item_type
    type_data.setdefault(item_type, []).append(item)

# Write each group to its own JSON file.
for item_type, items in type_data.items():
    output_file = os.path.join(output_folder, f'{item_type}_db.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(items, f, ensure_ascii=False, indent=4)

print("分类完成并已保存到type_db文件夹下。")

分类完成并已保存到type_db文件夹下。
