In [1]:
import re
def check_angular_convention(msg, has_space=True, strict=True):
    """Report whether a commit message starts with an Angular-style header.

    Matching is anchored at the start of ``msg`` (``re.match``).

    Args:
        msg: Commit message to test.
        has_space: Chooses the strict header layout. True expects whitespace
            after the type and after an optional "(scope)", as in
            "feat (core) : text"; False expects the compact layout, as in
            "feat(core): text".
        strict: When False, ``has_space`` is ignored and the message only
            has to begin with one of the known type keywords.

    Returns:
        True if the message matches the selected pattern, otherwise False.
    """
    types = '((build)|(ci)|(docs)|(feat)|(fix)|(perf)|(refactor)|(style)|(test)|(chore))'
    if not strict:
        # Loose mode: a bare type prefix is enough.
        pattern = '^((build)|(ci)|(docs)|(feat)|(fix)|(perf)|(refactor)|(style)|(test)|(chore))'
    elif has_space:
        pattern = f'{types}\\s(\\((\\s|\\S)+\\)\\s)?:\\s(\\s|\\S)+'
    else:
        pattern = f'{types}(\\((\\s|\\S)+\\))?:\\s\\S+(\\s|\\S)+'
    matched = re.match(pattern, msg)
    return matched is not None



In [9]:
import pyarrow.parquet as pq
import glob
import os
from tqdm.auto import tqdm
import re

def check_angular_convention(msg, has_space=True, strict=True):
    """Report whether a commit message starts with an Angular-style header.

    Matching is anchored at the start of ``msg`` (``re.match``).

    Args:
        msg: Commit message to test.
        has_space: Chooses the strict header layout. True expects whitespace
            after the type and after an optional "(scope)", as in
            "feat (core) : text"; False expects the compact layout, as in
            "feat(core): text".
        strict: When False, ``has_space`` is ignored and the message only
            has to begin with one of the known type keywords.

    Returns:
        True if the message matches the selected pattern, otherwise False.
    """
    types = '((build)|(ci)|(docs)|(feat)|(fix)|(perf)|(refactor)|(style)|(test)|(chore))'
    if not strict:
        # Loose mode: a bare type prefix is enough.
        pattern = '^((build)|(ci)|(docs)|(feat)|(fix)|(perf)|(refactor)|(style)|(test)|(chore))'
    elif has_space:
        pattern = f'{types}\\s(\\((\\s|\\S)+\\)\\s)?:\\s(\\s|\\S)+'
    else:
        pattern = f'{types}(\\((\\s|\\S)+\\))?:\\s\\S+(\\s|\\S)+'
    matched = re.match(pattern, msg)
    return matched is not None

# Commit types defined by the Angular commit-message convention.
angular_types = ['build', 'ci', 'docs', 'feat', 'fix', 'perf', 'refactor', 'style', 'test', 'chore']

# Sampling limits: cap on collected commits per (language, type) and on the
# length of the rendered "diff --git ..." text of a commit.
MAX_ITEMS_PER_TYPE = 10000
MAX_DIFF_CHARS = 5000

# Collected items, keyed by language and then by commit type.
languages = ['JavaScript']
dfs = {lang: {atype: [] for atype in angular_types} for lang in languages}

# One progress bar per language; the total is the per-type cap summed over all types.
bars = {lang: tqdm(total=len(angular_types) * MAX_ITEMS_PER_TYPE, desc=lang) for lang in languages}

folder_path = '../rag/datasets--JetBrains-Research--commit-chronicle/snapshots/5fd076e67b812a9f3d1999e5e40f71715f84bb51/data'  # path to the dataset folder

# glob returns matches in arbitrary order. Because collection stops at a cap,
# the file order decides which commits are kept, so sort each split to make
# the sample reproducible (train first, then validation, then test).
files = (sorted(glob.glob(os.path.join(folder_path, 'train*')))
         + sorted(glob.glob(os.path.join(folder_path, 'validation*')))
         + sorted(glob.glob(os.path.join(folder_path, 'test*'))))


def _is_small_single_modify(mods):
    """Return True for a commit that MODIFYs exactly one file with a short diff.

    The checks short-circuit, so ``mods[0]`` is only read after the length
    check has passed.
    """
    if len(mods) != 1:
        return False
    mod = mods[0]
    if mod['change_type'] != 'MODIFY':
        return False
    rendered = f"diff --git a/{mod['old_path']} b/{mod['new_path']} {mod['diff']}"
    return len(rendered) <= MAX_DIFF_CHARS


# Languages that still need items. This is a working copy: `languages` itself
# is left untouched so cells that iterate it later still see every language.
pending_languages = list(languages)

for file in files:
    df = pq.read_table(file).to_pandas()

    # The modification filter does not depend on the language; evaluate it once per file.
    mods_ok = df['mods'].apply(_is_small_single_modify).astype(bool)

    # Iterate over a copy because completed languages are removed inside the loop
    for lang in pending_languages[:]:
        lang_suffix = '.js'  # not used by the active filter

        # Keep rows of the current language that pass the modification filter
        lang_df = df[(df['language'] == lang) & mods_ok]

        for _, row in lang_df.iterrows():
            msg = row['message']
            if check_angular_convention(msg, has_space=True) or check_angular_convention(msg, has_space=False):
                # File the commit under the type keyword its message starts with
                for atype in angular_types:
                    if msg.startswith(atype):
                        if len(dfs[lang][atype]) < MAX_ITEMS_PER_TYPE:
                            diff = row['mods'][0]
                            old_path = 'a/' + diff['old_path']
                            new_path = 'b/' + diff['new_path']
                            diff_content = diff['diff']
                            dfs[lang][atype].append({
                                'msg': row['message'],
                                'diff': f"diff --git {old_path} {new_path} {diff_content}",
                                'date': row['date'],
                                'repo': row['repo']
                            })
                            bars[lang].update(1)
                        break

            # Stop scanning this language once every type has reached the cap
            if all(len(dfs[lang][atype]) >= MAX_ITEMS_PER_TYPE for atype in angular_types):
                print(f"Reached {MAX_ITEMS_PER_TYPE} rows for all types in {lang}")
                pending_languages.remove(lang)
                break

    # No language needs more items: skip the remaining files
    if not pending_languages:
        break

# Finalize the progress bars so their display is flushed
for bar in bars.values():
    bar.close()

JavaScript:   0%|          | 0/100000 [00:00<?, ?it/s]

In [10]:
# Count the collected items per language and commit type.
# The table is derived from `dfs` itself rather than from `languages`, so it
# has an entry for every collected language even if `languages` was modified
# after collection. The loop variable is named `by_type` so the module-level
# `types` list used by the labelling cells is not overwritten.
type_counts = {
    lang: {atype: len(items) for atype, items in by_type.items()}
    for lang, by_type in dfs.items()
}

# Print the counts
for lang, counts in type_counts.items():
    print(f"Language: {lang}")
    for atype, count in counts.items():
        print(f"  Type: {atype}, Count: {count}")

Language: JavaScript
  Type: build, Count: 1360
  Type: ci, Count: 1585
  Type: docs, Count: 10000
  Type: feat, Count: 10000
  Type: fix, Count: 10000
  Type: perf, Count: 519
  Type: refactor, Count: 7283
  Type: style, Count: 1112
  Type: test, Count: 4480
  Type: chore, Count: 10000


In [11]:
from sklearn.model_selection import train_test_split
import pandas as pd
import json
# Split every (language, type) bucket into a database part and a held-out
# test part, then write each part to its own JSON file.
output_dir = '../data/angular_filtered/subsets'
test_dir = os.path.join(output_dir, 'type_test')
db_dir = os.path.join(output_dir, 'type_db')
# open(..., 'w') does not create missing parent directories, so make sure the
# two sub-folders exist (this also creates output_dir itself).
os.makedirs(test_dir, exist_ok=True)
os.makedirs(db_dir, exist_ok=True)

# `by_type` (not `types`) so the module-level `types` list is not overwritten
for lang, by_type in dfs.items():
    for atype, items in by_type.items():
        if items:  # Ensure there are items to split
            # 9:1 split: 90% goes to the database, 10% to the test set
            db_items, test_items = train_test_split(items, test_size=0.1, random_state=42)

            # Save to JSON files
            with open(os.path.join(test_dir, f'{lang}_{atype}_test.json'), 'w') as test_file:
                json.dump(test_items, test_file, indent=4)

            with open(os.path.join(db_dir, f'{lang}_{atype}_db.json'), 'w') as db_file:
                json.dump(db_items, db_file, indent=4)

print("Data splitting and saving completed.")

Data splitting and saving completed.


In [12]:
import os
import json
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Re-split the per-type test files: a small dev subset goes to test_dev.json,
# and the complete test set (dev subset included) goes to test_data.json.
input_dir = '../data/angular_filtered/subsets/type_test'
output_dir = '../data/angular_filtered/subsets'
os.makedirs(output_dir, exist_ok=True)

# Items grouped by commit type, and the number of test items per type
data_by_type = {}
type_counts = defaultdict(int)

test_suffix = '_test.json'

# os.listdir returns entries in arbitrary order; sort so the output files are
# written in the same order on every run.
for file_name in sorted(os.listdir(input_dir)):
    if file_name.endswith(test_suffix):
        # File names look like "<language>_<type>_test.json": the type is the
        # last "_"-separated field before the suffix.
        atype = file_name[:-len(test_suffix)].rsplit('_', 1)[-1]
        file_path = os.path.join(input_dir, file_name)

        # Load the JSON data
        with open(file_path, 'r') as file:
            items = json.load(file)

        # Tag each item with its type so it survives merging across types
        for item in items:
            item['type'] = atype

        data_by_type.setdefault(atype, []).extend(items)
        type_counts[atype] += len(items)

# Combined dev subset, the remaining items, and the real dev size per type
test_dev_items = []
test_all_items = []
dev_counts = {}

for atype, items in data_by_type.items():
    if items:
        # 1:9 split per type: 10% to the dev subset, 90% to the remainder
        dev_items, all_items = train_test_split(items, test_size=0.9, random_state=42)

        test_dev_items.extend(dev_items)
        test_all_items.extend(all_items)
        dev_counts[atype] = len(dev_items)

# Save the dev subset
with open(os.path.join(output_dir, 'test_dev.json'), 'w') as test_dev_file:
    json.dump(test_dev_items, test_dev_file, indent=4)

# test_data.json holds the full test set: the remainder plus the dev subset
test_all_items.extend(test_dev_items)
with open(os.path.join(output_dir, 'test_data.json'), 'w') as test_all_file:
    json.dump(test_all_items, test_all_file, indent=4)

print("Data re-splitting and saving completed.")

# Report the actual number of dev items per type (not an estimate)
print("Counts of each type in test_dev.json:")
for atype in type_counts:
    dev_count = dev_counts.get(atype, 0)
    print(f"Type: {atype}, Count: {dev_count}")


Data re-splitting and saving completed.
Counts of each type in test_dev.json:
Type: build, Count: 13
Type: chore, Count: 100
Type: ci, Count: 15
Type: docs, Count: 100
Type: feat, Count: 100
Type: fix, Count: 100
Type: perf, Count: 5
Type: refactor, Count: 72
Type: style, Count: 11
Type: test, Count: 44


In [18]:
import glob
import os
import json
# Merge every per-type database file into a single db_data.json.
output_dir = '../data/angular_filtered/subsets/type_db'
# Despite the variable names, these are the "*_db.json" database files.
# glob returns matches in arbitrary order; sort so the merged file is
# written in the same order on every run.
test_files = sorted(glob.glob(os.path.join(output_dir, '*_db.json')))

all_test_items = []

# Read and concatenate all database JSON files
for file in test_files:
    with open(file, 'r') as f:
        items = json.load(f)
        all_test_items.extend(items)

# Save the merged database items to a single JSON file
with open(os.path.join('../data/angular_filtered/subsets', 'db_data.json'), 'w') as test_file:
    json.dump(all_test_items, test_file, indent=4)

print("Merging completed.")

Merging completed.


In [16]:
import json
import glob
import os
import random
# Build the classification test set: one {"code", "label"} record per test
# item, shuffled with a fixed seed and written as JSONL.
random.seed(42)

# Destination directory for the classification files
output_dir = '../data/angular_filtered/subsets/classification'
# open(..., 'w') does not create missing directories
os.makedirs(output_dir, exist_ok=True)

# Commit types; a type's position in this list is its integer label
types = [
    'build', 'ci', 'docs', 'feat', 'fix',
    'perf', 'refactor', 'style', 'test', 'chore'
]
type_label_mapping = {type_name: idx for idx, type_name in enumerate(types)}

# Save the type-label mapping to a txt file
with open(os.path.join(output_dir, 'type_label_mapping.txt'), 'w') as mapping_file:
    for type_name, label in type_label_mapping.items():
        mapping_file.write(f'{type_name}: {label}\n')

all_test_items = []

# Read the per-type test files; a type without a file is skipped
for type_name in types:
    test_file = os.path.join('../data/angular_filtered/subsets/type_test', f'JavaScript_{type_name}_test.json')
    if os.path.exists(test_file):
        with open(test_file, 'r') as f:
            items = json.load(f)
            for item in items:
                all_test_items.append({
                    "code": item['diff'],
                    "label": type_label_mapping[type_name]
                })

# Shuffle the items to randomize their order
random.shuffle(all_test_items)

# Save all items to a single JSONL file
with open(os.path.join(output_dir, 'test.jsonl'), 'w') as jsonl_file:
    for item in all_test_items:
        jsonl_file.write(json.dumps(item) + '\n')

print("Merging and conversion to JSONL completed.")

Merging and conversion to JSONL completed.


In [17]:
import json
import glob
import os
from sklearn.model_selection import train_test_split
import random
random.seed(42)

# Destination folder for the classification JSONL files
output_dir = '../data/angular_filtered/subsets/classification'

# Commit types; a type's position in this list is its integer label
types = [
    'build', 'ci', 'docs', 'feat', 'fix',
    'perf', 'refactor', 'style', 'test', 'chore'
]
type_label_mapping = dict(zip(types, range(len(types))))

train_items = []
valid_items = []

# Turn each per-type database file into labelled samples and split them
for type_name in types:
    db_path = os.path.join('../data/angular_filtered/subsets/type_db', f'JavaScript_{type_name}_db.json')
    if not os.path.exists(db_path):
        continue  # no database file was written for this type

    with open(db_path, 'r') as db_file:
        db_records = json.load(db_file)

    label = type_label_mapping[type_name]
    samples = []
    for record in db_records:
        samples.append({"code": record['diff'], "label": label})

    # 8:1 split per type: one ninth of the samples goes to validation
    train_part, valid_part = train_test_split(samples, test_size=1/9, random_state=42)
    valid_items += valid_part
    train_items += train_part

# Shuffle validation first, then training (the order matters for the seeded RNG)
random.shuffle(valid_items)
random.shuffle(train_items)

# Write one JSON object per line: validation file first, then training file
for jsonl_name, records in (('valid.jsonl', valid_items), ('train.jsonl', train_items)):
    with open(os.path.join(output_dir, jsonl_name), 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in records)

print("Data splitting and saving to JSONL files completed.")


Data splitting and saving to JSONL files completed.


In [None]:
from collections import defaultdict
# Count how many validation items carry each commit type and save the counts.
type_counts = defaultdict(int)

# Decode labels through `type_label_mapping`, the mapping that produced them.
# The module-level `types` list is not used here because the counting cells
# rebind `types` to a dict in their loops.
label_to_type = {label: type_name for type_name, label in type_label_mapping.items()}

# Count the number of items for each type
for item in valid_items:
    type_counts[label_to_type[item["label"]]] += 1

# Output the counts to a file
count_output_file = os.path.join(output_dir, 'valid_type_counts.txt')
with open(count_output_file, 'w') as f:
    for type_name, count in type_counts.items():
        f.write(f'{type_name}: {count}\n')
print("Item counts for each type in the validation data have been computed and saved.")

In [None]:
# Count the collected items per language and commit type.
# The table is derived from `dfs` itself rather than from `languages`, so it
# has an entry for every collected language even if `languages` was modified
# after collection. The loop variable is named `by_type` so the module-level
# `types` list used by the labelling cells is not overwritten.
type_counts = {
    lang: {atype: len(items) for atype, items in by_type.items()}
    for lang, by_type in dfs.items()
}

# Print the counts
for lang, counts in type_counts.items():
    print(f"Language: {lang}")
    for atype, count in counts.items():
        print(f"  Type: {atype}, Count: {count}")

In [None]:
# Longest collected diff (in characters) per language and commit type.
# Every type starts at 0 and keeps that value when its bucket is empty.
max_diff_lengths = {}
for lang in dfs.keys():
    per_type = dict.fromkeys(angular_types, 0)
    for atype, entries in dfs[lang].items():
        if entries:
            per_type[atype] = max(map(len, (entry['diff'] for entry in entries)))
    max_diff_lengths[lang] = per_type

# Report the result, one block per language
for lang in max_diff_lengths:
    print(f"Language: {lang}")
    for atype in max_diff_lengths[lang]:
        length = max_diff_lengths[lang][atype]
        print(f"  Type: {atype}, Max Diff Length: {length}")


In [4]:
import json
import os
import random
# Convert the dev test subset into a shuffled JSONL classification file.
# Seed the shuffle so the output order is reproducible.
random.seed(42)

# Commit types; a type's position in this list is its integer label
types = [
    'build', 'ci', 'docs', 'feat', 'fix',
    'perf', 'refactor', 'style', 'test', 'chore'
]
type_label_mapping = {type_name: idx for idx, type_name in enumerate(types)}
all_test_items = []

subsets_dir = '../data/angular_filtered/subsets'
classification_dir = os.path.join(subsets_dir, 'classification')
# open(..., 'w') does not create missing directories
os.makedirs(classification_dir, exist_ok=True)

# 'dev_test.json' is the name this cell originally looked for; 'test_dev.json'
# is the name written by the cell that creates the dev subset. Use whichever exists.
candidate_names = ('dev_test.json', 'test_dev.json')
test_file = next(
    (path for path in (os.path.join(subsets_dir, name) for name in candidate_names)
     if os.path.exists(path)),
    None,
)
if test_file is None:
    # Without the input the only possible result is an empty dataset; fail
    # loudly instead of writing an empty file and reporting success.
    raise FileNotFoundError(
        f"None of {candidate_names} found in {subsets_dir}; run the dev re-split cell first."
    )

# Each item carries its commit type under the 'type' key
with open(test_file, 'r') as f:
    items = json.load(f)
    for item in items:
        all_test_items.append({
            "code": item['diff'],
            "label": type_label_mapping[item['type']]
        })

# Shuffle the items to randomize their order
random.shuffle(all_test_items)

# Save all items to a single JSONL file
with open(os.path.join(classification_dir, 'angular_dev_test.jsonl'), 'w') as jsonl_file:
    for item in all_test_items:
        jsonl_file.write(json.dumps(item) + '\n')

print("Merging and conversion to JSONL completed.")

Merging and conversion to JSONL completed.
