In [ ]:
from huggingface_hub import hf_hub_download, snapshot_download
import pyarrow.parquet as pq
import glob
import os
from tqdm.auto import tqdm
import json
import pickle

In [None]:
# Fetch the CommitChronicle dataset repository from the Hugging Face Hub,
# using the current directory as the cache location.
snapshot_download(
    repo_id="JetBrains-Research/commit-chronicle",
    repo_type="dataset",
    cache_dir='./',
)

In [ ]:
# Path of the data folder inside the locally cached dataset snapshot.
folder_path = './datasets--JetBrains-Research--commit-chronicle/snapshots/5fd076e67b812a9f3d1999e5e40f71715f84bb51/data'

# Read the first training shard and convert it to a pandas DataFrame.
first_shard_path = folder_path + '/train-00000-of-00061-2a7ccc8e843f5f5b.parquet'
first_shard_table = pq.read_table(first_shard_path)
df = first_shard_table.to_pandas()

In [ ]:


# Languages to sample. An entry is removed from this list as soon as its
# quota has been collected, so an empty list means "all done".
languages = ['Java', 'Python', 'C#', 'C++', 'JavaScript']

# Number of items to collect per language, and the maximum length
# (in characters) of the rendered diff text of an accepted commit.
MAX_ROWS_PER_LANG = 100000
MAX_DIFF_CHARS = 3000

# Collected items per language (plain lists of dicts, not DataFrames).
dfs = {lang: [] for lang in languages}

# One tqdm progress bar per language, counting towards the quota.
bars = {lang: tqdm(total=MAX_ROWS_PER_LANG, desc=lang) for lang in languages}

folder_path = './datasets--JetBrains-Research--commit-chronicle/snapshots/5fd076e67b812a9f3d1999e5e40f71715f84bb51/data'  # path of the data folder


def _render_diff(mod):
    """Return the diff text for one file modification, as stored in the output items."""
    return f"diff --git a/{mod['old_path']} b/{mod['new_path']} {mod['diff']}"


def _is_eligible(mods):
    """Return True if a commit modifies exactly one file and its diff is short enough.

    The checks short-circuit, so ``mods[0]`` is only touched when there is
    exactly one modification; a commit with no modifications is rejected
    instead of raising IndexError.
    """
    if mods is None or len(mods) != 1:
        return False
    mod = mods[0]
    return mod['change_type'] == 'MODIFY' and len(_render_diff(mod)) <= MAX_DIFF_CHARS


# glob returns matches in arbitrary order; sort them so that the capped
# sample is the same on every run.
files = sorted(glob.glob(os.path.join(folder_path, 'train*')))
for file in files:
    df = pq.read_table(file).to_pandas()

    # Eligibility does not depend on the language, so evaluate it once per file.
    # astype(bool) keeps the mask boolean even when the shard has no rows.
    eligible = df[df['mods'].apply(_is_eligible).astype(bool)]

    # Iterate over a snapshot: `languages` is mutated below when a quota is
    # reached, and removing from a list while iterating over it would skip
    # the language that follows the removed one.
    for lang in tuple(languages):
        lang_df = eligible[eligible['language'] == lang]

        for message, mods in zip(lang_df['message'], lang_df['mods']):
            item = {
                'msg': message,
                'diff': _render_diff(mods[0]),
                'lang': lang,
            }
            dfs[lang].append(item)
            bars[lang].update(1)

            # Stop collecting for this language once its quota is full.
            if len(dfs[lang]) >= MAX_ROWS_PER_LANG:
                print(f"Reached {MAX_ROWS_PER_LANG:,} rows for {lang}")
                languages.remove(lang)  # no further processing for this language
                break

    # Stop reading shards once every language has reached its quota.
    if not languages:
        break

# Release the progress bars so they stop redrawing.
for bar in bars.values():
    bar.close()

In [ ]:
# Flatten the per-language lists into one list, in the key order of dfs.
all_items = []
for lang_items in dfs.values():
    all_items.extend(lang_items)

# Write the combined items to disk as indented JSON.
with open('../data/chronicle/chronicle_rag_db.json', 'w') as json_file:
    json.dump(all_items, json_file, indent=4)

In [ ]:
# Map each item's 1-based position in all_items to its commit message.
data = {
    position: entry['msg']
    for position, entry in enumerate(all_items, start=1)
}

# Serialize the position -> message mapping with pickle.
with open("../data/chronicle/rag_msg.pkl", "wb") as pickle_file:
    pickle.dump(data, pickle_file)