In [None]:
!pip install langchain
!pip install jq
!pip install sentence-transformers

In [None]:
!pip install redis

Collecting redis
  Downloading redis-5.0.3-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.8/251.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis
Successfully installed redis-5.0.3


In [None]:
from huggingface_hub import hf_hub_download, snapshot_download

In [None]:
# Download a snapshot of the JetBrains-Research/commit-chronicle dataset repository from the
# Hugging Face Hub, using the current directory as the cache directory.
snapshot_download(repo_id="JetBrains-Research/commit-chronicle", repo_type="dataset", cache_dir='./')

In [None]:
import pandas as pd
import pyarrow.parquet as pq
import glob
import os
from tqdm import tqdm

# Upper bound on the number of records collected for each language.
MAX_ITEMS_PER_LANGUAGE = 100000

# Languages still being collected; a language is removed once it reaches the cap.
languages = ['Java', 'Python', 'C#', 'C++', 'JavaScript']

# Collected records per language; each record is a dict with the keys 'msg', 'diff' and 'lang'.
dfs = {lang: [] for lang in languages}

# One progress bar per language, advanced by one for every record collected.
bars = {lang: tqdm(total=MAX_ITEMS_PER_LANGUAGE, desc=lang, unit='items') for lang in languages}

folder_path = './datasets--JetBrains-Research--commit-chronicle/snapshots/5fd076e67b812a9f3d1999e5e40f71715f84bb51/data'  # path of the data folder

files = glob.glob(os.path.join(folder_path, 'train*'))
for file in files:
    df = pq.read_table(file).to_pandas()

    # Keep only commits that touch exactly one file and whose single change is a MODIFY.
    # The length filter is applied first so that x[0] is never evaluated on an empty `mods`
    # (combining both conditions with `&` would evaluate x[0] on every row).
    single_mod_df = df[df['mods'].apply(len) == 1]
    modify_df = single_mod_df[single_mod_df['mods'].apply(lambda x: x[0]['change_type']) == 'MODIFY']

    # Iterate over a copy: `languages` shrinks inside the loop, and removing an element
    # from the list being iterated would silently skip the language that follows it.
    for lang in list(languages):
        lang_df = modify_df[modify_df['language'] == lang]

        for index, row in lang_df.iterrows():
            diff = row['mods'][0]
            old_path = 'a/' + diff['old_path']
            new_path = 'b/' + diff['new_path']
            diff_content = diff['diff']  # diff text of the single modified file
            item = {
                'msg': row['message'],
                'diff': f"diff --git {old_path} {new_path} {diff_content}",
                'lang': lang
            }
            dfs[lang].append(item)
            # The bar itself reports progress; no per-row message is written because
            # that produces one output line per record.
            bars[lang].update(1)

            # Stop collecting this language once the cap is reached.
            if len(dfs[lang]) >= MAX_ITEMS_PER_LANGUAGE:
                print(f"Reached {MAX_ITEMS_PER_LANGUAGE:,} rows for {lang}")
                languages.remove(lang)  # no further processing for this language
                break

    # Stop reading files once every language has reached the cap.
    if not languages:
        break

# Release the progress bars so they stop redrawing.
for bar in bars.values():
    bar.close()






Java:   0%|          | 0/100000 [00:00<?, ?items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:00<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:00<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:00<?, ?items/s][A[A[A[A[A[A[A[A








Java:  11%|█▏        | 11396/100000 [01:36<12:33, 117.59it/s] 
Python:  17%|█▋        | 17086/100000 [01:36<07:50, 176.30it/s] 
C#:   3%|▎         | 2711/100000 [01:36<57:57, 27.97it/s] 
C++:  17%|█▋        | 16853/100000 [01:36<07:58, 173.90it/s]  
JavaScript:   6%|▌         | 5679/100000 [01:36<26:49, 58.60it/s]  





Java:   0%|          | 1/100000 [00:03<84:36:24,  3.05s/items][A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 1/100000 [00:03<84:36:24,  3.05s/items][A[A[A[A[A





Python:   0%|          | 0/100000 [00:03<?, ?items/s][A

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


Java:   0%|          | 7/100000 [00:03<13:19:27,  2.08items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A[A




Java:   0%|          | 8/100000 [00:03<7:30:28,  3.70items/s] [A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 8/100000 [00:03<7:30:28,  3.70items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:03<?

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


JavaScript:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A[A




Java:   0%|          | 15/100000 [00:03<3:08:44,  8.83items/s][A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 15/100000 [00:03<3:08:44,  8.83items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 16/100000 [00:03<3:08:44,  8.83items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:03<?, ?items/s][

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A





Python:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A[A




Java:   0%|          | 21/100000 [00:03<2:03:50, 13.46items/s][A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 21/100000 [00:03<2:03:50, 13.46items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A




Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 25/100000 [00:03<1:46:36, 15.63items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:03<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 26/100000 [00:04<1:46:36, 15.63items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/10

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 32/100000 [00:04<1:22:29, 20.20items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 33/100000 [

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 38/100000 [00:04<1:12:06, 23.11items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 39/100000 [00:04<1:12:05, 23.11items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:04<?, ?items/s]

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 45/100000 [00:04<1:05:29, 25.44items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 46/100000 [

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...









[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 51/100000 [00:04<1:01:52, 26.92items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 52/100000 [00:04<1:01:52, 26.92items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:04<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|         

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


Python:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 58/100000 [00:05<58:32, 28.45items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A

Processing Java...
Processing Java...
Processing Java...
Processing Java...


C++:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 62/100000 [00:05<55:32, 29.99items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 63/100000 [00:05<55:32, 29.99items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:05<?, ?items/s][A[

Processing Java...
Processing Java...
Processing Java...










[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 65/100000 [00:05<1:31:01, 18.30items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 66/100000 [00:05<1:31:01, 18.30items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:05<?, ?items

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


JavaScript:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 70/100000 [00:05<1:29:20, 18.64items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:05<?, ?items/s][A[A[A[A[A[A[A[A[A




Java:   0%|          | 71/100000 [00:05<1:29:07, 18.69items/s][A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 71/100000 [00:05<1:29:07, 18.69items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:05<?, ?items/s][

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A[A




Java:   0%|          | 75/100000 [00:06<1:30:14, 18.46items/s][A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 75/100000 [00:06<1:30:14, 18.46items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A





Python:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 80/100000 [00:06<1:32:29, 18.00items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A[A




Java:   0%|          | 81/100000 [00:06<1:28:42, 18.77items/s][A[A[A[A[A




[A[A[A[A[A




Processing Java...
Processing Java...
Processing Java...
Processing Java...









[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 84/100000 [00:06<1:33:26, 17.82items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A[A[A




Java:   0%|          | 85/100000 [00:06<2:09:07, 12.90items/s][A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 85/100000 [00:06<2:09:07, 12.90items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:06<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 89/100000 [00:07<1:45:00, 15.86items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A[A




Java:   0%|          | 90/100000 [00:07<1:47:47, 15.45items/s][A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 90/100000 [00:07<1:47:47, 15.45items/s][A[A[A[A[A





Python:   0%|         

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 95/100000 [00:07<1:33:33, 17.80items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 96/100000 [00:07<1:33:33, 17.80items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:07<?, ?items/s][A[

Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 98/100000 [00:07<1:18:30, 21.21items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 99/100000 [00:07<1:18:30, 21.21items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A








JavaScri

Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 105/100000 [00:07<1:16:57, 21.63items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 106/100000 [00:07<1:16:57, 21.63items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/

Processing Java...
Processing Java...
Processing Java...


[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 108/100000 [00:07<1:07:50, 24.54items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:07<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 109/100000 [00:08<1:07:50, 24.54items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A[A[A








JavaSc

Processing Java...
Processing Java...
Processing Java...
Processing Java...








Python:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




Java:   0%|          | 112/100000 [00:08<1:33:42, 17.77items/s][A[A[A[A[A





Python:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A






C#:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A[A







C++:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A[A[A








JavaScript:   0%|          | 0/100000 [00:08<?, ?items/s][A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A

Processing Java...
Processing Java...


[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A

KeyboardInterrupt: 

In [None]:
dfs['C#'][0]

{'msg': 'add known external unity packages to gitignore',
 'diff': 'diff --git a/src/SpectatorView.Unity/.gitignore b/src/SpectatorView.Unity/.gitignore ',
 'lang': 'C#'}

In [None]:
import pickle

# Small id -> sentence mapping, serialized to rag.pkl in the working directory.
data = {
    1: "Removed assignment from conditional statement",
    2: "sentence 2",
    3: "sentence 3",
}

with open("rag.pkl", mode="wb") as handle:
    handle.write(pickle.dumps(data))

In [None]:
# Read the object pickled in rag.pkl back into `data`.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
with open("rag.pkl", mode="rb") as handle:
    data = pickle.loads(handle.read())

In [None]:
from langchain_community.document_loaders import JSONLoader

In [None]:
def _rag_field_loader(field):
    """Build a JSONLoader that extracts `field` from every record of ../data/rag_data.json.

    The loader uses the jq schema ``.[].<field>`` and is created with ``text_content=False``.
    """
    return JSONLoader(
        file_path='../data/rag_data.json',
        jq_schema=f'.[].{field}',
        text_content=False)


# One loader / document list per field; the three lists are later indexed in parallel.
diff_loader = _rag_field_loader('diff')
diff_data = diff_loader.load()

msg_loader = _rag_field_loader('msg')
msg_data = msg_loader.load()

language_loader = _rag_field_loader('language')
language_data = language_loader.load()

In [None]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# `language_dict` maps a language name to the position of its splitter in `splitters`
# (and of its enum member in `languages`), so a splitter is chosen by name per document.
language_dict = {
    name: position
    for position, name in enumerate(('java', 'cpp', 'csharp', 'python', 'javascript'))
}

languages = [Language.JAVA, Language.CPP, Language.CSHARP, Language.PYTHON, Language.JS]

# One language-aware splitter per entry of `languages`: 300-character chunks, no overlap.
splitters = [
    RecursiveCharacterTextSplitter.from_language(lang_enum, chunk_size=300, chunk_overlap=0)
    for lang_enum in languages
]

In [None]:
from tqdm import tqdm

# Split every diff with the splitter matching its language; the language of document
# number idx is read from the entry at the same position in `language_data`.
diff_split = []
progress = tqdm(enumerate(diff_data), total=len(diff_data), desc="Processing documents")
for idx, document in progress:
    splitter = splitters[language_dict[language_data[idx].page_content]]
    diff_split.extend(splitter.split_documents([document]))

Processing documents: 100%|██████████| 500000/500000 [01:08<00:00, 7344.55it/s]


In [None]:
len(diff_split)

2774108

In [None]:
diff_split[0]

Document(page_content='diff --git a/butterknife-runtime/src/main/java/butterknife/internal/Utils.java \nppp b/butterknife-runtime/src/main/java/butterknife/internal/Utils.java', metadata={'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\rag_data.json', 'seq_num': 1})

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
import torch

# Hugging Face Hub id of the pre-trained sentence-embedding model
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Model options: run on the GPU when CUDA is available, otherwise fall back to the CPU
# so the cell does not fail on machines without a GPU.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}

# Encoding options: 'normalize_embeddings' is explicitly set to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize HuggingFaceEmbeddings with the model id and the two option dictionaries
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [None]:
from langchain_community.vectorstores import Chroma

In [None]:
# Open the Chroma vector store kept in ./chroma_db, with `embeddings` as its embedding function.
db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

In [None]:

# Embed every document in `diff_data` (the whole diffs, not the chunks in `diff_split`)
# and store them in a Chroma vector store persisted under ./chroma_db.
db = Chroma.from_documents(
    documents=diff_data,
    embedding=embeddings,
    persist_directory="./chroma_db",
)


KeyboardInterrupt



In [None]:
# Ask the vector store to write its contents to its persist directory.
db.persist()

In [None]:
# Load the `diff` field of every record in the test file as the list of query documents.
test_diff_loader = JSONLoader(
    text_content=False,
    jq_schema='.[].diff',
    file_path='../data/msg_nngen_nmt_codebert_chatgpt.json',
)
test_diff_data = test_diff_loader.load()

In [None]:
def similarity_search(documents, store=None):
    """Return the 0-based index of the stored document that best matches `documents`.

    Every document in `documents` is used as a query against the vector store. The
    relevance scores of the hits are summed per candidate `seq_num`, and the candidate
    with the highest total wins.

    Args:
        documents: Iterable of objects exposing a `page_content` string (e.g. the chunks
            of one diff).
        store: Object providing `similarity_search_with_relevance_scores(query,
            score_threshold=...)` that returns `(document, score)` pairs. Defaults to the
            notebook-level `db` vector store.

    Returns:
        The winning candidate's `seq_num` minus one (the code treats `seq_num` as 1-based).

    Raises:
        ValueError: If no query produced any candidate (including when `documents` is empty).
    """
    if store is None:
        store = db  # fall back to the notebook-level vector store

    # Total relevance score accumulated for each candidate seq_num
    aggregate_scores = {}

    for document in documents:
        results = store.similarity_search_with_relevance_scores(document.page_content, score_threshold=0.0)
        for candidate_doc, score in results:
            candidate_id = candidate_doc.metadata['seq_num']
            aggregate_scores[candidate_id] = aggregate_scores.get(candidate_id, 0) + score

    # max() on an empty dict raises an uninformative ValueError; report the real cause instead.
    if not aggregate_scores:
        raise ValueError("similarity_search found no candidates for the given documents")

    max_candidate_id = max(aggregate_scores, key=aggregate_scores.get)
    return max_candidate_id - 1

In [None]:
# Wrap the vector store in a retriever, using the default retriever settings (no arguments passed).
retriever = db.as_retriever()

In [None]:
test_diff_data[0].page_content

'diff --git a/MPChartLib/src/com/github/mikephil/charting/renderer/PieChartRenderer.java  b/MPChartLib/src/com/github/mikephil/charting/renderer/PieChartRenderer.java \npublic void drawValues(Canvas c){ \npublic void drawExtras(Canvas c){// drawCircles(c); \ndrawHole(c);-c.drawBitmap(mDrawBitmap,0,0,mRenderPaint);+c.drawBitmap(mDrawBitmap,0,0,null); \ndrawCenterText(c); \n} \n'

In [None]:
retriever.get_relevant_documents(test_diff_data[0].page_content)[0].metadata['seq_num']

69489

In [None]:
from tqdm import tqdm

In [None]:
# For each of the first 100 test diffs, keep the first document the retriever returns.
similar_diff = []
first_hundred = test_diff_data[:100]
for i, test_data in tqdm(enumerate(first_hundred), total=len(first_hundred), desc="Processing documents"):
    query = test_data.page_content
    best_match = retriever.get_relevant_documents(query)[0]
    similar_diff.append(best_match)

Processing documents: 100%|██████████| 100/100 [00:01<00:00, 68.64it/s]


In [None]:
len(similar_diff)

100

In [None]:
msg_data[similar_diff[0].metadata['seq_num']-1].page_content

"Change Rendeder ' s access level modifiers for extra draw methods"

In [None]:
import json

# Pair each test diff with its retrieved diff and that diff's commit message.
# The message is looked up in `msg_data` at position seq_num - 1.
data = [
    {
        'sim_msg': msg_data[sim_diff.metadata['seq_num']-1].page_content,
        'sim_diff': sim_diff.page_content,
        'org_diff': test_diff.page_content,
    }
    for sim_diff, test_diff in zip(similar_diff, test_diff_data[:100])
]

In [None]:
# Serialize `data` as indented, non-ASCII-escaped JSON and save it as UTF-8 text.
serialized = json.dumps(data, ensure_ascii=False, indent=4)
with open('../data/test_data_with_rag_100.json', 'w', encoding='UTF-8') as out_file:
    out_file.write(serialized)

In [None]:
# Take the text of the first test diff and split it into chunk documents with the
# first splitter in `splitters`.
query =test_diff_data[0].page_content
documents = splitters[0].create_documents([query])

In [None]:
docs = retriever.get_relevant_documents(documents[0].page_content)

In [None]:
db.similarity_search_with_relevance_scores(documents[0].page_content)[1]

(Document(page_content='diff --git a/MPChartLib/src/main/java/com/github/mikephil/charting/renderer/BubbleChartRenderer.java \nppp b/MPChartLib/src/main/java/com/github/mikephil/charting/renderer/BubbleChartRenderer.java \nprotected void drawDataSet(Canvas c,IBubbleDataSet dataSet){ \nif(! mViewPortHandler.isInBoundsRight(pointBuffer[0]- shapeHalf)) \nbreak;- final int color=dataSet.getColor(( int)entry.getX());+final int color=dataSet.getColor(j); \nmRenderPaint.setColor(color); \nc.drawCircle(pointBuffer[0], pointBuffer[1], shapeHalf,mRenderPaint); \n', metadata={'seq_num': 27766, 'source': 'D:\\TU Delft\\thesis\\LLM_CMG\\llm4commit\\data\\rag_data.json'}),
 0.736459085862412)

Processing documents: 100%|██████████| 5000/5000 [1:46:31<00:00,  1.28s/it]  


In [None]:
# NOTE(review): `gpt_msg` is not defined in any cell visible in this notebook; it is
# expected to hold one generated message per record of the input file before this cell runs.
with open('../data/msg_nngen_nmt_codebert_chatgpt.json', 'r', encoding='UTF-8') as f:
    org_data = json.load(f)

# zip() stops at the shorter input, so a length mismatch would silently leave records
# without a 'chatgpt_rag' entry (or drop messages). Fail loudly instead.
if len(org_data) != len(gpt_msg):
    raise ValueError(
        f"Expected {len(org_data)} generated messages (one per record), got {len(gpt_msg)}"
    )

# Attach each generated message to its record under the 'chatgpt_rag' key.
for item, msg in zip(org_data, gpt_msg):
    item['chatgpt_rag'] = msg

# Save the augmented records to a new file; the input file is left untouched.
output_file = '../data/msg_nngen_nmt_codebert_chatgpt_rag.json'
with open(output_file, 'w', encoding='UTF-8') as f:
    json.dump(org_data, f, ensure_ascii=False, indent=4)

In [None]:
len(gpt_msg)

5000