In [1]:
import pandas as pd

def get_id(dir_path='cmg-data/split-data', type='randomly'):
    """Read the train and test id lists of one split strategy.

    Args:
        dir_path: Directory that holds one sub-directory per split strategy.
        type: Name of the split sub-directory to read from.

    Returns:
        A ``(train_id, test_id)`` tuple of lists of strings: one entry per line
        of ``train_id.txt`` / ``test_id.txt``, with trailing whitespace removed.
    """
    def _read_ids(filename):
        # One id per line; only trailing whitespace (including the newline) is dropped.
        with open(f'{dir_path}/{type}/{filename}') as handle:
            return [raw_line.rstrip() for raw_line in handle]

    train_ids = _read_ids('train_id.txt')
    test_ids = _read_ids('test_id.txt')
    return train_ids, test_ids

# Load the processed commit table, then keep the rows whose 'index' value is
# listed in the cross_project train / test id files.
df = pd.read_parquet('cmg-data/cmg-data-processed.parquet', engine='fastparquet')
train_id, test_id = get_id(dir_path='cmg-data/split-data', type='cross_project')
train = df.loc[df['index'].isin(train_id)]
test = df.loc[df['index'].isin(test_id)]

In [2]:
import json
# Load the diff text used by the sequence-building cells, which look it up as
# diffler[<commit index>].
# JSON is UTF-8 by specification; the encoding is pinned so decoding does not
# depend on the platform's locale default.
with open('diff_pdg.json', encoding='utf-8') as f:
    diffler = json.load(f)

In [7]:
import nltk
from nltk import WordNetLemmatizer, pos_tag, WordPunctTokenizer, data
from nltk.corpus import wordnet
from tqdm import tqdm
import re

def write_string_to_file(absolute_filename, string):
    """Write ``string`` to ``absolute_filename``, replacing any existing content.

    Args:
        absolute_filename: Path of the file to create or truncate.
        string: Text written verbatim; no trailing newline is added.
    """
    with open(absolute_filename, mode='w') as sink:
        sink.write(string)

def word_tokenizer(sentence):
    """Tokenize ``sentence`` with NLTK's ``WordPunctTokenizer``.

    Returns:
        The list of tokens produced by ``WordPunctTokenizer().tokenize(sentence)``.
    """
    return WordPunctTokenizer().tokenize(sentence)

# Build one (diff, message, language) line per commit of the training frame and
# write the first N_TRAIN_COMMITS commits to cmg.train.*, the rest to cmg.valid.*.
# NOTE(review): the cell that builds the test split repeats this loop; consider
# extracting a shared helper.

# Number of leading commits that go to the train files; the remainder is the
# validation split.
N_TRAIN_COMMITS = 23172

source_seqs = list()
target_seqs = list()
lang_seqs = list()

# groupby(sort=False) yields one sub-frame per commit in order of first
# appearance (the same order train['index'].unique() gives) without rescanning
# the whole frame for every commit. Grouping by the Series itself avoids any
# label/index-name ambiguity for a column called 'index'.
for index, df_commit in tqdm(train.groupby(train['index'], sort=False)):
    langs = list()
    source_seq = ''
    target_seq = ''

    for _, row in df_commit.iterrows():
        old_path = row['old_path_file']
        new_path = row['new_path_file']
        # pd.notna treats both None and NaN as missing.
        has_old = pd.notna(old_path)
        has_new = pd.notna(new_path)

        # Language tag from the file extension: 'c' and 'h' count as C,
        # anything else as C++. Falls back to the old path when there is no new one.
        known_path = new_path if has_new else (old_path if has_old else '')
        extension = known_path.split('.')[-1]
        langs.append('c' if extension in ['c', 'h'] else 'cpp')

        if has_old:
            source_seq += 'mmm ' + ' '.join(word_tokenizer(old_path)) + ' <nl> '
        # The new-path header is guarded by the new path itself, so files with
        # no old path (e.g. newly added files) still get their 'ppp' line.
        if has_new:
            source_seq += 'ppp ' + ' '.join(word_tokenizer(new_path)) + ' <nl> '

        # The last row's label is kept (presumably every row of a commit carries
        # the same label - confirm). split/join collapses whitespace and newlines
        # so the message stays on one output line.
        target_seq = ' '.join(row['label'].split())

    # Collect every non-empty diff line of the commit first, then join once, so
    # the ' <nl> ' separator removed by split() is restored between all chunks
    # instead of adjacent chunks being glued together.
    diff_lines = list()
    for diff in diffler[index].split(' <nl> '):
        for line in diff.splitlines():
            line = re.sub('@@.+?@@', '', line)        # drop hunk headers
            line = re.sub(r'\s+', ' ', line).strip()  # collapse whitespace
            if not line:
                continue
            diff_lines.append(' '.join(word_tokenizer(line)))
    source_seq += ' <nl> '.join(diff_lines)

    source_seqs.append(source_seq)
    target_seqs.append(target_seq)
    lang_seqs.append(' '.join(langs))

write_string_to_file('CMG-data/cmg.train.diff', '\n'.join(source_seqs[:N_TRAIN_COMMITS]))
write_string_to_file('CMG-data/cmg.train.msg', '\n'.join(target_seqs[:N_TRAIN_COMMITS]))
write_string_to_file('CMG-data/cmg.train.lang', '\n'.join(lang_seqs[:N_TRAIN_COMMITS]))

write_string_to_file('CMG-data/cmg.valid.diff', '\n'.join(source_seqs[N_TRAIN_COMMITS:]))
write_string_to_file('CMG-data/cmg.valid.msg', '\n'.join(target_seqs[N_TRAIN_COMMITS:]))
write_string_to_file('CMG-data/cmg.valid.lang', '\n'.join(lang_seqs[N_TRAIN_COMMITS:]))

100%|██████████| 25747/25747 [01:50<00:00, 232.82it/s]


In [8]:
# Build one (diff, message, language) line per commit of the test frame and
# write them to the cmg.test.* files.
# NOTE(review): this loop repeats the one in the train/valid cell; consider
# extracting a shared helper.
source_seqs = list()
target_seqs = list()
lang_seqs = list()

# groupby(sort=False) yields one sub-frame per commit in order of first
# appearance (the same order test['index'].unique() gives) without rescanning
# the whole frame for every commit. Grouping by the Series itself avoids any
# label/index-name ambiguity for a column called 'index'.
for index, df_commit in tqdm(test.groupby(test['index'], sort=False)):
    langs = list()
    source_seq = ''
    target_seq = ''

    for _, row in df_commit.iterrows():
        old_path = row['old_path_file']
        new_path = row['new_path_file']
        # pd.notna treats both None and NaN as missing.
        has_old = pd.notna(old_path)
        has_new = pd.notna(new_path)

        # Language tag from the file extension: 'c' and 'h' count as C,
        # anything else as C++. Falls back to the old path when there is no new one.
        known_path = new_path if has_new else (old_path if has_old else '')
        extension = known_path.split('.')[-1]
        langs.append('c' if extension in ['c', 'h'] else 'cpp')

        if has_old:
            source_seq += 'mmm ' + ' '.join(word_tokenizer(old_path)) + ' <nl> '
        # The new-path header is guarded by the new path itself, so files with
        # no old path (e.g. newly added files) still get their 'ppp' line.
        if has_new:
            source_seq += 'ppp ' + ' '.join(word_tokenizer(new_path)) + ' <nl> '

        # The last row's label is kept (presumably every row of a commit carries
        # the same label - confirm). split/join collapses whitespace and newlines
        # so the message stays on one output line.
        target_seq = ' '.join(row['label'].split())

    # Collect every non-empty diff line of the commit first, then join once, so
    # the ' <nl> ' separator removed by split() is restored between all chunks
    # instead of adjacent chunks being glued together.
    diff_lines = list()
    for diff in diffler[index].split(' <nl> '):
        for line in diff.splitlines():
            line = re.sub('@@.+?@@', '', line)        # drop hunk headers
            line = re.sub(r'\s+', ' ', line).strip()  # collapse whitespace
            if not line:
                continue
            diff_lines.append(' '.join(word_tokenizer(line)))
    source_seq += ' <nl> '.join(diff_lines)

    source_seqs.append(source_seq)
    target_seqs.append(target_seq)
    lang_seqs.append(' '.join(langs))

write_string_to_file('CMG-data/cmg.test.diff', '\n'.join(source_seqs))
write_string_to_file('CMG-data/cmg.test.msg', '\n'.join(target_seqs))
write_string_to_file('CMG-data/cmg.test.lang', '\n'.join(lang_seqs))

100%|██████████| 6406/6406 [00:12<00:00, 527.52it/s]


In [11]:
# Convert the cmg valid data into RACE data.
# No file arguments are passed here; presumably preprocess.py defaults to the
# cmg.valid.* files - confirm against the script.
!python data_processing/preprocess.py

build: 100%|████████████████████████████████| 2575/2575 [00:27<00:00, 93.43it/s]
2575


In [12]:
# Convert the cmg train data into RACE data.
!python data_processing/preprocess.py --diff_filename CMG-data/cmg.train.diff --msg_filename CMG-data/cmg.train.msg --lang_filename CMG-data/cmg.train.lang

build: 100%|██████████████████████████████| 23172/23172 [04:36<00:00, 83.74it/s]
23172


In [13]:
# Convert the cmg test data into RACE data.
!python data_processing/preprocess.py --diff_filename CMG-data/cmg.test.diff --msg_filename CMG-data/cmg.test.msg --lang_filename CMG-data/cmg.test.lang

build: 100%|████████████████████████████████| 6406/6406 [01:10<00:00, 90.74it/s]
6406


In [22]:
# Copy the processed test/train/valid jsonl files into the cpp
# contextual_medits dataset directory (presumably where run.sh reads its
# input - confirm).
!cp data_processing/saved_process/cmg.test.jsonl dataset/cpp/contextual_medits/test.jsonl
!cp data_processing/saved_process/cmg.train.jsonl dataset/cpp/contextual_medits/train.jsonl
!cp data_processing/saved_process/cmg.valid.jsonl dataset/cpp/contextual_medits/valid.jsonl

In [24]:
# Launch run.sh with the 'cpp' argument.
# NOTE(review): the recorded output ends with ^C, i.e. this run was interrupted.
!bash run.sh cpp

saved_model/codet5/cpp/
^C


In [25]:
# Score a prediction file against the gold test messages with evalNMTS/eval.py.
# NOTE(review): the prediction file is named nngen.cmg.test.msg, which suggests
# NNGen output rather than output of the run.sh model - confirm.
!python evalNMTS/eval.py --prd_dir evalNMTS/data/nngen.cmg.test.msg --gold_dir evalNMTS/data/cmg.test.msg 

predict lines:  6406
refs lines:  6406
EM = 8.57
precs = 25.392
recall = 25.431
Meteor: 10.698227282299198
ROUGE-L: 14.822569048428564
Total: 6406
Total: 6406
  bleu-4 = 16.618 
  bleu-normal = 16.618 
Bleu-B-Norm:  16.617827909354563
