# Prepare Environment

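Check whether the notebook is running on Google Colab and, if so, perform the additional setup:
- Clone the project repository and copy its `utils` package next to the notebook
- Install `fairseq` and `editdistance`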

In [4]:
from IPython.core import getipython

is_colab = 'google.colab' in str(getipython.get_ipython())

if is_colab:
    !git clone https://github.com/vquanghuy/breakdown-bifi
    !cp -r breakdown-bifi/utils .
    !pip install fairseq editdistance
else:
  print("Notebook is not on Colab. Fairseq installation not attempted.")


Notebook is not on Colab. Fairseq installation not attempted.


In [58]:
%load_ext autoreload

# Check PyTorch version
import torch
print('Torch', torch.__version__)

import fairseq
print('fairseq', fairseq.__version__)

import io
import json, os, re
import os
import shlex
import shutil
import subprocess
import sys
import token
from collections import defaultdict, OrderedDict, Counter
from copy import deepcopy
from pathlib import Path

import editdistance
import numpy as np

sys.path.insert(0, 'utils')  # Replace with the actual path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Torch 2.2.2
fairseq 0.12.2


# Data Preparation

In [59]:
# Root folder of the dataset: the mounted Drive folder on Colab, a local `data` folder otherwise.
if is_colab:
    DATA_DIR = 'drive/MyDrive/Dataset/bifi-dataset'
else:
    DATA_DIR = 'data'

# Mirror the choice into the process environment.
os.environ["DATA_DIR"] = DATA_DIR

# Supporting functions

In [74]:
%autoreload 2

from tqdm import tqdm
from copy import deepcopy
from collections import defaultdict, OrderedDict
import hashlib

from utils.code_error_checker import check_paren_error, check_ast_error
from utils.code_utils import preprocess_unk, code_toks_to_code_string, get_diff_metric, tokenize_python_code
from utils.fairseq_utils import parse_fairseq_preds, fairseq_preprocess, fairseq_generate, fairseq_train

## eval_fixer

In [5]:
def eval_one_pred_obj(pred_obj):
    """Re-check every candidate fix stored in one prediction record.

    Reads the keys 'code_toks_raw', 'anonymize_dict', 'src', 'pred', 'progid'
    and 'orig_err_obj' from `pred_obj`.

    Returns a dict with:
        'progid', 'orig_err_obj', 'anonymize_dict': copied from `pred_obj`;
        'src':  {'tok_format', 'string_format'} for the source program;
        'pred': one {'tok_format', 'string_format', 'err_obj', 'diff_metric'}
                entry per candidate, in the original candidate order.

    Note: when `pred_obj['anonymize_dict']` is a dict it is updated in place
    (its '<unk>', '<STRING>' and '<COMMENT>' entries are overwritten), and that
    same object is what ends up under the returned 'anonymize_dict' key.
    """
    # Deal with UNK
    _, unk_map = preprocess_unk(pred_obj['code_toks_raw'])
    mapping = pred_obj['anonymize_dict']
    if mapping is None:
        mapping = {}
    mapping.update({'<unk>': unk_map, '<STRING>': [], '<COMMENT>': []})

    # `src_toks` is tok_format, i.e. ' '.join(code_toks); `src_string` is string_format.
    src_toks = pred_obj['src']
    src_string = code_toks_to_code_string(src_toks, mapping)

    evaluated = {
        'progid': pred_obj['progid'],
        'orig_err_obj': pred_obj['orig_err_obj'],
        'anonymize_dict': pred_obj['anonymize_dict'],
        'src': {'tok_format': src_toks, 'string_format': src_string},
        'pred': [],
    }

    original_error = evaluated['orig_err_obj']
    for candidate in pred_obj['pred']:
        # `candidate` is tok_format; rebuild its string_format first.
        candidate_string = code_toks_to_code_string(candidate, mapping)
        if original_error['msg'] == 'unbalanced (){}[]':
            # The parenthesis check works on the token list.
            verdict = check_paren_error(candidate.split())
        else:
            verdict = check_ast_error(candidate_string)
        evaluated['pred'].append({
            'tok_format': candidate,
            'string_format': candidate_string,
            'err_obj': verdict,
            'diff_metric': get_diff_metric(src_toks, candidate),
        })
    return evaluated

def eval_one_split(pred_dir_prefix, split, pred_fname, n_workers=80):
    """Evaluate the fixer predictions of one split and save the result as JSON.

    Args:
        pred_dir_prefix: Path prefix; `split` is appended to it to form the
            directory that contains the prediction file.
        split: Split identifier. Also selects the id file
            `{DATA_DIR}/orig_bad_code/orig.{split}.id`.
        pred_fname: File name of the fairseq prediction file in that directory.
        n_workers: Unused. Kept for backward compatibility; evaluation runs
            sequentially (the multiprocessing pool is disabled).

    Side effects:
        Writes `<prediction stem>.evaluated.json` next to the prediction file.
        It contains a list of
            {'progid': , 'orig_err_obj': , 'anonymize_dict': ,
             'src':  {'tok_format': , 'string_format': },
             'pred': [{'tok_format': , 'string_format': ,
                       'err_obj': , 'diff_metric': }, ...]}

    Raises:
        AssertionError: if the number of parsed predictions differs from the
            number of program ids of the split.
    """
    pred_path = Path(f'{pred_dir_prefix}{split}') / pred_fname
    preds = parse_fairseq_preds(str(pred_path))

    # Load progids (context managers so the handles are closed deterministically).
    data_dir = DATA_DIR
    with open(f'{data_dir}/orig_bad_code/orig.{split}.id') as f_ids:
        progids = [line.strip() for line in f_ids]
    assert len(preds) == len(progids), \
        f'{len(preds)} predictions but {len(progids)} program ids for split {split}'

    # Load original err_obj
    with open(f'{data_dir}/orig_bad_code/orig.bad.json') as f_bads:
        bads = json.load(f_bads)

    for pred, progid in zip(preds, progids):
        bad = bads[progid]
        pred['progid'] = progid
        pred['orig_err_obj'] = bad['err_obj']
        code_toks_raw = bad['code_toks_joined'].split()
        anonymize_dict = bad['anonymize_dict']
        if 'window_span' in bad:
            # Only the tokens inside the window are kept, and the stored
            # anonymize_dict is not used for windowed programs.
            ws = bad['window_span']
            code_toks_raw = code_toks_raw[ws[0]:ws[1]]
            anonymize_dict = None
        pred['code_toks_raw'] = code_toks_raw
        pred['anonymize_dict'] = anonymize_dict

    print ('len(preds)', len(preds))
    # Iterating the list directly lets tqdm know the total and draw a real progress bar.
    res = [eval_one_pred_obj(pred) for pred in tqdm(preds)]

    with open(f'{pred_path.parent}/{pred_path.stem}.evaluated.json', 'w') as f:
        json.dump(res, f, indent=2)

def get_test_result(pred_dir_prefix, pred_fname, test_splits=(3, 4)):
    """Print top-1 repair accuracy on the held-out splits and save it to stats.json.

    A prediction counts as a success when its `err_obj` equals 0 and its
    `diff_metric` satisfies 0 < diff_metric <= 4. Accuracy only counts the
    first-ranked prediction (index 0) of each program.

    Args:
        pred_dir_prefix: Path prefix; each split number is appended to it to
            locate the directory holding `<prediction stem>.evaluated.json`.
        pred_fname: File name of the fairseq prediction file.
        test_splits: Integer split ids that form the held-out test set.

    Side effects:
        Prints the overall and per-error-type accuracy and writes them to
        `stats.json` in the parent directory of `pred_dir_prefix`.

    NOTE: error types without a single successful prediction are not listed
    in the per-type report (only groups present in the success table are).
    """
    test_splits = tuple(test_splits)

    def collate_eval():
        """Collect success names and denominators, overall and per error type."""
        success = []
        denom = 0
        success_by_group = defaultdict(list)
        denom_by_group = defaultdict(int)
        for split in test_splits: #heldout test set
            print ('split', split)
            pred_path = Path(f'{pred_dir_prefix}{split}') / pred_fname
            pred_eval_path = f'{pred_path.parent}/{pred_path.stem}.evaluated.json'
            with open(pred_eval_path) as f:
                eval_objs = json.load(f)
            for eval_obj in eval_objs:
                progid = eval_obj['progid']
                orig_err_type = eval_obj['orig_err_obj']['msg']
                if 'indent' in orig_err_type:
                    # Fold every indentation-related message into one group.
                    orig_err_type = 'indentation error'
                denom += 1
                denom_by_group[orig_err_type] += 1
                for k, pred_obj in enumerate(eval_obj['pred']):
                    pred_err_obj = pred_obj['err_obj']
                    diff_metric = pred_obj['diff_metric']
                    if (pred_err_obj == 0) and (0 < diff_metric <= 4):
                        name = '{:02d}-{}-{:03d}'.format(split, progid, k)
                        success.append(name)
                        success_by_group[orig_err_type].append(name)
        return success, denom, success_by_group, denom_by_group

    def print_stats(name_list, _denom):
        """Print and return the top-1 accuracy (in percent) for `name_list`."""
        top1 = set()
        for name in name_list:
            # Names are '<split>-<progid>-<rank>'. Peel the split off the left
            # and the rank off the right so a progid containing '-' still parses.
            split, rest = name.split('-', 1)
            progid, k = rest.rsplit('-', 1)
            if int(split) in test_splits and int(k) == 0: #test set, top-1 only
                top1.add(f'{split}-{progid}')
        # Guard against an empty evaluation set instead of dividing by zero.
        acc = len(top1)/float(_denom)*100 if _denom else 0.0
        print ('   acc: {} ({:.1f}%) | denom {}'.format(len(top1), acc, _denom))
        return acc

    success, denom, success_by_group, denom_by_group = collate_eval()
    acc_dict = {}
    print ('Total')
    acc_dict['total'] = print_stats(success, denom)
    print ('-'*50)
    for err_type in success_by_group:
        print (f'{err_type.capitalize()}')
        acc_dict[err_type] = print_stats(success_by_group[err_type], denom_by_group[err_type])
    # Close the handle explicitly so the stats are flushed to disk.
    with open(Path(pred_dir_prefix).parent/'stats.json', 'w') as f:
        json.dump(acc_dict, f, indent=2)

## generate_paired_data_from_fixer
Using *critic* to verify

In [None]:
#BIFI version - uses critic to verify
def _read_stripped_lines(path):
    """Return every line of `path` with surrounding whitespace stripped."""
    with open(path) as f:
        return [line.strip() for line in tqdm(f)]


def generate_paired_data_from_fixer_preds_for_BIFI(pred_dir_prefix, pred_fname, out_dir,
                                                   round0_dir='data/round0/data_paired',
                                                   total_size=30_000_000,
                                                   train_splits=(0, 1, 2)):
    """Build (bad, good) training pairs from verified fixer predictions.

    A prediction is kept when its `err_obj` equals 0 and 0 < diff_metric <= 4.
    The source program becomes the 'bad' side and the prediction the 'good' side.

    Args:
        pred_dir_prefix: Path prefix; each split number is appended to it to
            locate the directory holding `<prediction stem>.evaluated.json`.
        pred_fname: File name of the fairseq prediction file.
        out_dir: Output directory for the merged data. The new pairs alone are
            also written to `{out_dir}_pure`.
        round0_dir: Directory containing the round-0 `train.{bad,good,id}` and
            `dev.{bad,good,id}` files.
        total_size: Target size used to compute how often each part is repeated.
            One third is filled with round-0 pairs; the new pairs are repeated
            enough times to cover at least two thirds.
        train_splits: Integer split ids whose predictions may be used for training.

    Side effects:
        Seeds numpy's global RNG (seed derived from `out_dir`), writes
        `train.{id,good,bad}` to both output directories, and copies the
        round-0 dev files into `out_dir`.

    Raises:
        FileNotFoundError: if an evaluated prediction file or a round-0 file
            is missing (the previous shell `cp` failed silently instead).
    """
    round0_dir = Path(round0_dir)

    #Get new paired data
    train_data = {'good': [], 'bad': [], 'id': []}
    for split in train_splits: #available for training
        print ('split', split)
        pred_path = Path(f'{pred_dir_prefix}{split}') / pred_fname
        pred_eval_path = f'{pred_path.parent}/{pred_path.stem}.evaluated.json'
        with open(pred_eval_path) as f:
            eval_objs = json.load(f)
        for eval_obj in eval_objs:
            progid = eval_obj['progid']
            for k, pred_obj in enumerate(eval_obj['pred']):
                pred_err_obj = pred_obj['err_obj']
                diff_metric  = pred_obj['diff_metric']
                if (pred_err_obj == 0) and (0 < diff_metric <= 4):
                    name = '{:02d}-{}-{:03d}'.format(split, progid, k)
                    src  = eval_obj['src']['tok_format'].strip()
                    pred = pred_obj['tok_format'].strip()
                    train_data['id'  ].append(name)
                    train_data['good'].append(pred)
                    train_data['bad' ].append(src)
    assert len(train_data['good']) == len(train_data['bad']) == len(train_data['id'])
    new_data_size = len(train_data['id'])
    print ('#new_data', new_data_size)

    # os.makedirs instead of `mkdir -p`: portable, and safe for paths with spaces.
    pure_dir = f'{out_dir}_pure'
    os.makedirs(pure_dir, exist_ok=True)
    with open(f'{pure_dir}/train.id', 'w') as fid, \
            open(f'{pure_dir}/train.good', 'w') as fgood, \
            open(f'{pure_dir}/train.bad', 'w') as fbad:
        for _idx in tqdm(range(new_data_size)):
            fid.write(train_data['id'][_idx] +'\n')
            fgood.write(train_data['good'][_idx] +'\n')
            fbad.write(train_data['bad'][_idx] +'\n')
    idxs_newdata = list(range(new_data_size))

    #Merge with round0 paired data
    print ('loading round0 data')
    train_data_0 = {key: _read_stripped_lines(round0_dir / f'train.{key}')
                    for key in ('bad', 'good', 'id')}
    idxs_0 = list(range(len(train_data_0['id'])))

    # Seed derived from out_dir, so the same output directory gives the same shuffle.
    seed = (111 + int(hashlib.md5(str(out_dir).encode()).hexdigest(), 16)) % (2**31)
    print ('seed', seed)
    np.random.seed(seed)
    np.random.shuffle(idxs_0); np.random.shuffle(idxs_0)

    # An empty part contributes nothing instead of raising ZeroDivisionError.
    if not idxs_0:
        print ('WARNING: no round0 pairs found')
    if not new_data_size:
        print ('WARNING: no new pairs passed verification')
    _0_data_repeats  = (total_size//3)//len(idxs_0) +1 if idxs_0 else 0
    new_data_repeats = (total_size*2//3)//new_data_size +1 if new_data_size else 0
    idxs_0 = (idxs_0 * _0_data_repeats)[:total_size//3]
    idxs_newdata = idxs_newdata * new_data_repeats
    print ('combining all data')
    # Tag each index with its origin so one shuffled list can address both tables.
    idxs = [f'0_{i}' for i in idxs_0] + [f'new_{i}' for i in idxs_newdata]
    np.random.shuffle(idxs); np.random.shuffle(idxs)

    #Write out data
    os.makedirs(out_dir, exist_ok=True)
    print ('writing out data')
    with open(f'{out_dir}/train.id', 'w') as fid, \
            open(f'{out_dir}/train.good', 'w') as fgood, \
            open(f'{out_dir}/train.bad', 'w') as fbad:
        for idx in tqdm(idxs):
            _prefix, _idx = idx.split('_')
            _idx = int(_idx)
            table = train_data_0 if _prefix == '0' else train_data
            fid.write(table['id'][_idx] +'\n')
            fgood.write(table['good'][_idx] +'\n')
            fbad.write(table['bad'][_idx] +'\n')

    # The dev set is reused unchanged from round 0.
    for ext in ('bad', 'good', 'id'):
        shutil.copy(round0_dir / f'dev.{ext}', out_dir)
    print ('done')

# Round 0

This round mainly focuses on training and evaluating the `fixer`.

## Round variables

In [61]:
from pathlib import Path

# Every path of this round lives under <DATA_DIR>/round_0.
data_dir = Path(DATA_DIR)
round_dir = data_dir / 'round_0'

# --- Preprocess and train ---
# Force the smaller dataset; the full-size one is `data_paired`.
data_paired_dir = round_dir / 'small_data_paired'
fairseq_dir = data_paired_dir / 'fairseq_preprocess'

# --- Run fixer ---
model_dir = round_dir / 'model-fixer'
model_path = model_dir / 'checkpoint_best.pt'
destdir_root = round_dir / 'orig_bad'

# Number of chunks the original bad code is processed in (orig.0 ... orig.<n_splits - 1>).
n_splits = 5

# --- Evaluate ---
pred_dir_root = round_dir / 'orig_bad'
# The split number is appended to this prefix to get the per-split directory.
pred_dir_prefix = str(pred_dir_root / 'fairseq_preprocess__orig_bad.')
pred_fname = 'model-fixer.pred.txt'

## Cleanup data

Use with caution

### Remove train's preprocessed data

In [62]:
# Remove the binarised training data. Guarded so that re-running the cell
# (or running it before any preprocessing happened) does not raise.
if fairseq_dir.exists():
    shutil.rmtree(str(fairseq_dir))

### Remove fixer preprocessed data

This data is produced by preprocessing the original bad code and is required when running the fixer. Removing it forces that preprocessing step to run again.

In [9]:
# Remove the preprocessed bad-code splits. NOTE: the fixer predictions are
# written into these same directories, so they are deleted as well.
# Guarded so that re-running the cell does not raise.
if destdir_root.exists():
    shutil.rmtree(str(destdir_root))

## Reduce data

Since the input data is huge, this step reduces its size by keeping only the first **1 million records for training**.

In [None]:
# Take 1m lines as sample
from itertools import islice

original_data_paired_dir = round_dir/'data_paired'
train_sliced_lines = 1000000
# Integer division: islice() only accepts an integer (or None) as its stop value.
dev_sliced_lines = train_sliced_lines // 100


def _copy_head(src_path, dst_path, n_lines):
    """Copy the first `n_lines` lines of `src_path` to `dst_path` (both UTF-8)."""
    with open(str(src_path), 'r', encoding='utf-8') as infile, \
            open(str(dst_path), 'w', encoding='utf-8') as outfile:
        outfile.writelines(islice(infile, n_lines))


data_paired_dir.mkdir(exist_ok=True)

# The good and bad file of a set are cut at the same line count. The dev set
# now uses `dev_sliced_lines`; it was previously cut at the train limit, which
# left `dev_sliced_lines` unused.
for prefix, n_lines in (('train', train_sliced_lines), ('dev', dev_sliced_lines)):
    for suffix in ('good', 'bad'):
        fname = f'{prefix}.{suffix}'
        _copy_head(original_data_paired_dir/fname, data_paired_dir/fname, n_lines)


## Preprocess data for training

In [63]:
# Preprocess the paired train/dev files (source: 'bad', target: 'good')
# using the shared token vocabulary.
fairseq_preprocess(
    src='bad',
    tgt='good',
    workers=20,
    destdir=str(data_paired_dir / 'fairseq_preprocess'),
    trainpref=str(data_paired_dir / 'train'),
    validpref=str(data_paired_dir / 'dev'),
    srcdict=str(data_dir / 'token_vocab.txt'),
)

FileNotFoundError: [Errno 2] No such file or directory: 'fairseq-preprocess --source-lang bad --destdir data/round_0/small_data_paired/fairseq_preprocess             --joined-dictionary --workers 50 --target-lang good --trainpref data/round_0/small_data_paired/train --validpref data/round_0/small_data_paired/dev --srcdict data/token_vocab.txt --workers 20 '

## Train fixer

In [None]:
# Train
gpu_id = 0
max_epoch = 2

# Checkpoints and the training log are written here.
save_dir = round_dir / 'model-fixer'
save_dir.mkdir(exist_ok=True)

fairseq_train(
    gpu_id,
    str(fairseq_dir),
    str(save_dir),
    str(save_dir / 'train.log.txt'),
    # translation direction
    src='bad',
    tgt='good',
    # loss
    criterion='label_smoothed_cross_entropy',
    label_smoothing=0.1,
    # optimisation
    lr=1e-3,
    warmup_init_lr=1e-4,
    memory_efficient_fp16=True,
    # model size
    encoder_layers=4,
    decoder_layers=4,
    encoder_embed_dim=256,
    decoder_embed_dim=256,
    encoder_ffn_embed_dim=1024,
    decoder_ffn_embed_dim=1024,
    # batching
    max_tokens=13500,
    update_freq=2,
    # schedule / checkpointing
    max_epoch=max_epoch,
    save_interval_updates=10000,
    num_workers=4,
)

CUDA_VISIBLE_DEVICES=0  fairseq-train                 data\round_0\small_data_paired\fairseq_preprocess                --source-lang bad --target-lang good                --arch transformer --share-all-embeddings                --encoder-layers 4 --decoder-layers 4                --encoder-embed-dim 256 --decoder-embed-dim 256                --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024                --encoder-attention-heads 8 --decoder-attention-heads 8                --encoder-normalize-before --decoder-normalize-before                --dropout 0.4 --attention-dropout 0.2 --relu-dropout 0.2                --weight-decay 0.0001                --criterion label_smoothed_cross_entropy                --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1                --lr-scheduler inverse_sqrt --warmup-updates 400 --warmup-init-lr 0.0001                --lr 0.001                --max-tokens 13500                --update-freq 2                --max-epoch 2 --save-inter

## Preprocess data for fixer

In [16]:
# Preprocess input
vocab_path = str(data_dir / 'token_vocab.txt')
for split in range(n_splits):
    destdir = destdir_root / f'fairseq_preprocess__orig_bad.{split}'
    # Splits that already have an output directory are left untouched.
    if not os.path.exists(str(destdir)):
        fairseq_preprocess(
            src='bad',
            tgt='good',
            workers=10,
            destdir=str(destdir),
            testpref=str(data_dir / f'orig_bad_code/orig.{split}'),
            srcdict=vocab_path,
            only_source=True,
        )
        # Only the source side was preprocessed, so the target dictionary
        # is supplied by copying the shared vocabulary.
        shutil.copy(vocab_path, str(destdir / 'dict.good.txt'))

## Run fixer

In [8]:
# Generate candidate fixes for every split with the trained fixer checkpoint.
for split in range(n_splits):
    destdir = destdir_root / f'fairseq_preprocess__orig_bad.{split}'
    pred_path = destdir / 'model-fixer.pred.txt'
    fairseq_generate(
        str(destdir),
        str(model_path),
        str(pred_path),
        src='bad',
        tgt='good',
        gen_subset='test',
        beam=10,
        nbest=10,
        max_len_a=1,
        max_len_b=50,
        max_tokens=7000,
    )

2024-07-18 15:37:56 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2024-07-18 15:37:57 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': 

## Evaluate fixer

In [20]:
# all the original bad code is split into 5 chunks for faster processing
# Each call writes `<prediction stem>.evaluated.json` next to that split's prediction file.
# NOTE: `n_workers` is accepted by eval_one_split but not used by it (evaluation runs sequentially).
for split in range(n_splits):
    eval_one_split(pred_dir_prefix, split, pred_fname, n_workers=10)

# Aggregate the evaluated files of the held-out splits (3 and 4), print the
# accuracy and write stats.json.
get_test_result(pred_dir_prefix, pred_fname)

len(preds) 7528


7528it [00:16, 458.83it/s]


len(preds) 7528


7528it [00:15, 470.70it/s]


len(preds) 7528


7528it [00:15, 472.78it/s]


len(preds) 7528


7528it [00:15, 477.81it/s]


len(preds) 7527


7527it [00:16, 461.42it/s]


split 3
split 4
Total
   acc: 8031 (53.3%) | denom 15055
--------------------------------------------------
Unbalanced (){}[]
   acc: 2871 (71.8%) | denom 3999
Invalid syntax
   acc: 2316 (48.8%) | denom 4749
Indentation error
   acc: 2844 (45.1%) | denom 6307


# Round 1

To Be Updated

# Applying the Model in Practice

In this section, we'll demonstrate how to apply the trained model to real-world code by following these steps:

- Read the code and tokenize it into appropriate units.
- Preprocess the tokenized code using the `fairseq-preprocess` command to prepare it for the model.
- Generate predictions using the `fairseq-generate` command, leveraging the preprocessed code as input.
- Join the generated code tokens back into a complete code string using the `code_toks_to_code_string` function.

In [40]:
# Working directory
working_dir = Path('data-apply')

# Input source file, its tokenised form, and a vocabulary path (all as plain strings).
code_input, token_input, token_vocab = (
    str(working_dir / fname)
    for fname in ('code-input.txt', 'token-input.bad', 'token-vocab.txt')
)

## Read and tokenize

In [47]:
# Read the source as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open(code_input, 'r', encoding='utf-8') as file:
    code_content = file.read()

# `tokens` is the token list; `anonymize_dict` is the second value returned by the tokenizer.
tokens, anonymize_dict = tokenize_python_code(code_content)
print(tokens)

['def', 'read_file_to_string', '(', 'file_path', ',', 'chunk_size', '=', '1024', ')', ':', '<NEWLINE>', '<INDENT>', 'file_contents', '=', '<STRING>', '<NEWLINE>', 'with', 'open', '(', 'file_path', ',', '<STRING>', ',', 'encoding', '=', '<STRING>', ')', 'as', 'file', ':', '<NEWLINE>', '<INDENT>', 'while', 'True', ':', '<NEWLINE>', '<INDENT>', 'chunk', '=', 'file', '.', 'read', '(', 'chunk_size', ')', '<NEWLINE>', 'if', 'not', 'chunk', ':', '<NEWLINE>', '<INDENT>', 'break', '<NEWLINE>', '<DEDENT>', 'file_contents', '+=', 'chunk', '<NEWLINE>', '<DEDENT>', '<DEDENT>', 'return', 'file_contents', '<NEWLINE>', '<NL>', '<NL>', '<COMMENT>', '<NL>', '<DEDENT>', 'file_path', '=', '<STRING>', '<COMMENT>', '<NEWLINE>', 'text', '=', 'read_file_to_string', '(', 'file_path', ')', '<NEWLINE>', 'print', '(', 'text', ')', '<NEWLINE>', '<NL>', '<NL>', '<COMMENT>', '<NL>']


## Save tokenized

In [49]:
# One program per line, tokens separated by single spaces. Written as UTF-8
# explicitly so non-ASCII tokens do not depend on the platform default.
with open(token_input, 'w', encoding='utf-8') as file:
    file.write(' '.join(tokens))

## Preprocess tokens

In [82]:
# Output directory of the preprocessing step; the copied target dictionary and
# the prediction file are placed here too.
# NOTE: rebinds `destdir`, which the round-0 loops also assign.
destdir = working_dir/'preprocess'

In [83]:
# Preprocess the single tokenised input (source side only) with the shared vocabulary.
vocab_file = str(data_dir / 'token_vocab.txt')
fairseq_preprocess(
    src='bad',
    tgt='good',
    workers=10,
    destdir=str(working_dir / 'preprocess'),
    testpref=str(working_dir / 'token-input'),
    srcdict=vocab_file,
    only_source=True,
)
# Provide the target-side dictionary by copying the same vocabulary.
shutil.copy(vocab_file, str(destdir / 'dict.good.txt'))

'data-apply/preprocess/dict.good.txt'

## Predict

In [84]:
# NOTE: rebinds `model_dir`, which the round-0 configuration also defines.
model_dir = Path('models')

predict_model = model_dir / 'round2-BIFI-part2-checkpoint.pt'
predict_path = destdir / 'bifi-model.pred.txt'

# Generate candidate fixes for the preprocessed input and store them in `predict_path`.
fairseq_generate(
    str(destdir),
    str(predict_model),
    str(predict_path),
    src='bad',
    tgt='good',
    gen_subset='test',
    beam=10,
    nbest=10,
    max_len_a=1,
    max_len_b=50,
    max_tokens=7000,
)

2024-07-21 09:56:40 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2024-07-21 09:56:41 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': 

## Parse prediction

In [88]:
# Parse the generation output. Each record holds the tokenised source under
# 'src' and the list of candidate fixes under 'pred'.
preds = parse_fairseq_preds(str(predict_path))
# Display the record of the first (here: only) input program.
preds[0]

{'src': 'def <unk> ( file_path , chunk_size = 1024 ) : <NEWLINE> <INDENT> file_contents = <STRING> <NEWLINE> with open ( file_path , <STRING> , encoding = <STRING> ) as file : <NEWLINE> <INDENT> while True : <NEWLINE> <INDENT> chunk = file . read ( chunk_size ) <NEWLINE> if not chunk : <NEWLINE> <INDENT> break <NEWLINE> <DEDENT> file_contents += chunk <NEWLINE> <DEDENT> <DEDENT> return file_contents <NEWLINE> <unk> <unk> <unk> <unk> <DEDENT> file_path = <STRING> <unk> <NEWLINE> text = <unk> ( file_path ) <NEWLINE> print ( text ) <NEWLINE> <unk> <unk> <unk> <unk>',
 'pred': ['def <unk> ( file_path , chunk_size = 1024 ) : <NEWLINE> <INDENT> file_contents = <STRING> <NEWLINE> with open ( file_path , <STRING> , encoding = <STRING> ) as file : <NEWLINE> <INDENT> while True : <NEWLINE> <INDENT> chunk = file . read ( chunk_size ) <NEWLINE> if not chunk : <NEWLINE> <INDENT> break <NEWLINE> <DEDENT> file_contents += chunk <NEWLINE> <DEDENT> <DEDENT> return file_contents <NEWLINE> <unk> , <unk> 

# Draft

fairseq-preprocess --source-lang bad --destdir data-apply/preprocess             --joined-dictionary --workers 50 --target-lang good --testpref data-apply/token-input --srcdict data/token_vocab.txt --workers 10 --only-source

In [24]:
# Draft: read the sample source file and tokenise it.
with open(os.path.join('data-apply', 'code-input.txt'), 'r') as file:
    code_content = file.read()

# NOTE: rebinds `code_input` (a path string in the section above) to the pair
# returned by the tokenizer; index 0 is the token list.
code_input = tokenize_python_code(code_content)
print(code_input[0])

['def', 'read_file_to_string', '(', 'file_path', ',', 'chunk_size', '=', '1024', ')', ':', '<NEWLINE>', '<INDENT>', 'file_contents', '=', '<STRING>', '<NEWLINE>', 'with', 'open', '(', 'file_path', ',', '<STRING>', ',', 'encoding', '=', '<STRING>', ')', 'as', 'file', ':', '<NEWLINE>', '<INDENT>', 'while', 'True', ':', '<NEWLINE>', '<INDENT>', 'chunk', '=', 'file', '.', 'read', '(', 'chunk_size', ')', '<NEWLINE>', 'if', 'not', 'chunk', ':', '<NEWLINE>', '<INDENT>', 'break', '<NEWLINE>', '<DEDENT>', 'file_contents', '+=', 'chunk', '<NEWLINE>', '<DEDENT>', '<DEDENT>', 'return', 'file_contents', '<NEWLINE>', '<NL>', '<NL>', '<COMMENT>', '<NL>', '<DEDENT>', 'file_path', '=', '<STRING>', '<COMMENT>', '<NEWLINE>', 'text', '=', 'read_file_to_string', '(', 'file_path', ')', '<NEWLINE>', 'print', '(', 'text', ')', '<NEWLINE>', '<NL>', '<NL>', '<COMMENT>', '<NL>']


In [25]:
# Round trip: join the tokens into tok_format and rebuild source text using the
# second element returned by the tokenizer.
code_output = code_toks_to_code_string(' '.join(code_input[0]), code_input[1])

In [27]:
# Save the reconstructed source text (an existing file is overwritten).
with open(os.path.join('data-apply', 'code-output.txt'), 'w') as file:  # Open in write mode ('w')
    file.write(code_output)
