# Prepare Environment

We should check if we're on the Colab and do additional setup
- Install `fairseq`, `tqdm`

In [9]:
from IPython.core import getipython

is_colab = 'google.colab' in str(getipython.get_ipython())

if is_colab:
  !pip install fairseq tqdm
  print("Fairseq installation successful (if no errors occurred)")
else:
  print("Notebook is not on Colab. Fairseq installation not attempted.")


Notebook is not on Colab. Fairseq installation not attempted.


In [3]:
# Check PyTorch version
import torch
print('Torch', torch.__version__)

import fairseq
print('fairseq', fairseq.__version__)

Torch 2.3.1
fairseq 0.12.2


# Data Preparation

In [10]:
DATA_DIR = 'drive/MyDrive/Dataset/bifi-dataset' if is_colab else 'data'

## Data reduce

Since the original dataset is huge, training faced lots of difficulty, so we'll reduce the size, just take a small subset for training / validation

# Supported functions

In [None]:
import shlex
import subprocess
import sys
import shutil

def run_and_stream(cmd):
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    for line in iter(proc.stdout.readline, ''):
        print(line, end='')  # Print without newline for streaming effect
        sys.stdout.flush()  # Ensure immediate display
    proc.stdout.close()

## fairseq_preprocess

In [56]:
def fairseq_preprocess(src, tgt, destdir, trainpref=None, validpref=None, testpref=None, srcdict=None, **kwargs):
    additional_cmds = ''.join([f"--{k.replace('_', '-')} {v} " for k, v in kwargs.items() if not isinstance(v, bool)])
    additional_cmds += ''.join([f"--{k.replace('_', '-')} " for k, v in kwargs.items() if isinstance(v, bool) and v])
    cmd = f'fairseq-preprocess --source-lang {src} --destdir {destdir} \
            --joined-dictionary --workers 50 --no-progress-bar --log-interval 20 '
    if tgt is not None:
        cmd += f'--target-lang {tgt} '
    if trainpref is not None:
        cmd += f'--trainpref {trainpref} '
    if validpref is not None:
        cmd += f'--validpref {validpref} '
    if testpref is not None:
        cmd += f'--testpref {testpref} '
    if srcdict is not None:
        cmd += f'--srcdict {srcdict} '
    cmd += additional_cmds

    # Execute command line command
    print(cmd)
    !{cmd}

## fairseq_train

In [58]:
def fairseq_train(GPUs, preprocess_dir, save_dir, logfile, src, tgt, model='transformer',
                  criterion='label_smoothed_cross_entropy',
                  encoder_layers=4, decoder_layers=4, encoder_embed_dim=256,
                  decoder_embed_dim=256, encoder_ffn_embed_dim=1024,
                  decoder_ffn_embed_dim=1024, encoder_attention_heads=8,
                  decoder_attention_heads=8, dropout=0.4,
                  attention_dropout=0.2, relu_dropout=0.2,
                  weight_decay=0.0001, warmup_updates=400, warmup_init_lr=1e-4,
                  lr=1e-3, max_tokens=1000, update_freq=4,
                  max_epoch=10, save_interval=1, log_interval=100, log_format='tqdm',
                  user_dir=None, reset=False, restore_file=None, **kwargs):
    if True:
        additional_cmds = ''.join(
            [f"--{k.replace('_', '-')} {v} " for k, v in kwargs.items() if not isinstance(v, bool)])
        additional_cmds += ''.join(
            [f"--{k.replace('_', '-')} " for k, v in kwargs.items() if isinstance(v, bool) and v])
        cmd = f"fairseq-train \
                {preprocess_dir} \
               --source-lang {src} --target-lang {tgt} \
               --arch {model} --share-all-embeddings \
               --encoder-layers {encoder_layers} --decoder-layers {decoder_layers} \
               --encoder-embed-dim {encoder_embed_dim} --decoder-embed-dim {decoder_embed_dim} \
               --encoder-ffn-embed-dim {encoder_ffn_embed_dim} --decoder-ffn-embed-dim {decoder_ffn_embed_dim} \
               --encoder-attention-heads {encoder_attention_heads} --decoder-attention-heads {decoder_attention_heads} \
               --encoder-normalize-before --decoder-normalize-before \
               --dropout {dropout} --attention-dropout {attention_dropout} --relu-dropout {relu_dropout} \
               --weight-decay {weight_decay} \
               --criterion {criterion} \
               --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1 \
               --lr-scheduler inverse_sqrt --warmup-updates {warmup_updates} --warmup-init-lr {warmup_init_lr} \
               --lr {lr} \
               --max-tokens {max_tokens} \
               --update-freq {update_freq} \
               --max-epoch {max_epoch} --save-interval {save_interval} --save-dir {save_dir} "
        if user_dir is not None:
            cmd += f'--user-dir {user_dir} '
        if restore_file is not None:
            cmd += f"--restore-file {restore_file} "
        if reset:
            cmd += "--reset-optimizer \
                   --reset-lr-scheduler \
                   --reset-dataloader \
                   --reset-meters "
        cmd += additional_cmds
        if logfile is not None:
            import socket, os
            with open(logfile, 'w') as outf:
                print(socket.gethostname(), file=outf)
                print("pid:", os.getpid(), file=outf)
                print("screen: %s" % subprocess.check_output('echo $STY', shell=True).decode('utf'), file=outf)
                outf.flush()
            cmd += f"  2>&1 | tee -a {logfile} "
        if GPUs is not None:
            cmd = 'CUDA_VISIBLE_DEVICES={}  {}'.format(GPUs, cmd)

        print(cmd)
        # !{cmd}

# Round 0

In [44]:
from pathlib import Path

data_dir = Path(DATA_DIR)
round_dir = data_dir/'round_0'
data_paired_dir = round_dir/'small_data_paired'
fairseq_dir = data_paired_dir/'fairseq_preprocess'

In [42]:
# Remove fairseq preprocessed dir
shutil.rmtree(str(fairseq_dir))

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data\\round_0\\small_data_paired\\fairseq_preprocess'

In [20]:
# Take 1m lines as sample
from itertools import islice

original_data_paired_dir = round_dir/'data_paired'
train_sliced_lines = 1000000
dev_sliced_lines = train_sliced_lines / 100

data_paired_dir.mkdir(exist_ok=True)

# Prepare train.good and train.bad
with open(str(original_data_paired_dir/'train.good'), 'r', encoding='utf-8') as infile, \
    open(str(data_paired_dir/'train.good'), 'w', encoding='utf-8') as outfile:
    for line in islice(infile, train_sliced_lines):
        outfile.write(line)

with open(str(original_data_paired_dir/'train.bad'), 'r', encoding="utf-8") as infile, \
    open(str(data_paired_dir/'train.bad'), 'w', encoding='utf-8') as outfile:
    for line in islice(infile, train_sliced_lines):
        outfile.write(line)

# Prepare dev.good and dev.bad
with open(str(original_data_paired_dir/'dev.good'), 'r', encoding="utf-8") as infile, \
    open(str(data_paired_dir/'dev.good'), 'w', encoding='utf-8') as outfile:
    for line in islice(infile, train_sliced_lines):
        outfile.write(line)

with open(str(original_data_paired_dir/'dev.bad'), 'r', encoding="utf-8") as infile, \
    open(str(data_paired_dir/'dev.bad'), 'w', encoding='utf-8') as outfile:
    for line in islice(infile, train_sliced_lines):
        outfile.write(line)


## Preprocess

In [57]:
fairseq_preprocess(src='bad', tgt='good', workers=20,
                      destdir  = str(data_paired_dir/'fairseq_preprocess'),
                      trainpref= str(data_paired_dir/'train'),
                      validpref= str(data_paired_dir/'dev'),
                      srcdict  = str(data_dir/'token_vocab.txt') )

fairseq-preprocess --source-lang bad --destdir data\round_0\small_data_paired\fairseq_preprocess             --joined-dictionary --workers 50 --no-progress-bar --log-interval 20 --target-lang good --trainpref data\round_0\small_data_paired\train --validpref data\round_0\small_data_paired\dev --srcdict data\token_vocab.txt --workers 20 


KeyboardInterrupt: 

## Train

In [59]:
# Train
# --gpu_id 0 --max_epoch 2
gpu_id = 0
max_epoch = 2

save_dir = round_dir/'model-fixer'
save_dir.mkdir(exist_ok=True)

fairseq_train(gpu_id, str(fairseq_dir), str(save_dir), str(save_dir/'train.log.txt'),
                    src='bad', tgt='good',
                    criterion='label_smoothed_cross_entropy', label_smoothing=0.1,
                    lr=1e-3, warmup_init_lr=1e-4, memory_efficient_fp16=True,
                    encoder_layers=4, decoder_layers=4, encoder_embed_dim=256, decoder_embed_dim=256,
                    encoder_ffn_embed_dim=1024, decoder_ffn_embed_dim=1024,
                    max_tokens=13500, update_freq=2,
                    max_epoch=max_epoch, save_interval_updates=10000, num_workers=4,
                )

CUDA_VISIBLE_DEVICES=0  fairseq-train                 data\round_0\small_data_paired\fairseq_preprocess                --source-lang bad --target-lang good                --arch transformer --share-all-embeddings                --encoder-layers 4 --decoder-layers 4                --encoder-embed-dim 256 --decoder-embed-dim 256                --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024                --encoder-attention-heads 8 --decoder-attention-heads 8                --encoder-normalize-before --decoder-normalize-before                --dropout 0.4 --attention-dropout 0.2 --relu-dropout 0.2                --weight-decay 0.0001                --criterion label_smoothed_cross_entropy                --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1                --lr-scheduler inverse_sqrt --warmup-updates 400 --warmup-init-lr 0.0001                --lr 0.001                --max-tokens 13500                --update-freq 2                --max-epoch 2 --save-inter

# Draft

In [26]:
import subprocess
import sys

process = subprocess.Popen(["fairseq-preprocess", "--help"], stdout=subprocess.PIPE, universal_newlines=True)
for line in process.stdout:
  print(line, end='')  # Print without newline to avoid extra line breaks
  sys.stdout.flush()  # Flush output buffer to display immediately

# Wait for the process to finish (optional)
process.wait()


usage: fairseq-preprocess [-h] [--no-progress-bar]
                          [--log-interval LOG_INTERVAL]
                          [--log-format {json,none,simple,tqdm}]
                          [--log-file LOG_FILE] [--aim-repo AIM_REPO]
                          [--aim-run-hash AIM_RUN_HASH]
                          [--tensorboard-logdir TENSORBOARD_LOGDIR]
                          [--wandb-project WANDB_PROJECT] [--azureml-logging]
                          [--seed SEED] [--cpu] [--tpu] [--bf16]
                          [--memory-efficient-bf16] [--fp16]
                          [--memory-efficient-fp16] [--fp16-no-flatten-grads]
                          [--fp16-init-scale FP16_INIT_SCALE]
                          [--fp16-scale-window FP16_SCALE_WINDOW]
                          [--fp16-scale-tolerance FP16_SCALE_TOLERANCE]
                          [--on-cpu-convert-precision]
                          [--min-loss-scale MIN_LOSS_SCALE]
                          [--threshol

0

In [28]:
def hello_exe():
    cmd = 'ping 8.8.8.8'
    !{cmd}

In [29]:
hello_exe()


Pinging 8.8.8.8 with 32 bytes of data:
Reply from 8.8.8.8: bytes=32 time=43ms TTL=114
Reply from 8.8.8.8: bytes=32 time=38ms TTL=114
Reply from 8.8.8.8: bytes=32 time=39ms TTL=114
Reply from 8.8.8.8: bytes=32 time=36ms TTL=114

Ping statistics for 8.8.8.8:
    Packets: Sent = 4, Received = 4, Lost = 0 (0% loss),
Approximate round trip times in milli-seconds:
    Minimum = 36ms, Maximum = 43ms, Average = 39ms


In [31]:
from wurlitzer import sys_pipes

with sys_pipes():
    !ping 8.8.8.8


ModuleNotFoundError: No module named 'fcntl'

In [33]:
%load_ext wurlitzer

!echo 'Hello'

ModuleNotFoundError: No module named 'fcntl'

In [55]:
import subprocess
import sys

def run_and_stream(cmd):
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    for line in iter(proc.stdout.readline, ''):
        print(line, end='')  # Print without newline for streaming effect
        sys.stdout.flush()  # Ensure immediate display
    proc.stdout.close()

run_and_stream("ping 8.8.8.8")


Pinging 8.8.8.8 with 32 bytes of data:
Reply from 8.8.8.8: bytes=32 time=37ms TTL=114
Reply from 8.8.8.8: bytes=32 time=38ms TTL=114
Reply from 8.8.8.8: bytes=32 time=38ms TTL=114
Reply from 8.8.8.8: bytes=32 time=38ms TTL=114

Ping statistics for 8.8.8.8:
    Packets: Sent = 4, Received = 4, Lost = 0 (0% loss),
Approximate round trip times in milli-seconds:
    Minimum = 37ms, Maximum = 38ms, Average = 37ms


In [53]:
run_and_stream("fairseq-train -h")

CMD ['fairseq-train', '-h']
usage: fairseq-train [-h] [--no-progress-bar] [--log-interval LOG_INTERVAL]
                     [--log-format {json,none,simple,tqdm}]
                     [--log-file LOG_FILE] [--aim-repo AIM_REPO]
                     [--aim-run-hash AIM_RUN_HASH]
                     [--tensorboard-logdir TENSORBOARD_LOGDIR]
                     [--wandb-project WANDB_PROJECT] [--azureml-logging]
                     [--seed SEED] [--cpu] [--tpu] [--bf16]
                     [--memory-efficient-bf16] [--fp16]
                     [--memory-efficient-fp16] [--fp16-no-flatten-grads]
                     [--fp16-init-scale FP16_INIT_SCALE]
                     [--fp16-scale-window FP16_SCALE_WINDOW]
                     [--fp16-scale-tolerance FP16_SCALE_TOLERANCE]
                     [--on-cpu-convert-precision]
                     [--min-loss-scale MIN_LOSS_SCALE]
                     [--threshold-loss-scale THRESHOLD_LOSS_SCALE] [--amp]
                     [--amp-ba

In [52]:
%run ping 8.8.8.8

Exception: File `'ping'` not found.

In [None]:
from subprocess import Popen, PIPE, STDOUT

from IPython.core.magic import register_line_magic


@register_line_magic
def runrealcmd(command):
    process = Popen(command, stdout=PIPE, shell=True, stderr=STDOUT, bufsize=1, close_fds=True)
    for line in iter(process.stdout.readline, b''):
        print(line.rstrip().decode('utf-8'))
    process.stdout.close()
    process.wait()