# Prepare Environment

Check whether the notebook is running on Google Colab and, if so, do the additional setup:
- Install `fairseq` and `tqdm`

In [2]:
from IPython.core import getipython

is_colab = 'google.colab' in str(getipython.get_ipython())

if is_colab:
  !pip install fairseq tqdm
  print("Fairseq installation successful (if no errors occurred)")
else:
  print("Notebook is not on Colab. Fairseq installation not attempted.")


Notebook is not on Colab. Fairseq installation not attempted.


In [1]:
# Print the installed PyTorch version so the environment is visible in the cell output
import torch
print(torch.__version__)

2.3.1


# Data Preparation

In [6]:
# Dataset root: a path under drive/MyDrive when running on Colab, the local 'data' folder otherwise.
if is_colab:
    DATA_DIR = 'drive/MyDrive/Dataset/bifi-dataset'
else:
    DATA_DIR = 'data'

# Supporting functions

In [7]:
import shlex
import subprocess
import sys

## fairseq_preprocess

In [5]:
def fairseq_preprocess(src, tgt, destdir, trainpref=None, validpref=None, testpref=None, srcdict=None, **kwargs):
    additional_cmds = ''.join([f"--{k.replace('_', '-')} {v} " for k, v in kwargs.items() if not isinstance(v, bool)])
    additional_cmds += ''.join([f"--{k.replace('_', '-')} " for k, v in kwargs.items() if isinstance(v, bool) and v])
    cmd = f'fairseq-preprocess --source-lang {src} --destdir {destdir} \
            --joined-dictionary --workers 50 --no-progress-bar --log-interval 20 '
    if tgt is not None:
        cmd += f'--target-lang {tgt} '
    if trainpref is not None:
        cmd += f'--trainpref {trainpref} '
    if validpref is not None:
        cmd += f'--validpref {validpref} '
    if testpref is not None:
        cmd += f'--testpref {testpref} '
    if srcdict is not None:
        cmd += f'--srcdict {srcdict} '
    cmd += additional_cmds

    # Execute command line command
    print(cmd)
    !{cmd}

## fairseq_train

In [13]:
def fairseq_train(GPUs, preprocess_dir, save_dir, logfile, src, tgt, model='transformer',
                  criterion='label_smoothed_cross_entropy',
                  encoder_layers=4, decoder_layers=4, encoder_embed_dim=256,
                  decoder_embed_dim=256, encoder_ffn_embed_dim=1024,
                  decoder_ffn_embed_dim=1024, encoder_attention_heads=8,
                  decoder_attention_heads=8, dropout=0.4,
                  attention_dropout=0.2, relu_dropout=0.2,
                  weight_decay=0.0001, warmup_updates=400, warmup_init_lr=1e-4,
                  lr=1e-3, min_lr=1e-9, max_tokens=1000, update_freq=4,
                  max_epoch=10, save_interval=1, log_interval=100, log_format='tqdm',
                  user_dir=None, reset=False, restore_file=None, **kwargs):
    """Assemble a ``fairseq-train`` shell command, print it, and return it.

    The command is NOT executed here; run the printed/returned string from a
    separate cell (for example ``!{cmd}``).

    Args:
        GPUs: when not ``None``, the command is prefixed with
            ``CUDA_VISIBLE_DEVICES=<GPUs>``.
        preprocess_dir: positional data directory argument of ``fairseq-train``.
        save_dir: value for ``--save-dir``.
        logfile: when not ``None``, the file is (re)created with a short header
            (host name, pid, value of the ``STY`` environment variable) and
            ``2>&1 | tee -a <logfile>`` is appended to the command.
        src, tgt: values for ``--source-lang`` / ``--target-lang``.
        model: value for ``--arch``.
        min_lr: value for ``--min-lr``; pass ``None`` to leave the flag out.
            NOTE(review): the hand-written commands elsewhere in this notebook
            drop ``--min-lr`` - confirm whether the installed fairseq accepts it.
        log_interval, log_format: values for ``--log-interval`` / ``--log-format``.
        user_dir, restore_file: values for ``--user-dir`` / ``--restore-file``;
            each is omitted when ``None``.
        reset: when true, adds ``--reset-optimizer --reset-lr-scheduler
            --reset-dataloader --reset-meters``.
        **kwargs: extra options. ``some_name=value`` becomes ``--some-name value``;
            ``some_flag=True`` becomes a bare ``--some-flag``; ``False`` and
            ``None`` values are dropped.

        Every other keyword parameter is substituted into the fairseq option of
        the same name (underscores replaced by dashes).

    Returns:
        The complete command line as a single string.

    Values are interpolated verbatim (no shell quoting).
    """
    parts = [
        'fairseq-train',
        f'{preprocess_dir}',
        f'--source-lang {src} --target-lang {tgt}',
        f'--arch {model} --share-all-embeddings',
        f'--encoder-layers {encoder_layers} --decoder-layers {decoder_layers}',
        f'--encoder-embed-dim {encoder_embed_dim} --decoder-embed-dim {decoder_embed_dim}',
        f'--encoder-ffn-embed-dim {encoder_ffn_embed_dim} --decoder-ffn-embed-dim {decoder_ffn_embed_dim}',
        f'--encoder-attention-heads {encoder_attention_heads} --decoder-attention-heads {decoder_attention_heads}',
        '--encoder-normalize-before --decoder-normalize-before',
        f'--dropout {dropout} --attention-dropout {attention_dropout} --relu-dropout {relu_dropout}',
        f'--weight-decay {weight_decay}',
        f'--criterion {criterion}',
        "--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1",
        f'--lr-scheduler inverse_sqrt --warmup-updates {warmup_updates} --warmup-init-lr {warmup_init_lr}',
        f'--lr {lr}',
    ]
    if min_lr is not None:
        parts.append(f'--min-lr {min_lr}')
    parts += [
        f'--max-tokens {max_tokens}',
        f'--update-freq {update_freq}',
        f'--max-epoch {max_epoch} --save-interval {save_interval} --save-dir {save_dir}',
        # These two parameters were accepted but never reached the command line before.
        f'--log-interval {log_interval} --log-format {log_format}',
    ]
    if user_dir is not None:
        parts.append(f'--user-dir {user_dir}')
    if restore_file is not None:
        parts.append(f'--restore-file {restore_file}')
    if reset:
        parts.append('--reset-optimizer --reset-lr-scheduler --reset-dataloader --reset-meters')

    # Pass-through options: valued options first, then boolean switches.
    extras = {key.replace('_', '-'): value for key, value in kwargs.items() if value is not None}
    parts += [f'--{flag} {value}' for flag, value in extras.items() if not isinstance(value, bool)]
    parts += [f'--{flag}' for flag, value in extras.items() if value is True]

    if logfile is not None:
        import os
        import socket
        # Start the log with a header; the training output is appended after it by `tee -a`.
        with open(logfile, 'w') as outf:
            print(socket.gethostname(), file=outf)
            print("pid:", os.getpid(), file=outf)
            # Read STY straight from the environment instead of spawning `echo $STY`.
            print("screen: %s" % os.environ.get('STY', ''), file=outf)
        parts.append(f'2>&1 | tee -a {logfile}')

    cmd = ' '.join(parts)
    if GPUs is not None:
        cmd = f'CUDA_VISIBLE_DEVICES={GPUs} {cmd}'

    print(cmd)
    return cmd

# Round 0

In [4]:
from pathlib import Path

# Directory layout for round 0, everything rooted at DATA_DIR.
data_dir = Path(DATA_DIR)
round_dir = data_dir.joinpath('round0')
data_paired_dir = round_dir.joinpath('data_paired')
fairseq_dir = data_paired_dir.joinpath('fairseq_preprocess')

NameError: name 'DATA_DIR' is not defined

## Preprocess

In [None]:
# Remove fairseq dir (pure Python: needs no `rm` binary and is a no-op when the directory is absent)
import shutil

if fairseq_dir.exists():
    shutil.rmtree(fairseq_dir)

In [12]:
# Run fairseq-preprocess on the round-0 paired data (train/dev prefixes),
# using token_vocab.txt from the dataset root as --srcdict.
preprocess_paths = dict(
    destdir=str(fairseq_dir),
    trainpref=str(data_paired_dir / 'train'),
    validpref=str(data_paired_dir / 'dev'),
    srcdict=str(data_dir / 'token_vocab.txt'),
)
fairseq_preprocess(src='bad', tgt='good', workers=20, **preprocess_paths)

fairseq-preprocess --source-lang bad --destdir data\round0\data_paired\fairseq_preprocess             --joined-dictionary --workers 50 --no-progress-bar --log-interval 20 --target-lang good --trainpref data\round0\data_paired\train --validpref data\round0\data_paired\dev --srcdict data\token_vocab.txt --workers 20 
^C


## Train

In [14]:
# Train
# --gpu_id 0 --max_epoch 2
gpu_id = 0
max_epoch = 2

save_dir = round_dir/'model-fixer'
# parents=True: also create round_dir when it does not exist yet
save_dir.mkdir(parents=True, exist_ok=True)

fairseq_train(gpu_id, str(fairseq_dir), str(save_dir), str(save_dir/'train.log.txt'),
                    src='bad', tgt='good',
                    criterion='label_smoothed_cross_entropy', label_smoothing=0.1,
                    lr=1e-3, warmup_init_lr=1e-4, memory_efficient_fp16=True,
                    encoder_layers=4, decoder_layers=4, encoder_embed_dim=256, decoder_embed_dim=256,
                    encoder_ffn_embed_dim=1024, decoder_ffn_embed_dim=1024,
                    max_tokens=13500, update_freq=2,
                    max_epoch=max_epoch, save_interval_updates=10000, num_workers=4,
                )

CUDA_VISIBLE_DEVICES=0  fairseq-train                 data\round0\data_paired\fairseq_preprocess                --source-lang bad --target-lang good                --arch transformer --share-all-embeddings                --encoder-layers 4 --decoder-layers 4                --encoder-embed-dim 256 --decoder-embed-dim 256                --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024                --encoder-attention-heads 8 --decoder-attention-heads 8                --encoder-normalize-before --decoder-normalize-before                --dropout 0.4 --attention-dropout 0.2 --relu-dropout 0.2                --weight-decay 0.0001                --criterion label_smoothed_cross_entropy                --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1                --lr-scheduler inverse_sqrt --warmup-updates 400 --warmup-init-lr 0.0001                --lr 0.001 --min-lr 1e-09                --max-tokens 13500                --update-freq 2                --max-epoch 2 --sa

# Draft

In [None]:
import subprocess
import sys

# Stream `fairseq-preprocess --help` into the cell output line by line.
# The context manager closes the stdout pipe and waits for the child on exit.
with subprocess.Popen(["fairseq-preprocess", "--help"], stdout=subprocess.PIPE,
                      universal_newlines=True) as process:
  for line in process.stdout:
    print(line, end='')  # each line already ends with its own newline
    sys.stdout.flush()  # Flush output buffer to display immediately

# Exit status of the finished process (displayed as the cell result)
process.returncode


usage: fairseq-preprocess [-h] [--no-progress-bar] [--log-interval LOG_INTERVAL]
                          [--log-format {json,none,simple,tqdm}] [--log-file LOG_FILE]
                          [--aim-repo AIM_REPO] [--aim-run-hash AIM_RUN_HASH]
                          [--tensorboard-logdir TENSORBOARD_LOGDIR]
                          [--wandb-project WANDB_PROJECT] [--azureml-logging] [--seed SEED]
                          [--cpu] [--tpu] [--bf16] [--memory-efficient-bf16] [--fp16]
                          [--memory-efficient-fp16] [--fp16-no-flatten-grads]
                          [--fp16-init-scale FP16_INIT_SCALE]
                          [--fp16-scale-window FP16_SCALE_WINDOW]
                          [--fp16-scale-tolerance FP16_SCALE_TOLERANCE]
                          [--on-cpu-convert-precision] [--min-loss-scale MIN_LOSS_SCALE]
                          [--threshold-loss-scale THRESHOLD_LOSS_SCALE] [--amp]
                          [--amp-batch-retries AMP_BATCH_RETR

0

In [None]:
# Scratch run: fairseq-preprocess with the Colab Drive paths written out by hand.
!fairseq-preprocess --source-lang bad \
  --destdir drive/MyDrive/Dataset/bifi-dataset/round_0/data_paired/fairseq_preprocess \
  --joined-dictionary --workers 50 --no-progress-bar --log-interval 20 \
  --target-lang good \
  --trainpref drive/MyDrive/Dataset/bifi-dataset/round_0/data_paired/train \
  --validpref drive/MyDrive/Dataset/bifi-dataset/round_0/data_paired/dev \
  --srcdict drive/MyDrive/Dataset/bifi-dataset/token_vocab.txt \
  --workers 20


2024-07-09 06:24:28.857910: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-09 06:24:28.857974: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-09 06:24:28.859707: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-09 06:24:28.868270: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-09 06:24:33 | INFO | fairseq.tasks.text_to_sp

In [None]:
# Scratch run: second invocation of fairseq-preprocess with the Colab Drive paths.
!fairseq-preprocess --source-lang bad \
  --destdir drive/MyDrive/Dataset/bifi-dataset/round_0/data_paired/fairseq_preprocess \
  --joined-dictionary --workers 50 --no-progress-bar --log-interval 20 \
  --target-lang good \
  --trainpref drive/MyDrive/Dataset/bifi-dataset/round_0/data_paired/train \
  --validpref drive/MyDrive/Dataset/bifi-dataset/round_0/data_paired/dev \
  --srcdict drive/MyDrive/Dataset/bifi-dataset/token_vocab.txt \
  --workers 20


2024-07-09 06:24:54.755732: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-09 06:24:54.755815: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-09 06:24:54.757430: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-09 06:24:54.765945: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-09 06:24:59 | INFO | fairseq.tasks.text_to_sp

In [None]:
# Scratch run: fairseq-train on GPU 0 with Colab Drive paths; output is appended to train.log.txt.
!CUDA_VISIBLE_DEVICES=0 fairseq-train \
  drive/MyDrive/Dataset/bifi-dataset/round_0/data_paired/fairseq_preprocess \
  --source-lang bad --target-lang good \
  --arch transformer --share-all-embeddings \
  --encoder-layers 4 --decoder-layers 4 \
  --encoder-embed-dim 256 --decoder-embed-dim 256 \
  --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
  --encoder-attention-heads 8 --decoder-attention-heads 8 \
  --encoder-normalize-before --decoder-normalize-before \
  --dropout 0.4 --attention-dropout 0.2 --relu-dropout 0.2 \
  --weight-decay 0.0001 \
  --criterion label_smoothed_cross_entropy \
  --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1 \
  --lr-scheduler inverse_sqrt --warmup-updates 400 --warmup-init-lr 0.0001 \
  --lr 0.001 --min-lr 1e-09 \
  --max-tokens 13500 \
  --update-freq 2 \
  --max-epoch 2 --save-interval 1 --save-dir drive/MyDrive/Dataset/bifi-dataset/round_0/model-fixer \
  --label-smoothing 0.1 --save-interval-updates 10000 --num-workers 4 --memory-efficient-fp16 \
  2>&1 | tee -a drive/MyDrive/Dataset/bifi-dataset/round_0/model-fixer/train.log.txt

2024-07-09 09:00:37.263366: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-09 09:00:37.263428: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-09 09:00:37.265663: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-09 09:00:37.274412: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-09 09:00:40 | INFO | numexpr.utils | NumExpr 

In [None]:
# Scratch run: fairseq-train on GPU 0 with Colab Drive paths, one option group per line.
# Unlike the one-line variant in the previous cell, this command passes no --min-lr.
!CUDA_VISIBLE_DEVICES=0 fairseq-train \
  ./drive/MyDrive/Dataset/bifi-dataset/round_0/data_paired/fairseq_preprocess \
  --source-lang bad --target-lang good \
  --arch transformer --share-all-embeddings \
  --encoder-layers 4 --decoder-layers 4 \
  --encoder-embed-dim 256 --decoder-embed-dim 256 \
  --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
  --encoder-attention-heads 8 --decoder-attention-heads 8 \
  --encoder-normalize-before --decoder-normalize-before \
  --dropout 0.4 --attention-dropout 0.2 --relu-dropout 0.2 \
  --weight-decay 0.0001 \
  --criterion label_smoothed_cross_entropy \
  --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1 \
  --lr-scheduler inverse_sqrt --warmup-updates 400 --warmup-init-lr 0.0001 \
  --lr 0.001 \
  --max-tokens 13500 \
  --update-freq 2 \
  --max-epoch 2 --save-interval 1 --save-dir drive/MyDrive/Dataset/bifi-dataset/round_0/model-fixer \
  --label-smoothing 0.1 --save-interval-updates 10000 --num-workers 4 --memory-efficient-fp16 \
  2>&1 | tee -a drive/MyDrive/Dataset/bifi-dataset/round_0/model-fixer/train.log.txt

2024-07-09 11:10:48.377172: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-09 11:10:48.377247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-09 11:10:48.379746: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-09 11:10:48.389850: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-09 11:10:51 | INFO | numexpr.utils | NumExpr 

In [3]:
# Print the fairseq-train usage text to see which options are available
!fairseq-train -h

usage: fairseq-train [-h] [--no-progress-bar] [--log-interval LOG_INTERVAL]
                     [--log-format {json,none,simple,tqdm}]
                     [--log-file LOG_FILE] [--aim-repo AIM_REPO]
                     [--aim-run-hash AIM_RUN_HASH]
                     [--tensorboard-logdir TENSORBOARD_LOGDIR]
                     [--wandb-project WANDB_PROJECT] [--azureml-logging]
                     [--seed SEED] [--cpu] [--tpu] [--bf16]
                     [--memory-efficient-bf16] [--fp16]
                     [--memory-efficient-fp16] [--fp16-no-flatten-grads]
                     [--fp16-init-scale FP16_INIT_SCALE]
                     [--fp16-scale-window FP16_SCALE_WINDOW]
                     [--fp16-scale-tolerance FP16_SCALE_TOLERANCE]
                     [--on-cpu-convert-precision]
                     [--min-loss-scale MIN_LOSS_SCALE]
                     [--threshold-loss-scale THRESHOLD_LOSS_SCALE] [--amp]
                     [--amp-batch-retries AMP_BATCH_RETRIE

In [None]:
# Scratch run: fairseq-train on the local data/round0 paths; output is appended to train.log.txt.
!fairseq-train \
    data/round0/data_paired/fairseq_preprocess \
    --source-lang bad --target-lang good \
    --arch transformer --share-all-embeddings \
    --encoder-layers 4 --decoder-layers 4 \
    --encoder-embed-dim 256 --decoder-embed-dim 256 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --dropout 0.4 --attention-dropout 0.2 --relu-dropout 0.2 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1 \
    --lr-scheduler inverse_sqrt --warmup-updates 400 --warmup-init-lr 0.0001 \
    --lr 0.001 \
    --max-tokens 13500 \
    --update-freq 2 \
    --max-epoch 2 --save-interval 1 --save-dir data/round0/model-fixer \
    --label-smoothing 0.1 --save-interval-updates 10000 \
    --num-workers 4 --memory-efficient-fp16 \
    2>&1 | tee -a data/round0/model-fixer/train.log.txt