# Fr-En Translation
Generate translations on MuST-C dataset with WMT14 Fr-En Transformer model

In [1]:
import torch

# check if we can connect to the GPU with PyTorch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print('Current device:', torch.cuda.get_device_name(device))
else:
    print('Failed to find GPU. Will use CPU.')
    device = 'cpu'

Current device: GeForce GTX 1080 Ti


In [2]:
PATH="/export/data4/vzhekova/biases-data/Fr-En_MuST-C"

%cd $PATH

/export/data4/vzhekova/biases-data/Fr-En_MuST-C


# Data preprocessing

- Binarize data

In [8]:
# Binarize the data

# map words appearing less than threshold times to unknown 
# reuse model dict
!fairseq-preprocess \
    --source-lang fr \
    --target-lang en \
    --trainpref sw.train.fr-en \
    --validpref sw.dev.fr-en \
    --testpref sw.tst.fr-en \
    --srcdict wmt14_fr_en/dict.fr.txt \
    --tgtdict wmt14_fr_en/dict.en.txt \
    --destdir data-bin \
    --thresholdtgt 0 \
    --thresholdsrc 0 \
    --workers 8

2023-03-10 15:37:31 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=False, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='fr', srcdict='wmt14_fr_

# Translation

In [9]:
%cd $PATH

/export/data4/vzhekova/biases-data/Fr-En_MuST-C


- Beam search

In [10]:
# Generate translations
!fairseq-generate data-bin  \
    --task translation \
    --source-lang fr \
    --target-lang en \
    --path wmt14_fr_en/model.pt \
    --beam 5 \
    --batch-size 256 \
    --memory-efficient-fp16 \
    --remove-bpe=subword_nmt > fr-en.decode.log

2023-03-10 15:40:25 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

# Evaluation

In [11]:
# Extract the hypotheses and references from the decoding log file
!grep ^H fr-en.decode.log | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp.txt
!grep ^T fr-en.decode.log | sed 's/^T-//g' | cut -f 2 | sed 's/ @@//g' > ./ref.txt

!head ./hyp.txt
print("..........")
!head ./ref.txt

LEGO has made it accessible .
He will withdraw in silence .
I am constantly getting back on track .
The country is South Korea .
This does not limit our burden .
Why chose medicine ? &quot;
This does not undermine our struggle .
I sometimes go too strong .
It is getting worse .
So Scott takes this appeal .
..........
<<unk>> made it accessible .
He <<unk>> into silence .
He kept pointing here .
One is South Korea .
It &apos;s not our burden .
Why did I go into medicine ?
It &apos;s not our struggle .
I sometimes push too hard .
It &apos;s spreading virally .
So Scott gets this call .


In [12]:
# Detokenize text        

md_en = MosesDetokenizer(lang='en')
md_fr = MosesDetokenizer(lang='fr')

with open('hyp.txt', encoding='utf8') as fin, open('hyp_detok.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('ref.txt', encoding='utf8') as fin, open('ref_detok.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_fr.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished detokenizing.')

Finished detokenizing.


In [13]:
!head ./hyp_detok.txt
print("..........")
!head ./ref_detok.txt

LEGO has made it accessible.
He will withdraw in silence.
I am constantly getting back on track.
The country is South Korea.
This does not limit our burden.
Why chose medicine? "
This does not undermine our struggle.
I sometimes go too strong.
It is getting worse.
So Scott takes this appeal.
..........
<<unk>> made it accessible.
He <<unk>> into silence.
He kept pointing here.
One is South Korea.
It 's not our burden.
Why did I go into medicine ?
It 's not our struggle.
I sometimes push too hard.
It 's spreading virally.
So Scott gets this call.


In [14]:
# Evaluate the model
# BLEU score of 19.9 (beam=5)
!cat ./hyp_detok.txt | sacrebleu ./ref_detok.txt

{
 "name": "BLEU",
 "score": 19.9,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.3.1",
 "verbose_score": "60.7/33.3/20.3/12.7 (BP = 0.738 ratio = 0.767 hyp_len = 50992 ref_len = 66451)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.3.1"
}
[0m

# Finetuning WMT14 En-Fr model on MuST-C

In [15]:
!CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin \
    --arch transformer --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --keep-last-epochs 2 \
    --max-tokens 4096 \
    --max-epoch 5 \
    --finetune-from-model wmt14_fr_en/model.pt	

2023-03-10 15:44:18 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': N

- Translation

In [16]:
%cd $PATH

/export/data4/vzhekova/biases-data/Fr-En_MuST-C


In [18]:
# Generate translations
!fairseq-generate data-bin  \
    --task translation \
    --source-lang fr \
    --target-lang en \
    --path checkpoints/checkpoint_best.pt \
    --beam 5 \
    --batch-size 256 \
    --memory-efficient-fp16 \
    --remove-bpe=subword_nmt > fr-en.decode_finetuned.log

2023-03-11 11:27:51 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

- Evaluation

In [19]:
# Extract the hypotheses and references from the decoding log file
!grep ^H fr-en.decode_finetuned.log | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_finetuned.txt
!grep ^T fr-en.decode_finetuned.log | sed 's/^T-//g' | cut -f 2 | sed 's/ @@//g' > ./ref_finetuned.txt

!head ./hyp_finetuned.txt
print("..........")
!head ./ref_finetuned.txt

<unk> made it accessible .
He <unk> into silence .
He was telling me his <unk> all the time .
One is South Korea .
That &apos;s not our burden .
Why have we chosen medicine ? &quot;
It &apos;s not our fight .
I sometimes go too hard .
It &apos;s moving <unk> .
And so Scott makes that call .
..........
<<unk>> made it accessible .
He <<unk>> into silence .
He kept pointing here .
One is South Korea .
It &apos;s not our burden .
Why did I go into medicine ?
It &apos;s not our struggle .
I sometimes push too hard .
It &apos;s spreading virally .
So Scott gets this call .


In [20]:
# Detokenize text        

md_en = MosesDetokenizer(lang='en')
md_fr = MosesDetokenizer(lang='fr')

with open('hyp_finetuned.txt', encoding='utf8') as fin, open('hyp_finetuned_detok.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_en.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)
        
with open('ref_finetuned.txt', encoding='utf8') as fin, open('ref_finetuned_detok.txt','w', encoding='utf8') as fout:
    for line in fin:
        tokens = md_fr.detokenize(line.split(), return_str=True)
        print(tokens, end='\n', file=fout)

print('Finished detokenizing.')

Finished detokenizing.


In [21]:
# Evaluate the model
# BLEU score of 32.4 (beam=5)
!cat ./hyp_finetuned_detok.txt | sacrebleu ./ref_finetuned_detok.txt

{
 "name": "BLEU",
 "score": 32.4,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.3.1",
 "verbose_score": "72.2/48.8/31.1/18.2 (BP = 0.862 ratio = 0.871 hyp_len = 57866 ref_len = 66451)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.3.1"
}
[0m