# En-Fr Translation
Generate translations of the MuST-SHE dataset with the WMT14 En-Fr Transformer model.

In [2]:
import torch

In [2]:
%cd /export/data4/vzhekova/MuST-SHE_v1.2/MuST-SHE-v1.2-data/tsv

!cut -f5 MONOLINGUAL.fr_v1.2.tsv > data.en-fr.en
!cut -f6 MONOLINGUAL.fr_v1.2.tsv > data.en-fr.fr

/export/data4/vzhekova/MuST-SHE_v1.2/MuST-SHE-v1.2-data/tsv


In [3]:
PATH="/export/data4/vzhekova/biases-data/En-Fr"

%cd $PATH

/export/data4/vzhekova/biases-data/En-Fr


In [14]:
!echo -e "\nFirst lines of English:\n"
!head data.en-fr.en
!echo -e "\nFirst lines of French:\n"
!head data.en-fr.fr


First lines of English:

Now, I thought, "How could I really capture this?
I mean, from this entry, it would seem that I was born into a world that perceived someone like me to have nothing positive whatsoever going for them, when in fact, today I'm celebrated for the opportunities and adventures my life has procured.
So, I immediately went to look up the 2009 online edition, expecting to find a revision worth noting.
His name was Dr. Pizzutillo, an Italian American, whose name, apparently, was too difficult for most Americans to pronounce, so he went by Dr. P. And Dr. P always wore really colorful bow ties and had the very perfect disposition to work with children.
And, one day, he came in to my session — exhaustive and unforgiving, these sessions — and he said to me, "Wow.
Now, of course, this was a simple ploy on Dr. P's part to get me to do the exercises I didn't want to do before the prospect of being the richest five-year-old in the second floor ward, but what he effectively did

# Data preprocessing

- Tokenization

In [60]:
# Tokenize text; MuST-SHE dataset
from __future__ import print_function
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer, MosesDetokenizer

mpn = MosesPunctNormalizer()
mt_en = MosesTokenizer(lang='en')
mt_fr = MosesTokenizer(lang='fr')


def _tokenize_file(tokenizer, src_path, dst_path):
    """Normalize punctuation and Moses-tokenize a text file line by line.

    Args:
        tokenizer: a MosesTokenizer for the language of ``src_path``.
        src_path: path of the raw text file, one sentence per line.
        dst_path: path of the tokenized output file (overwritten).

    Both files are opened explicitly as UTF-8, matching the detokenization
    cells, so accented characters do not depend on the locale's default encoding.
    """
    with open(src_path, encoding='utf8') as fin, open(dst_path, 'w', encoding='utf8') as fout:
        for line in fin:
            # return_str=True yields the tokens joined by single spaces.
            tokens = tokenizer.tokenize(mpn.normalize(line), return_str=True)
            print(tokens, file=fout)


_tokenize_file(mt_en, 'data.en-fr.en', 'tok.data.en-fr.en')
_tokenize_file(mt_fr, 'data.en-fr.fr', 'tok.data.en-fr.fr')

print('Finished tokenizing.')

Finished tokenizing.


- Subword segmentation

In [None]:
# Train subword model
# (Kept disabled: the next cell reuses the bpecodes of the existing model instead of learning new ones.)
# Input names corrected to tok.data.en-fr.*, the files written by the tokenization cell;
# these commands previously pointed at data.en-fr.tok.*, which no visible cell creates.
#!subword-nmt learn-bpe -s 32000 < tok.data.en-fr.en > sw.model.en
#!subword-nmt learn-bpe -s 32000 < tok.data.en-fr.fr > sw.model.fr

In [33]:
# Apply subword model; reuse bpecodes from existing model
!subword-nmt apply-bpe -c wmt14.en-fr.joined-dict.transformer/bpecodes < tok.data.en-fr.en > sw.data.en-fr.en
!subword-nmt apply-bpe -c wmt14.en-fr.joined-dict.transformer/bpecodes < tok.data.en-fr.fr > sw.data.en-fr.fr

print('Finished subword.')

Finished subword.


In [34]:
!echo -e "\nFirst lines of tokenized English:\n"
!head sw.data.en-fr.en

!echo -e "\nFirst lines of tokenized French:\n"
!head sw.data.en-fr.fr


First lines of tokenized English:

Now , I thought , &quot; How could I really capture this ?
I mean , from this entry , it would seem that I was born into a world that perceived someone like me to have nothing positive whatsoever going for them , when in fact , today I &apos;m celebrated for the opportunities and adv@@ entures my life has procu@@ red .
So , I immediately went to look up the 2009 online edition , exp@@ ecting to find a revision worth noting .
His name was Dr. P@@ iz@@ z@@ uti@@ ll@@ o , an Italian American , whose name , apparently , was too difficult for most Americans to pron@@ ounce , so he went by Dr. P. And Dr. P always wor@@ e really color@@ ful bow ties and had the very perfect disposition to work with children .
And , one day , he came in to my session - exhaustive and un@@ for@@ giving , these sessions - and he said to me , &quot; W@@ ow .
Now , of course , this was a simple plo@@ y on Dr. P &apos;s part to get me to do the exercises I didn &apos;t want to do

- Binarize data

In [35]:
# Binarize the data for training; test with moses tokenizer and subword_nmt failed -> BLEU score of 4

# reuse model dict; solution for dictionary size to match test dataset 
# --tokenizer moses \
# --bpe subword_nmt \
!fairseq-preprocess \
    --source-lang en \
    --target-lang fr \
    --testpref sw.data.en-fr \
    --srcdict wmt14.en-fr.joined-dict.transformer/dict.en.txt \
    --tgtdict wmt14.en-fr.joined-dict.transformer/dict.fr.txt \
    --destdir data-bin \
    --workers 8

2023-03-22 11:38:01 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=False, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='en', srcdict='wmt14.en-

# Translation

In [36]:
%cd $PATH

/export/data4/vzhekova/biases-data/En-Fr


- Beam search

In [68]:
# Generate translations
!fairseq-generate data-bin  \
    --task translation \
    --source-lang en \
    --target-lang fr \
    --path wmt14.en-fr.joined-dict.transformer/model.pt \
    --beam 5 \
    --batch-size 256 \
    --memory-efficient-fp16 \
    --remove-bpe=subword_nmt > en-fr.decode.log

2023-03-22 12:43:45 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

- Top-K-Sampling

In [52]:
# Generate translations
# (Disabled alternative to the beam-search cell: sampling restricted to the top 5
#  candidates, with --nbest 5. It redirects to the same en-fr.decode.log as the
#  beam-search cell, so running it would overwrite that log.)
# !fairseq-generate data-bin  \
#     --task translation \
#     --source-lang en \
#     --target-lang fr \
#     --path wmt14.en-fr.joined-dict.transformer/model.pt \
#     --sampling \
#     --sampling-topk 5 \
#     --nbest 5 \
#     --batch-size 256 \
#     --memory-efficient-fp16 \
#     --remove-bpe=subword_nmt > en-fr.decode.log

# Evaluation

In [69]:
# Extract the hypotheses and references from the decoding log file
# Replacing subword segmentation: sed -r 's/(@@ )|(@@ ?$)//g'
!grep ^H en-fr.decode.log | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp.en-fr.txt
!grep ^T en-fr.decode.log | sed 's/^T-//g' | cut -f 2 | sed 's/ @@//g' > ./ref.en-fr.txt

!head ./hyp.en-fr.txt
print("..........")
!head ./ref.en-fr.txt

C&apos; est là que je suis né et que j&apos; ai passé les sept premières années de ma vie .
Elle a suivi un cours dans une école de commerce et elle est devenue médecin vétérinaire .
En tant que conservateur afro @-@ latino @-@ américain , je suis l&apos; un des rares .
Je suis heureux de dire qu&apos; au cours des trois dernières années , nous avons fait des progrès .
Après plusieurs séries d&apos; entrevues , je me suis inscrit au programme de bourses et j&apos; ai obtenu une bourse complète .
Je lui ai prescrit de la pénicilline et je l&apos; ai envoyé en route .
Je suis parmi tous ceux qui connaissent les précieuses contributions des dénonciateurs .
Bonjour . Je m&apos; appelle Aparna . Je suis un bourreau -
Et je pensais que c &quot; était super cool , alors je le montrais à mon ami .
Je suis quelqu&apos; un qui se passionne vraiment pour les choses , alors je vais tempérer ça .
..........
Je suis née ici même , et j&apos; y ai passé les sept premières années de ma vie .
Elle a fa

In [29]:
# Extract only the first hypothesis
!awk 'NR % 5 == 1' ./hyp.en-fr.txt > ./hyp.en-fr.best.txt

In [70]:
# Detokenize the extracted French hypotheses and references, line by line.
md_fr = MosesDetokenizer(lang='fr')

detok_jobs = (
    ('hyp.en-fr.txt', 'hyp_detok.en-fr.txt'),
    ('ref.en-fr.txt', 'ref_detok.en-fr.txt'),
)
for tokenized_path, plain_path in detok_jobs:
    with open(tokenized_path, encoding='utf8') as reader, open(plain_path, 'w', encoding='utf8') as writer:
        for raw_line in reader:
            # Split on whitespace to get the token list, then let Moses join it back into text.
            writer.write(md_fr.detokenize(raw_line.split(), return_str=True) + '\n')

print('Finished detokenizing.')

Finished detokenizing.


In [66]:
!head ./hyp_detok.en-fr.txt
print("..........")
!head ./ref_detok.en-fr.txt

C'est là que je suis né et que j'ai passé les sept premières années de ma vie.
Elle a suivi un cours dans une école de commerce et elle est devenue médecin vétérinaire.
En tant que conservateur afro-latino-américain, je suis l'un des rares.
Je suis heureux de dire qu'au cours des trois dernières années, nous avons fait des progrès.
Après plusieurs séries d'entrevues, je me suis inscrit au programme de bourses et j'ai obtenu une bourse complète.
Je lui ai prescrit de la pénicilline et je l'ai envoyé en route.
Je suis parmi tous ceux qui connaissent les précieuses contributions des dénonciateurs.
Bonjour. Je m'appelle Aparna. Je suis un bourreau -
Et je pensais que c "était super cool, alors je le montrais à mon ami.
Je suis quelqu'un qui se passionne vraiment pour les choses, alors je vais tempérer ça.
..........
Je suis née ici même, et j'y ai passé les sept premières années de ma vie.
Elle a fait une école de commerce puis est devenue vétérinaire.
Les conservatrices afro-latinas ne so

- BLEU score (sacreBLEU expects detokenized input)

In [71]:
# Evaluate the model
# BLEU score of 18.7 (beam=5), BLEU score of 18.2 (beam=1)
# BLEU score of 37.2 (beam=5) when using bpecodes

# BLEU score of 15.9 (sampling-topk=5)
!cat ./hyp_detok.en-fr.txt | sacrebleu ./ref_detok.en-fr.txt

{
 "name": "BLEU",
 "score": 37.2,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.3.1",
 "verbose_score": "63.4/42.6/31.0/22.8 (BP = 1.000 ratio = 1.059 hyp_len = 34334 ref_len = 32422)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.3.1"
}
[0m

In [33]:
!fairseq-score \
    --sacrebleu \
    --sys ./hyp_detok.en-fr.txt \
    --ref ./ref_detok.en-fr.txt

Namespace(ignore_case=False, order=4, ref='./ref_detok.en-fr.txt', sacrebleu=True, sentence_bleu=False, sys='./hyp_detok.en-fr.txt')
Traceback (most recent call last):
  File "/home/vzhekova/miniconda3/envs/nmt/bin/fairseq-score", line 8, in <module>
    sys.exit(cli_main())
  File "/home/vzhekova/fairseq/fairseq_cli/score.py", line 98, in cli_main
    score(f)
  File "/home/vzhekova/fairseq/fairseq_cli/score.py", line 61, in score
    print(sacrebleu.corpus_bleu(fdsys, [fdref]).format())
  File "/home/vzhekova/miniconda3/envs/nmt/lib/python3.8/site-packages/sacrebleu/compat.py", line 37, in corpus_bleu
    return metric.corpus_score(hypotheses, references)
  File "/home/vzhekova/miniconda3/envs/nmt/lib/python3.8/site-packages/sacrebleu/metrics/base.py", line 414, in corpus_score
    self._check_corpus_score_args(hypotheses, references)
  File "/home/vzhekova/miniconda3/envs/nmt/lib/python3.8/site-packages/sacrebleu/metrics/base.py", line 258, in _check_corpus_score_args
    raise Type

- Gender accuracy

In [58]:
# Evaluate gender accuracy; 0.52
!python /export/data4/vzhekova/MuST-SHE_v1.2/MuST-SHE-v1.1-eval-script/mustshe_acc_v1.1.py \
    --input hyp.en-fr.txt \
    --tsv-definition /export/data4/vzhekova/MuST-SHE_v1.2/MuST-SHE-v1.2-data/tsv/MONOLINGUAL.fr_v1.2.tsv

Category	Term Coverage	Gender Accuracy
-------------------------------------------------
1F	0.04716981132075472	0.4782608695652174
1M	0.07073170731707316	0.6666666666666666
2F	0.08227848101265822	0.45098039215686275
2M	0.13009708737864079	0.5616438356164384
3F	0.0	0.0
3M	0.0	0.0
4F	0.14814814814814814	0.2
4M	0.125	0.375
-------------------------------------------------
Global	0.08656873032528856	0.5233160621761658
-------------------------------------------------


# Translation with fine-tuned model

In [41]:
%cd $PATH

/export/data4/vzhekova/biases-data/En-Fr


In [42]:
# Generate translations
!fairseq-generate data-bin  \
    --task translation \
    --source-lang en \
    --target-lang fr \
    --path /export/data4/vzhekova/biases-data/En-Fr_MuST-C/checkpoints/checkpoint_best.pt \
    --beam 5 \
    --batch-size 256 \
    --memory-efficient-fp16 \
    --remove-bpe=subword_nmt > en-fr.decode_finetuned.log

2023-03-22 11:43:12 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': 

In [43]:
# Extract the hypotheses and references from the decoding log file
!grep ^H en-fr.decode_finetuned.log | sed 's/^H-//g' | cut -f 3 | sed 's/ @@//g' > ./hyp_finetuned.txt
!grep ^T en-fr.decode_finetuned.log | sed 's/^T-//g' | cut -f 2 | sed 's/ @@//g' > ./ref_finetuned.txt

!head ./hyp_finetuned.txt
print("..........")
!head ./ref_finetuned.txt

C&apos; est sur ces terrains que je suis née et que j&apos; ai passé les sept premières années de ma vie .
Elle a suivi un cours dans une école de commerce , et elle est devenue vétérinaire .
En tant que conservateur afro-latin , je suis l&apos; un des rares .
Je suis ravi de pouvoir dire qu&apos; au cours des trois dernières années , nous avons fait des progrès .
Après de nombreux entretiens , j&apos; ai rejoint le programme de bourses avec une bourse complète .
Et je lui ai donné une ordonnance de pénicilline et je l&apos; ai envoyé sur son chemin .
Je sais les contributions précieuses que les dénonciateurs font .
Bonjour . Je m&apos; appelle Aparna .
Et je pensais que c&apos; était super cool , alors je le montrais à mon ami .
Je suis quelqu&apos; un qui est vraiment excité par les choses , alors je l&apos; atténuerais .
..........
Je suis née ici même , et j&apos; y ai passé les sept premières années de ma vie .
Elle a fait une école de commerce puis est devenue vétérinaire .
Les c

In [48]:
# Detokenize the fine-tuned model's French hypotheses and the references, line by line.
md_fr = MosesDetokenizer(lang='fr')

detok_jobs = (
    ('hyp_finetuned.txt', 'hyp_finetuned_detok.txt'),
    ('ref_finetuned.txt', 'ref_finetuned_detok.txt'),
)
for tokenized_path, plain_path in detok_jobs:
    with open(tokenized_path, encoding='utf8') as reader, open(plain_path, 'w', encoding='utf8') as writer:
        for raw_line in reader:
            # Split on whitespace to get the token list, then let Moses join it back into text.
            writer.write(md_fr.detokenize(raw_line.split(), return_str=True) + '\n')

print('Finished detokenizing.')

Finished detokenizing.


In [49]:
# Evaluate the model
# BLEU score of 27.0 (beam=5)
# BLEU score of 39.6 (beam=5) when using bpecodes
!cat ./hyp_finetuned_detok.txt | sacrebleu ./ref_finetuned_detok.txt

{
 "name": "BLEU",
 "score": 39.6,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.3.1",
 "verbose_score": "65.7/45.3/33.3/24.9 (BP = 1.000 ratio = 1.037 hyp_len = 33619 ref_len = 32422)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.3.1"
}
[0m

In [12]:
# Evaluate gender accuracy; 0.49
!python /export/data4/vzhekova/MuST-SHE_v1.2/MuST-SHE-v1.1-eval-script/mustshe_acc_v1.1.py \
    --input hyp_finetuned_detok.txt \
    --tsv-definition /export/data4/vzhekova/MuST-SHE_v1.2/MuST-SHE-v1.2-data/tsv/MONOLINGUAL.fr_v1.2.tsv

Category	Term Coverage	Gender Accuracy
-------------------------------------------------
1F	0.030660377358490566	0.375
1M	0.06829268292682927	0.6571428571428571
2F	0.10548523206751055	0.39285714285714285
2M	0.11262135922330097	0.5285714285714286
3F	0.0	0.0
3M	0.0	0.0
4F	0.07407407407407407	0.0
4M	0.16666666666666666	0.5555555555555556
-------------------------------------------------
Global	0.08342077649527807	0.4946808510638298
-------------------------------------------------
