In [17]:
import os
import gzip
from tqdm import tqdm

def txt_pairs_to_tsv_gz(inputs1, inputs2, out, ignore_assertation_keys=[], n_samples=-1):
    assert type(inputs1) is list
    assert type(inputs2) is list
    assert 'en' in inputs1[0]
    assert len(inputs1) == len(inputs2)
    
    os.makedirs(os.path.dirname(out), exist_ok=True)

    n = 0
    done = False
    with gzip.open(out, 'wt', encoding='utf8') if out.endswith('.gz') else open(out, 'w', encoding='utf8') as wf:
        for path1, path2 in tqdm(zip(inputs1, inputs2)):
            print(path1, path2)
            f1 = open(path1, 'r', encoding='utf8')
            f2 = open(path2, 'r', encoding='utf8')
            while True:
                line1 = f1.readline()
                line2 = f2.readline()

                if line1 and not line1.strip():
                    print(line1, line2)
                    continue

                if line2 and not line2.strip():
                    print(line1, line2)
                    continue

                if not line1:
                    if not any(k in path1 for k in ignore_assertation_keys):
                        assert not line2
                    break
                n += 1
                
                line1 = line1.strip().replace("\t", " ")
                line2 = line2.strip().replace("\t", " ")
                wf.write(f'{line1}\t{line2}\n')

                if n == n_samples:
                    done = True
                    break
                
            f1.close()
            f2.close()
            print(f'have processed {n} lines...')
            if done:
                break
    print(f'There are {n} lines')

# WMT 17: en-zh

1. download en-zh corpora from the [official website](https://www.statmt.org/wmt17/translation-task.html)
2. note that the original released `neu2017.zip` has a problem, you can download it from [here](https://huggingface.co/datasets/wmt/wmt18/tree/main/cwmt-wmt)
3. unzip all files, the resulting folders are as follows:
    - casia2015
    - casict2011
    - casict2015
    - datum2015
    - datum2017
    - neu2017
    - training
    - en-zh
4. run the following cells to prepare corpora for `sentence-transformers`

In [19]:
import glob

en = sorted(glob.glob('./*/*_en.txt') + glob.glob('./*/*en-zh.en') + glob.glob('./*/*zh-en.en'))
zh = sorted(glob.glob('./*/*_ch.txt') + glob.glob('./*/*en-zh.zh') + glob.glob('./*/*zh-en.zh') + glob.glob('./*/*_cn.txt'))

txt_pairs_to_tsv_gz(
    en, zh, 
    'corpus/wmt/1M_en-zh.tsv',
    n_samples=1111391,
)

0it [00:00, ?it/s]

.\casia2015\casia2015_en.txt .\casia2015\casia2015_ch.txt


1it [00:10, 10.23s/it]

have processed 1050000 lines...
.\casict2011\casict-A_en.txt .\casict2011\casict-A_ch.txt


1it [00:10, 10.61s/it]

have processed 1111391 lines...
There are 1111391 lines





# WMT 14: En-Fr

1. download en-zh corpora from the [official website](https://www.statmt.org/wmt14/translation-task.html)
2. unzip all files, the resulting folders and files are as follows:
    - training/
    - un/
    - giga-fren.release2.fixed.en
    - giga-fren.release2.fixed.fr
3. run the following cells to prepare corpora for `sentence-transformers`

In [18]:
import glob

en = sorted(glob.glob('./*/*fr-en.en') + ['./giga-fren.release2.fixed.en'])
en = [item for item in en if 'news-commentary-v12' not in item] # news-commentary-v12 is from WMT17

fr = sorted(glob.glob('./*/*fr-en.fr') + ['./giga-fren.release2.fixed.fr'])
fr = [item for item in fr if 'news-commentary-v12' not in item] # news-commentary-v12 is from WMT17

assert len(en) == len(fr) == 5

txt_pairs_to_tsv_gz(
    en, fr, 
    'corpus/wmt/1M_en-fr.tsv',
    n_samples=1111391,
)

0it [00:00, ?it/s]

./giga-fren.release2.fixed.en ./giga-fren.release2.fixed.fr
   
      

 
    

 
 Rosemary Brown, 1977 

 
 (28 novembre 2006). 

 
 100, comparativement à la moyenne nationale qui est de 13 p.

 
 Suivez ce lien pour enregistrer votre entreprise en ligne. 

• The Magnesium Home Page
  

 
 • Corporation commerciale canadienne (CCC) :

Français
  

 
 • Question 27 :

 
 • Nous avons respecté notre engagement.

 
 • Nous serons prêts à accueillir les quelque 10 000 journalistes accrédités.

 
 • Selon ce sondage, 58 p.

 
 100, ce qui est comparable aux années précédentes. 

•  
  

 
 100 de la population en 2001. 

"Canada is a nation of immigrants.
  

 
 •

 
 Les centres d'amitié autochtones reconnus par le Programme soumettent une demande directement à l'ANCA.

 
 Extrants :

 
 • D’autres phénomènes peuvent aussi être à l’origine d’erreurs.

 
 100) dans les provinces de l’Atlantique. 

 
 • Formation et développement professionnel pour jeunes artistes :

• Selective component:

0it [00:11, ?it/s]

have processed 1111391 lines...
There are 1111391 lines





# WMT 14: En-De

1. download en-zh corpora from the [official website](https://www.statmt.org/wmt14/translation-task.html)
3. unzip all files, the resulting folders and files are as follows:
    - training/
4. run the following cells to prepare corpora for `sentence-transformers`

In [20]:
import glob

en = sorted(glob.glob('./*/*de-en.en'))
en = [item for item in en if 'news-commentary-v12' not in item] # news-commentary-v12 is from WMT17

de = sorted(glob.glob('./*/*de-en.de'))
de = [item for item in de if 'news-commentary-v12' not in item] # news-commentary-v12 is from WMT17

assert len(en) == len(de) == 3

txt_pairs_to_tsv_gz(
    en, de, 
    'corpus/wmt/1M_en-de.tsv',
    n_samples=1111391,
)

0it [00:00, ?it/s]

.\training\commoncrawl.de-en.en .\training\commoncrawl.de-en.de


0it [00:13, ?it/s]

have processed 1111391 lines...
There are 1111391 lines





In [21]:
import random
random.seed(0)
n_samples=1111391

enligsh_sentences = []
for file in ['corpus/wmt/1M_en-zh.tsv', 'corpus/wmt/1M_en-de.tsv', 'corpus/wmt/1M_en-fr.tsv']:
    data = open(file, encoding='utf8').read().strip().split('\n')
    data = [line.split('\t')[0] for line in data]
    print(data[:3])
    enligsh_sentences.extend(data)

random.shuffle(enligsh_sentences)
print(enligsh_sentences[:3])

with open('corpus/wmt/1M_en.tsv', 'w', encoding='utf8') as wf:
    wf.write('\n'.join(enligsh_sentences[:n_samples]))

['The show stars the X Girls - a troupe of talented topless dancers, some of whom are classically trained.', 'The centerpiece of the show is a farcical rendition of Swan Lake in which male and female performers dance in pink tutus and imitate swans.', 'The removal of the barrier between performance and post-production was just as helpful for the actors.']
['iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould edges (corners) of the steel ingot mould.', 'iron cement protects the ingot against the hot, abrasive steel casting process.', 'a fire restant repair cement for fire places, ovens, open fireplaces etc.']
['Changing Lives | Changing Society | How It Works | Technology Drives Change Home | Concepts | Teachers | Search | Overview | Credits | HHCC Web | Reference | Feedback Virtual Museum of Canada Home Page', 'Site map', 'Feedback']
["As government spokesmen have pointed out, however, Sri Lanka's voters will at least have the chance to 