## PyMarian Demo

Introducing Python bindings for Marian NMT.

* Source code: https://github.com/marian-nmt/marian-dev/ under the `src/python` directory
* PyPI: https://pypi.org/project/pymarian/


In [None]:
# List installed packages and keep only the rows mentioning pymarian.
!pip list | grep pymarian

In [22]:
# Print the command-line help of the `pymarian-eval` tool.
!pymarian-eval -h

usage: pymarian-eval [-h] [-m MODEL] [-v VOCAB] [-l {comet-qe,bleurt,comet}]
                     [-V] [-] [-t MT_FILE] [-s SRC_FILE] [-r REF_FILE]
                     [-f FIELD [FIELD ...]] [-o OUT] [-a {skip,append,only}]
                     [-w WIDTH] [--debug] [--fp16] [--mini-batch MINI_BATCH]
                     [-d [DEVICES ...] | -c CPU_THREADS] [-ws WORKSPACE] [-pc]
                     [--cache CACHE]

options:
  -h, --help            show this help message and exit
  -m MODEL, --model MODEL
                        Model name, or path. Known models: bleurt-20,
                        wmt20-comet-da, wmt20-comet-qe-da, wmt20-comet-qe-
                        da-v2, wmt21-comet-da, wmt21-comet-qe-da, wmt21-comet-
                        qe-mqm, wmt22-comet-da, wmt22-cometkiwi-da,
                        wmt23-cometkiwi-da-xl, wmt23-cometkiwi-da-xxl
                        (default: wmt22-cometkiwi-da)
  -v VOCAB, --vocab VOCAB
                        Vocabulary file (default

In [None]:
# NOTE(review): `?` is presumably not a valid `--echo` field and is passed so that
# sacrebleu reports which fields/systems exist for wmt23 en-ja — confirm.
!sacrebleu -t wmt23 -l en-ja --echo ?

In [23]:
# Echo the `src` and `GPT4-5shot` fields of the wmt23 en-ja test set, keep the first
# 20 lines, and pipe them into pymarian-eval using the wmt22-cometkiwi-da model.
!sacrebleu -t wmt23 -l en-ja --echo src GPT4-5shot | head -20 | pymarian-eval --stdin -m wmt22-cometkiwi-da 

INFO: Input field mappings: [0, 1]; expected: ('src', 'mt'), given: ['src', 'mt', 'ref']
INFO: CLI:	marian evaluate --quiet --model /mnt/home/tg/.cache/marian/metric/models--unbabel--wmt22-cometkiwi-da-marian/snapshots/2082e0fb2e7a3cde527dfd35ebb6dbf6a2e83db6/checkpoints/marian.model.bin --vocabs /mnt/home/tg/.cache/marian/metric/models--microsoft--infoxlm-large/snapshots/d616d637f0720deda963cebbfc630657d2b7d3ae/sentencepiece.bpe.spm /mnt/home/tg/.cache/marian/metric/models--microsoft--infoxlm-large/snapshots/d616d637f0720deda963cebbfc630657d2b7d3ae/sentencepiece.bpe.spm --width 4 --like comet-qe --mini-batch 16 --maxi-batch 256 --max-length 512 --max-length-crop true --workspace 8000 --average skip
0.8486
0.8494
0.8167
0.8300
0.8715
0.8487
0.8435
0.8735
0.9078
0.8822
0.8538
0.8846
0.8846
0.8842
0.8055
0.7314
0.5996
0.8883
0.8788
0.8065
INFO: Wrote 20 lines to <stdout>


In [None]:
import pymarian

# Report which pymarian version this kernel has imported.
print('pymarian', pymarian.__version__)

# Evaluator

In [None]:
from pathlib import Path

from huggingface_hub import hf_hub_download as hf_get
from pymarian import Evaluator

# Fetch the metric checkpoint and its vocabulary from the Hugging Face hub.
model_id = "marian-nmt/chrfoid-wmt23"
model = Path(hf_get(model_id, filename="checkpoints/marian.model.bin"))
vocab = Path(hf_get(model_id, filename="vocab.spm"))

evaluator = Evaluator.new(
    model_file=model,
    vocab_file=vocab,
    like='comet-qe',
    quiet=True,
    fp16=False,
)

# Each input line is one tab-separated "source<TAB>translation" pair.
srcs = ['Hello', 'Howdy']
mts = ['Howdy', 'Hello']
lines = ['\t'.join(pair) for pair in zip(srcs, mts)]

scores = evaluator.evaluate(lines)
for score in scores:
    print(f'{score:.4f}')

# release the GPU memory
del evaluator

---
# Translator API

In [24]:
import tarfile
# Import the submodule explicitly: a bare `import urllib` does not guarantee that
# `urllib.request` is loaded, so `urllib.request.urlopen` could raise AttributeError.
import urllib.request
from pathlib import Path

from pymarian import Translator

model_url = "http://data.statmt.org/romang/marian-regression-tests/models/wngt19.tar.gz"
model_dir = Path.home() / 'tmp' / 'marian-models'
model_file = str(model_dir / 'wngt19' / 'model.base.npz')
vocab_file = str(model_dir / 'wngt19' / 'en-de.spm')

# Download and unpack the archive only when the model file is not on disk yet.
if not Path(model_file).exists():
    print(f"Downloading {model_url} and extracting to {model_dir}")
    # The context manager closes the HTTP response once the archive has been read.
    with urllib.request.urlopen(model_url) as response:
        # "r|gz" reads the gzip-compressed tarball as a stream, without seeking.
        with tarfile.open(fileobj=response, mode="r|gz") as tar:
            # NOTE(review): extractall trusts the member paths of a downloaded archive
            # (fetched over plain http); consider an extraction filter where available.
            tar.extractall(path=model_dir)
    print("Downloaded and extracted model files")

# The same vocabulary file is passed for both entries of `vocabs`.
translator = Translator(models=model_file, vocabs=[vocab_file, vocab_file], quiet=True)
hyp = translator.translate("Hello. Good morning.")
print(hyp)


# release the GPUs
del translator

Hallo , Guten Morgen .


---
## Trainer

### Train an NMT Model


In [None]:

data_url = "https://textmt.blob.core.windows.net/www/data/marian-tests-data.tgz"
data_dir = Path.home() / 'tmp' / 'marian-tests-data/deu-eng'
data_dir.mkdir(parents=True, exist_ok=True)
vocab_file = data_dir / 'vocab.8k.spm'
train_src = data_dir / 'sample.5k.deu'
train_tgt = train_src.with_suffix('.eng')

if not train_tgt.exists():
    print(f"Downloading data package... to {data_dir}")
    with urllib.request.urlopen(data_url) as response:
        with tarfile.open(fileobj=response, mode="r|gz") as tar:
            tar.extractall(path=data_dir.parent.parent)
    print("Downloaded the data package")

!head -n4 {train_src} {train_tgt}

vocab_file = str(vocab_file)
train_src = str(train_src)
train_tgt = str(train_tgt)


In [None]:
from pymarian import Trainer
args = {
    'type': 'transformer',
    'dim_emb': 512,
    'after': '1000u',  # stop after 500 updates
    'valid_freq': '1000u',  # validate every 250 updates
    'disp_freq': 100,
    'disp_first': 4,
    'save_freq': '100u',
    'vocabs': [vocab_file, vocab_file],
    'train_sets': [train_src, train_tgt],
    'devices' :  'all',
    'quiet': False,
}

model_file = args['model'] = f'{data_dir.parent}/model.npz'


trainer = Trainer(**args)
trainer.train()


!ls -lh {model_file}

# release the GPUs
del trainer

### Train an Evaluator Model

In [None]:
vocab_file = data_dir / 'vocab.8k.spm'
classes_file = data_dir / 'classes4f.txt'
train_file = data_dir / 'sample.5k.chrfoid-deu-eng.tsv'

assert classes_file.exists()
assert vocab_file.exists()
assert train_file.exists()

!head -n4 {train_file} {classes_file}

In [None]:


# Training options for the evaluator model; passed to Trainer as keyword arguments.
args = {
    'dim_emb': 512,
    'enc_depth': 6,
    'dec_depth': 6,
    'tied_embeddings_all': True,
    'transformer_heads': 2,
    'transformer_dim_ffn': 256,
    'transformer_ffn_activation': 'relu',
    'transformer_dropout': 0.1,
    'cost_type': 'ce-mean',
    'max_length': 80,
    'mini_batch_fit': False,
    'maxi_batch': 256,
    'optimizer_params': [0.9, 0.98, '1e-09'],
    'sync_sgd': True,
    'learn_rate': 0.0003,
    'lr_decay_inv_sqrt': [16000],
    'lr_warmup': 16000,
    'label_smoothing': 0.1,
    'clip_norm': 0,
    'exponential_smoothing': 0.0001,
    'early_stopping': 2,
    'keep_best': True,
    'beam_size': 2,
    'normalize': 1,
    'valid_metrics': ['perplexity'],
    'valid_mini_batch': 16,
    'mini_batch': 8,
    'after': '400u',
    'valid_freq': '200u',
    'disp_freq': 100,
    'disp_first': 4,
    'save_freq': '200u',
    'quiet': False,
    #'like': 'comet-qe',   # only supported at inference; for training, see task and input_types
    'task': 'comet-qe',
    'input_types': ['class', 'sequence', 'sequence'],  # required for training
    #'pretrained_model': pretrained_model,     # for finetuning; not using it because it's too big for tests
    'train_sets': [train_file],  # TSV file having 3 columns: class sequence sequence
    'tsv': True,
    'tsv-fields': 3,  # or it will complain that vocabs and train_sets should be one to one map
    'vocabs': [classes_file, vocab_file, vocab_file],  # class sequence sequence
}

# Nothing else in this notebook creates the 'runs' directory, so create it here
# before training writes the model into it.
save_dir = data_dir.parent / 'runs'
save_dir.mkdir(parents=True, exist_ok=True)
save_at = str(save_dir / 'eval.model.npz')
trainer = Trainer(model=save_at, **args)
trainer.train()