In [None]:
# Mount Google Drive at /content/drive; the corpus and experiment paths used below all live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# select the source and target language code
source_language = "uz"
target_language = "tr" 

# this is for bilingual
experiment_name = "bilingual_baseline" 

os.environ["src"] = source_language 
os.environ["tgt"] = target_language
os.environ["tag"] = experiment_name

# This will save it to a folder in our gdrive instead!
!mkdir -p "/content/drive/My Drive/jim/experiments/$src-$tgt-$tag"
os.environ["gdrive_path"] = "/content/drive/My Drive/jim/experiments/%s-%s-%s" % (source_language, target_language, experiment_name)
os.environ['data_path'] = "/content/drive/My Drive/corpus/train"

os.environ['dev_path'] = "/content/drive/My Drive/corpus/dev"
os.environ['test_bible'] = "/content/drive/My Drive/corpus/test/bible"
#os.environ['test_ted'] = "/content/drive/My Drive/corpus/test_set/ted"
#os.environ['test_wmt'] = "/content/drive/My Drive/corpus/test_set/x-wmt"



In [None]:
# Sanity check: print the experiment and training-data paths to confirm they are set as expected.
!echo "$gdrive_path"
!echo "$data_path"


In [None]:
# Training files are laid out as <data_path>/<src>-<tgt>/<src>-<tgt>.<language code>.
source_path = f"{os.environ['data_path']}/{source_language}-{target_language}/{source_language}-{target_language}.{source_language}"
target_path = f"{os.environ['data_path']}/{source_language}-{target_language}/{source_language}-{target_language}.{target_language}"

# Read inside `with` so the file handles are closed as soon as the lines are loaded, and decode
# explicitly instead of relying on the platform default (assumes the corpus is UTF-8 encoded).
# readlines() keeps each line's trailing newline.
with open(source_path, "r", encoding="utf-8") as source_handle:
    source = source_handle.readlines()
with open(target_path, "r", encoding="utf-8") as target_handle:
    target = target_handle.readlines()

# The two sides are paired by line number, so a length mismatch means the files are misaligned.
assert len(source) == len(target), (
    f"Training files are not line-aligned: {len(source)} source vs {len(target)} target lines"
)

print(f"Found a total of training {len(source)} samples!")


In [None]:
# Development files are laid out as <dev_path>/<src>-<tgt>/<src>-<tgt>.<language code>.
dev_source_path = f"{os.environ['dev_path']}/{source_language}-{target_language}/{source_language}-{target_language}.{source_language}"
dev_target_path = f"{os.environ['dev_path']}/{source_language}-{target_language}/{source_language}-{target_language}.{target_language}"

# Read inside `with` so the file handles are closed as soon as the lines are loaded, and decode
# explicitly instead of relying on the platform default (assumes the corpus is UTF-8 encoded).
# readlines() keeps each line's trailing newline.
with open(dev_source_path, "r", encoding="utf-8") as dev_source_handle:
    dev_source = dev_source_handle.readlines()
with open(dev_target_path, "r", encoding="utf-8") as dev_target_handle:
    dev_target = dev_target_handle.readlines()

# The two sides are paired by line number, so a length mismatch means the files are misaligned.
assert len(dev_source) == len(dev_target), (
    f"Dev files are not line-aligned: {len(dev_source)} source vs {len(dev_target)} target lines"
)

print(f"Found a total of dev {len(dev_source)} samples!")

In [None]:
# Bible test files are laid out as <test_bible>/<src>-<tgt>/<src>-<tgt>.<language code>.
test_source_path = f"{os.environ['test_bible']}/{source_language}-{target_language}/{source_language}-{target_language}.{source_language}"
test_target_path = f"{os.environ['test_bible']}/{source_language}-{target_language}/{source_language}-{target_language}.{target_language}"

# Read inside `with` so the file handles are closed as soon as the lines are loaded, and decode
# explicitly instead of relying on the platform default (assumes the corpus is UTF-8 encoded).
# readlines() keeps each line's trailing newline.
with open(test_source_path, "r", encoding="utf-8") as test_source_handle:
    test_source = test_source_handle.readlines()
with open(test_target_path, "r", encoding="utf-8") as test_target_handle:
    test_target = test_target_handle.readlines()

# The two sides are paired by line number, so a length mismatch means the files are misaligned.
assert len(test_source) == len(test_target), (
    f"Test files are not line-aligned: {len(test_source)} source vs {len(test_target)} target lines"
)

print(f"Found a total of test (Bible) {len(test_source)} samples!")

In [None]:
# Report how many lines each split holds as loaded (no deduplication has been applied).
for split_label, split_lines in (
    ("Train set", source),
    ("Dev set", dev_source),
    ("Test set (bible)", test_source),
):
    print(f"{split_label}: {len(split_lines)} sentences")


In [None]:
# Put each split into a two-column DataFrame: row i pairs line i of the source file
# with line i of the target file.
import pandas as pd

df = pd.DataFrame({'source_sentence': source, 'target_sentence': target})
df_dev = pd.DataFrame({'source_sentence': dev_source, 'target_sentence': dev_target})
df_bible = pd.DataFrame({'source_sentence': test_source, 'target_sentence': test_target})

In [None]:
# This section does the split between train/dev/test for the parallel corpora then saves them as separate files
import csv

with open("train."+source_language, "w") as src_file, open("train."+target_language, "w") as trg_file:
  for index, row in df_pp.iterrows():
    src_file.write(row["source_sentence"])
    trg_file.write(row["target_sentence"])
    
with open("dev."+source_language, "w") as src_file, open("dev."+target_language, "w") as trg_file:
  for index, row in df_dev.iterrows():
    src_file.write(row["source_sentence"])
    trg_file.write(row["target_sentence"])
  
with open("test."+source_language, "w") as src_file, open("test."+target_language, "w") as trg_file:
  for index, row in df_bible.iterrows():
    src_file.write(row["source_sentence"])
    trg_file.write(row["target_sentence"])

# TODO: Doublecheck the format below. There should be no extra quotation marks or weird characters. It should also not be empty.
! head train.*
! head dev.*
! head test.*


In [None]:
# Tokenize all three splits with the sacremoses command-line tool.
! pip install sacremoses

# Raw text files for each split, named <split>.<language code>.
source_file = "train." + source_language
target_file = "train." + target_language

dev_source_file = "dev." + source_language
dev_target_file = "dev." + target_language

test_source_file = "test." + source_language
test_target_file = "test." + target_language

# Tokenized output goes next to each input, with a ".tok" suffix.
tok_source_file = source_file+".tok"
tok_target_file = target_file+".tok"

dev_tok_source_file = dev_source_file+".tok"
dev_tok_target_file = dev_target_file+".tok"

test_tok_source_file = test_source_file+".tok"
test_tok_target_file = test_target_file+".tok"

# In the `!` lines below, $name refers to the Python variable of that name (IPython substitutes it).
# Tokenize the training source
! sacremoses -l "$source_language" tokenize < "$source_file" > "$tok_source_file"
# Tokenize the training target
! sacremoses -l "$target_language" tokenize < "$target_file" > "$tok_target_file"

# Tokenize the dev source
! sacremoses -l "$source_language" tokenize < "$dev_source_file" > "$dev_tok_source_file"
# Tokenize the dev target
! sacremoses -l "$target_language" tokenize < "$dev_target_file" > "$dev_tok_target_file"

# Tokenize the test source
! sacremoses -l "$source_language" tokenize < "$test_source_file" > "$test_tok_source_file"
# Tokenize the test target
! sacremoses -l "$target_language" tokenize < "$test_target_file" > "$test_tok_target_file"



# Show the first lines of the raw and tokenized training files (the trailing * matches both).
! head "$source_file"*
! head "$target_file"*

# Same comparison for the dev files.
! head "$dev_source_file"*
! head "$dev_target_file"*

# Same comparison for the test files.
! head "$test_source_file"*
! head "$test_target_file"*


In [None]:
# Clone JoeyNMT and install it from the checked-out source, then install a pinned PyTorch build.
# NOTE(review): the clone is not pinned to a tag or commit, so the installed JoeyNMT version
# depends on when this cell runs — confirm it still matches the config format used below.
!git clone https://github.com/joeynmt/joeynmt.git
!cd joeynmt; pip3 install .
# torch is pinned to 1.8.0+cu101 (presumably the CUDA 10.1 wheel) from the PyTorch wheel index.
!pip install torch==1.8.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# One of the huge boosts in NMT performance was to use a different method of tokenizing. 
# Usually, NMT would tokenize by words. However, using a method called BPE gave amazing boosts to performance

# Do subword NMT
from os import path

# Learn BPEs on the training data.
os.environ["data_path"] = path.join("joeynmt", "data", source_language + target_language) # Herman! 

! subword-nmt learn-joint-bpe-and-vocab --input train.$src.tok train.$tgt.tok -s 32000 -o bpe.codes.32000 --write-vocabulary vocab.$src vocab.$tgt

# Apply BPE splits to the development and test data.
! subword-nmt apply-bpe -c bpe.codes.32000 --vocabulary vocab.$src < train.$src.tok > train.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.32000 --vocabulary vocab.$tgt < train.$tgt.tok > train.bpe.$tgt

! subword-nmt apply-bpe -c bpe.codes.32000 --vocabulary vocab.$src < dev.$src.tok > dev.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.32000 --vocabulary vocab.$tgt < dev.$tgt.tok > dev.bpe.$tgt

! subword-nmt apply-bpe -c bpe.codes.32000 --vocabulary vocab.$src < test.$src.tok > test.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.32000 --vocabulary vocab.$tgt < test.$tgt.tok > test.bpe.$tgt

# Create directory, move everyone we care about to the correct location
! mkdir -p "$data_path"
! cp train.* "$data_path"
! cp test.* "$data_path"
! cp dev.* "$data_path"
! cp bpe.codes.32000 "$data_path"
! ls "$data_path"

# Also move everything we care about to a mounted location in google drive (relevant if running in colab) at gdrive_path
! cp train.* "$gdrive_path"
! cp test.* "$gdrive_path"
! cp dev.* "$gdrive_path"
! cp bpe.codes.32000 "$gdrive_path"
! ls "$gdrive_path"

# Create that vocab using build_vocab
! sudo chmod 777 joeynmt/scripts/build_vocab.py
! joeynmt/scripts/build_vocab.py joeynmt/data/$src$tgt/train.bpe.$src joeynmt/data/$src$tgt/train.bpe.$tgt --output_path joeynmt/data/$src$tgt/vocab.txt

# Some output
! echo "BPE Test language Sentences"
! tail -n 5 test.bpe.$tgt
! echo "Combined BPE Vocab"
! tail -n 10 joeynmt/data/$src$tgt/vocab.txt  # Herman

In [None]:
# This cell writes the YAML config file for our JoeyNMT system. It might seem overwhelming, so the
# parameters you are most likely to change are grouped under data/testing/training/model below.
# (You can of course play with all the parameters if you'd like!)

# `name` is the concatenated language pair; it matches the joeynmt/data/<src><tgt> folder name
# and is part of the config file name and the model directory name.
name = '%s%s' % (source_language, target_language)
gdrive_path = os.environ["gdrive_path"]

# Build the config text. The {name}, {gdrive_path}, {source_language} and {target_language}
# placeholders (including those inside YAML comments) are filled in by .format() at the end.
config = """
name: "{name}_transformer"

data:
    src: "{source_language}"
    trg: "{target_language}"
    train: "data/{name}/train.bpe"
    dev:   "data/{name}/dev.bpe"
    test:  "data/{name}/test.bpe"  # change this to data/{name}/test2.bpe so that you can test it on Ted Talks
    level: "bpe"
    lowercase: False
    max_sent_length: 128
    src_vocab: "data/{name}/vocab.txt"
    trg_vocab: "data/{name}/vocab.txt"

testing:
    beam_size: 5
    alpha: 1.0
    sacrebleu:                      # sacrebleu options
        remove_whitespace: True     # `remove_whitespace` option in sacrebleu.corpus_chrf() function (defalut: True)
        tokenize: "none"            # `tokenize` option in sacrebleu.corpus_bleu() function (options include: "none" (use for already tokenized test data), "13a" (default minimal tokenizer), "intl" which mostly does punctuation and unicode, etc) 

training:
    #load_model: "{gdrive_path}/models/{name}_transformer/best.ckpt" # if uncommented, load a pre-trained model from this checkpoint
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.998] 
    scheduling: "plateau"           
    patience: 7                     
    learning_rate_factor: 0.5       
    learning_rate_warmup: 4000     
    decrease_factor: 0.7
    loss: "crossentropy"
    learning_rate: 0.0003          
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    batch_size: 4096
    batch_type: "token"
    eval_batch_size: 4096
    eval_batch_type: "token"
    batch_multiplier: 8
    early_stopping_metric: "ppl"
    epochs: 3000                     
    validation_freq: 500          
    logging_freq: 100
    eval_metric: "bleu"
    model_dir: "{gdrive_path}/models/{name}_transformer"
    overwrite: True              # TODO: Set to True if you want to overwrite possibly existing models. 
    shuffle: True
    use_cuda: True
    fp16: False
    max_output_length: 128
    print_valid_sents: [0, 1, 2, 3]
    keep_last_ckpts: 3

model:
    initializer: "xavier"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8             
        embeddings:
            embedding_dim: 512   
            scale: True
            dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 512         
        ff_size: 2048            
        dropout: 0.3
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8             
        embeddings:
            embedding_dim: 512    
            scale: True
            dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 512         
        ff_size: 2048           
        dropout: 0.3
""".format(name=name, gdrive_path=os.environ["gdrive_path"], source_language=source_language, target_language=target_language)
# Save the config inside the JoeyNMT checkout as configs/transformer_<src><tgt>.yaml.
with open("joeynmt/configs/transformer_{name}.yaml".format(name=name),'w') as f:
    f.write(config)

In [None]:
# Build and install NVIDIA apex from source with its C++ and CUDA extensions.
# This may take a few minutes to install but it will speed up your training a lot!
# NOTE(review): the config written in this notebook sets `fp16: False`; confirm apex is
# actually exercised by training before paying for this build.
!git clone https://github.com/NVIDIA/apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Train from inside the JoeyNMT checkout, using the config file named after the language pair.
!cd joeynmt; python -m joeynmt train configs/transformer_$src$tgt.yaml


In [None]:
# Print validations.txt from the model directory on Drive
# (presumably the per-validation scores logged during training).
! cat "$gdrive_path/models/${src}${tgt}_transformer/validations.txt"

In [None]:
# Run JoeyNMT in test mode, using the config.yaml stored in the model directory on Drive.
! cd joeynmt; python3 -m joeynmt test "$gdrive_path/models/${src}${tgt}_transformer/config.yaml"