In [1]:
! nvidia-smi

Tue May  7 19:52:31 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.56       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    28W /  70W |   1177MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [2]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

success!


In [0]:
from progressbar import ProgressBar, Percentage, Bar
from google.colab import files
import torch, pickle, os, sys, random, time, math, copy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from queue import PriorityQueue

In [4]:
########################### ENTIRE DATASET (WITH BPE) ###########################
german = drive.CreateFile({'id': '1KNeZ_WQPUudwlJsg5_kUqPlLjJGBynsn'})
german.GetContentFile('./german_bpe.pickle') 
with open('./german_bpe.pickle', 'rb') as f_in:
    german = pickle.load(f_in)
    
english = drive.CreateFile({'id': '1ObXLocsuZsVGH3MVxSUdDBpiMlgBqVP7'})
english.GetContentFile('./english_bpe.pickle') 
with open('./english_bpe.pickle', 'rb') as f_in:
    english = pickle.load(f_in)
    
training_data = [[german['train'][i], english['train'][i]] for i in range(len(german['train']))]
validation_data = [[german['dev'][i], english['dev'][i]] for i in range(len(german['dev']))]

max_len_train = len(max(german['train'], key=len))
max_len_valid = len(max(german['dev'], key=len))

print(max_len_train, max_len_valid)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

906 281
cuda:0


In [0]:
def to_padded_tensor(batch):
    max_len_src = max([len(sent['source']) for sent in batch])  
    max_len_trg = max([len(sent['target']) for sent in batch])
    source_lengths = torch.zeros(len(batch), dtype=torch.int64).cpu()

    for i, sent in enumerate(batch):
        source_lengths[i] = len(sent['source'])
        dif_src = max_len_src - len(sent['source'])
        dif_trg = max_len_trg - len(sent['target'])

        if dif_src > 0:
            pad_list_src = [0 for d in range(dif_src)]
            sent['source'].extend(pad_list_src)

        if dif_trg > 0:
            pad_list_trg = [0 for d in range(dif_trg)]
            sent['target'].extend(pad_list_trg)       
    
    source_sent_len = max_len_src
    target_sent_len = max_len_trg
    
    batch_size = len(batch)
    
    source = torch.empty((source_sent_len, batch_size)).long().cpu()    
    target = torch.empty((target_sent_len, batch_size)).long().cpu()
    
#     print(batch[0]['target'])
    
    for i in range(len(batch)):
        source[:,i] = torch.tensor(batch[i]['source'])
        target[:,i] = torch.tensor(batch[i]['target'])
        
    padded_tensor = {"source": source.to(device),
                    "target": target.to(device),
                    "srclen": source_lengths}
    
    return padded_tensor

In [0]:
def bake_batches(de, en, batch_size=1300, min_len=3, max_len=768, bucket_step=3):
    german = copy.deepcopy(de)
    english = copy.deepcopy(en)
    
    buckets = [[] for i in range(0, max_len, bucket_step)]
    bucket_lengths = [0 for i in buckets]
    batches = []
    
    # For every sentence in the dataset, find its corresponding bucket and put it in there, once the bucket
    # hits the batch size, ship it off to the batches list
    for i in range(len(german)):
        sent2sent = {"source": german[i],
                     "target": english[i]}
        
        # calculate the index of the buckets to put the sentence into, = len(Sentence) // Bucket_step - 1
        idx = len(sent2sent['source'])//bucket_step - 1 

        if bucket_lengths[idx] + len(sent2sent['source']) > batch_size:
            sorted_bucket = sorted(buckets[idx], key=lambda x: len(x['source']), reverse=True)
            batches.append(to_padded_tensor(sorted_bucket))
            del buckets[idx][:]
            buckets[idx].append(sent2sent)
            bucket_lengths[idx] = len(sent2sent['source'])
        else:
            buckets[idx].append(sent2sent)
            bucket_lengths[idx] += len(sent2sent['source'])
            

    # for any remaining buckets that did not get sent off, send them off to batches
    for b in buckets:
        if b: # if the list has any value in it
            sorted_bucket = sorted(b, key=lambda x: len(x['source']), reverse=True)
            batches.append(to_padded_tensor(sorted_bucket))
            del b[:]
    
    return batches

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, num_layers=2, bidirectional = True, dropout=0.5)
        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_len):
        
        #src = [src sent len, batch size]
        #src_len = [src sent len]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src sent len, batch size, emb dim]
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len)
        
        packed_outputs, hidden = self.rnn(packed_embedded)
                     
        #packed_outputs is a packed sequence containing all hidden states
        #hidden is now from the final non-padded element in the batch
            
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs) 
            
        #outputs is now a non-packed sequence, all hidden states obtained
        #  when the input is a pad token are all zeros
            
        #outputs = [sent len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
#         print(hidden.shape)
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [sent len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden

In [0]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))
        
    def forward(self, hidden, encoder_outputs, mask):
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
        #mask = [batch size, src sent len]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat encoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src sent len, dec hid dim]
        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src sent len, dec hid dim]
                
        energy = energy.permute(0, 2, 1)
        
        #energy = [batch size, dec hid dim, src sent len]
        
        #v = [dec hid dim]
        
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        
        #v = [batch size, 1, dec hid dim]
            
        attention = torch.bmm(v, energy).squeeze(1)
        
        #attention = [batch size, src sent len]
        
        attention = attention.masked_fill(mask == 0, -1e10)
        
        return F.softmax(attention, dim = 1)

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)#, num_layers=2, dropout=0.3)
        
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs, mask):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
        #mask = [batch size, src sent len]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs, mask)
                
        #a = [batch size, src sent len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src sent len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]
        
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
#         print(hidden.shape)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [sent len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #sent len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        output = self.out(torch.cat((output, weighted, embedded), dim = 1))
        
        #output = [bsz, output dim]
        
        return output, hidden.squeeze(0), a.squeeze(1)

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, sos_idx, eos_idx, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.device = device
        
    def create_mask(self, src):
        mask = (src != self.pad_idx).permute(1, 0)
        return mask
        
    def forward(self, src, src_len, trg, teacher_forcing_ratio = 1):
        
        #src = [src sent len, batch size]
        #src_len = [batch size]
        #trg = [trg sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        if trg is None:
            assert teacher_forcing_ratio == 0, "Must be zero during inference"
            inference = True
            trg = torch.zeros((100, src.shape[1])).long().fill_(self.sos_idx).to(src.device)
        else:
            inference = False
            
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #tensor to store attention
        attentions = torch.zeros(max_len, batch_size, src.shape[0]).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)
                
        #first input to the decoder is the <sos> tokens
        output = trg[0,:]
        
        mask = self.create_mask(src)
#         print(hidden.shape)
                
        #mask = [batch size, src sent len]
                
        for t in range(1, max_len):
            output, hidden, attention = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            attentions[t] = attention
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)
            if inference and output.item() == self.eos_idx:
                return outputs[:t], attentions[:t]
            
        return outputs, attentions

In [0]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch['source']
        src_len = batch['srclen']
        trg = batch['target']
        
        optimizer.zero_grad()
        
        output, attetion = model(src, src_len, trg)
        if i%(len(iterator)//4) == 0: 
            print( "batch:", i)#, ",", torch.cuda.memory_allocated(device)/1e6, "MB used") 
        
        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            
            src = batch['source']
            src_len = batch['srclen']
            trg = batch['target']

            output, attention = model(src, src_len, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [17]:
def average_sentence_size(german, desired_batch_size=128):
    summation = 0
    for i in german['train']:
        summation += len(i)
    avg_tok_sent = summation/len(german['train'])
    
    print("To get an average batch size of", desired_batch_size, "Use a batch_size value of:", int(desired_batch_size*avg_tok_sent))
    
average_sentence_size(german, desired_batch_size=100)

To get an average batch size of 100 Use a batch_size value of: 2717


In [0]:
def write_to_drive(filename):
    from googleapiclient.discovery import build
    drive_service = build('drive', 'v3')

    from googleapiclient.http import MediaFileUpload

    file_metadata = {
      'name': filename
    }
    media = MediaFileUpload(filename, 
                            resumable=True)
    created = drive_service.files().create(body=file_metadata,
                                           media_body=media,
                                           fields='id').execute()
    print('File ID: {}'.format(created.get('id')))

In [21]:
N_EPOCHS = 30
CLIP = 1

INPUT_DIM = len(german['idx2word'])
OUTPUT_DIM = len(english['idx2word'])
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
PAD_IDX = 0
SOS_IDX = 2
EOS_IDX = 3

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')

model.apply(init_weights)

criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)
optimizer = optim.Adam(model.parameters())

best_train_loss = float('inf')
bad_epoch_cnt = 0

results = {"hyperparams": (ENC_EMB_DIM, DEC_HID_DIM),
           "train_loss": [],
           "valid_loss": [],
           "train_ppl": [],
           "valid_ppl": []}

valid_batches = bake_batches(german['dev'], english['dev'], batch_size=2400, max_len=max_len_valid)

for epoch in range(N_EPOCHS):
    random.shuffle(training_data)

    de_shuffled_td = [td[0] for td in training_data]
    en_shuffled_td = [td[1] for td in training_data]

    print("Baking batches...", end= " ")
    train_batches = bake_batches(de_shuffled_td, en_shuffled_td, batch_size=2800, max_len=max_len_train)
    print("Done.")
    
    start_time = time.time()

    train_loss = train(model, train_batches, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_batches, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print("Validation loss has not improved in", bad_epoch_cnt, "epochs")        
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    
    # Save the last epoch over the checkpoint
    checkpoint = {}
    checkpoint['state_dict'] = model.state_dict()
    checkpoint['optimizer'] = optimizer.state_dict()
    checkpoint['epoch'] = epoch
    file_name = 'last_checkpoint.pth'        
    torch.save(checkpoint, file_name)
    write_to_drive(file_name)
    
    results['train_loss'].append(train_loss)
    results['valid_loss'].append(valid_loss)
    results['train_ppl'].append(math.exp(train_loss))
    results['valid_ppl'].append(math.exp(valid_loss))
    

The model has 37,792,291 trainable parameters
Baking batches... Done.
batch: 0
batch: 1
batch: 2
batch: 3
Validation loss has not improved in 0 epochs
Epoch: 01 | Time: 0m 24s
	Train Loss: 9.346 | Train PPL: 11455.372
	 Val. Loss: 9.264 |  Val. PPL: 10546.362
File ID: 1cr8aALzebBlNSTHv7DI-_xGrY9tBwZfm
Baking batches... Done.
batch: 0
batch: 1
batch: 2
batch: 3
Validation loss has not improved in 0 epochs
Epoch: 02 | Time: 0m 25s
	Train Loss: 7.942 | Train PPL: 2813.712
	 Val. Loss: 12.040 |  Val. PPL: 169388.496
File ID: 1VWq_Mr5ugtUPDR2qFi5webKi_Qr1QRKl


In [0]:
target_file = drive.CreateFile({'id': '115tf274P47ZyBj1EdfeOpPWIK6hDwCnl'})
target_file.GetContentFile('./target.out') 
    
source_file = drive.CreateFile({'id': '1gAvrzv6I6Y6Qr2j1Oah4lz4eIgHdiVW7'})
source_file.GetContentFile('./source.out') 
    
def sample_net(net, test_de):    
    preds = []
    
    with torch.no_grad():
        for i, sent in enumerate(test_de):
            src = torch.LongTensor(sent).unsqueeze(1).to(device) 
            src_len = torch.LongTensor([len(sent)])
    
            output, _ = net(src, src_len, None, 0)
 
            pred = torch.argmax(output, dim=2)            
            preds.append(pred)

    return preds
   
# INPUT_DIM = len(german['idx2word'])
# OUTPUT_DIM = len(english['idx2word'])
# ENC_EMB_DIM = 256
# DEC_EMB_DIM = 256
# ENC_HID_DIM = 512
# DEC_HID_DIM = 512
# ENC_DROPOUT = 0.3
# DEC_DROPOUT = 0.3
# PAD_IDX = 0
# SOS_IDX = 2
# EOS_IDX = 3

# attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
# enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
# dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

# model = Seq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)
# model.load_state_dict(torch.load("packed-padded-a.pt"))

predictions = sample_net(model, german['dev'])

In [0]:
def print_samples(predictions):
    preds_tokenized = []  
    
    for pred in predictions:
        pred_sent = []
        w=1
        while w < len(pred) and pred[w].item() != 3:
            idx = pred[w].item()
            pred_sent.append(english['idx2word'][idx])
            w+=1
        preds_tokenized.append(pred_sent)
    
    with open('pred.out', 'w') as f:
        for sent in preds_tokenized:
            y = " ".join(sent)
            f.write(y + '\n')
            
print_samples(predictions)

In [0]:
!sed -r -i 's/(@@ )|(@@ ?$)//g' pred.out

In [0]:
!git clone https://github.com/moses-smt/mosesdecoder.git
!pip install sacrebleu


fatal: destination path 'mosesdecoder' already exists and is not an empty directory.


In [0]:
%%shell
# !/bin/bash

# This is a reference to the gold translations from the dev set
REFERENCE_FILE="target.out"

# XXX: Change the following line to point to your model's output!
TRANSLATED_FILE="pred.out"

# The model output is expected to be in a tokenized form. Note, that if you
# tokenized your inputs to the model, then simply joined each output token with
# whitespace you should get tokenized outputs from your model.
# i.e. each output token is separate by whitespace
# e.g. "My model 's output is interesting ."
perl "mosesdecoder/scripts/tokenizer/detokenizer.perl" -l en < "$TRANSLATED_FILE" > "$TRANSLATED_FILE.detok"

PARAMS=("-tok" "intl" "-l" "de-en" "$REFERENCE_FILE")
sacrebleu "${PARAMS[@]}" < "$TRANSLATED_FILE.detok"

Detokenizer Version $Revision: 4134 $
Language: en
BLEU+case.mixed+lang.de-en+numrefs.1+smooth.exp+tok.intl+version.1.3.2 = 19.4 52.0/25.1/13.8/7.9 (BP = 1.000 ratio = 1.015 hyp_len = 163383 ref_len = 160956)




In [0]:
torch.save(model.state_dict(), 'model-bleu-17.5.pt')
