In [None]:
# Environment setup: install the Python dependencies and fetch the MNLI data.
# The PyTorch stack (torchtext / torch / torchvision / torchaudio) is pinned
# to exact versions.
!pip install torchtext==0.8.0
!pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2
# ftfy, transformers and sentencepiece are installed without a version pin,
# so whatever release is current at run time is used.
!pip install ftfy --quiet
!pip install transformers --quiet 
!pip install sentencepiece --quiet
# Download the GLUE MNLI archive into the current working directory.
! wget https://dl.fbaipublicfiles.com/glue/data/MNLI.zip
# Unpack the archive.
# NOTE(review): the absolute path assumes the working directory is /content
# (presumably a Colab runtime) - confirm before running elsewhere.
! unzip /content/MNLI.zip

In [None]:
import pandas as pd
import spacy
from spacy.lang.en import English
from tqdm.notebook import tqdm
from transformers import MarianMTModel, MarianTokenizer
from ftfy import fix_encoding
import ftfy
import re

import torch
# Select the compute device: the first CUDA GPU when one is visible,
# otherwise the CPU.
if torch.cuda.is_available():
    dev = "cuda:0"
    # The GPU name may only be queried on this branch:
    # torch.cuda.get_device_name raises when no CUDA device is present,
    # which previously made the CPU fallback unusable.
    print(dev, torch.cuda.get_device_name(0))
else:
    dev = "cpu"
    print(dev)
device = torch.device(dev)

# Model
# Identifier of the pretrained Marian checkpoint used for translation.
# NOTE(review): judging by its name this is a multi-target en->Romance model;
# the target language is chosen elsewhere in this notebook by prefixing each
# sentence with the '>>pt_br<<' token - confirm against the model card.
model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
# Tokenizer and model are both loaded from the same checkpoint name.
marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
marian_model = MarianMTModel.from_pretrained(model_name)

In [3]:
# Blank English pipeline whose only component is the sentence splitter
# used to segment text before translation.
nlp = English()
# spaCy is not pinned by this notebook, so support both major APIs:
# spaCy 3+ adds built-in components by their registered string name and
# rejects component objects, while spaCy 2 needs the object built by
# create_pipe.
if int(spacy.__version__.split('.')[0]) >= 3:
    nlp.add_pipe('sentencizer')
else:
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
def chunkstring_spacy(text):
    """
    Split a text into sentences and tag each one for translation.

    Args:
      text: Text to segment; it is converted with ``str`` before parsing.

    Returns:
      List with one string per detected sentence, each prefixed with the
      ``>>pt_br<<`` target-language token followed by a space.
    """
    language_tag = '>>pt_br<<'
    parsed = nlp(str(text))
    return [language_tag + ' ' + sentence.text for sentence in parsed.sents]

def translate(aux_sent):
    """
    Translate text with the Marian model.

    The text is segmented by ``chunkstring_spacy`` (which also prepends the
    ``>>pt_br<<`` target-language token to every sentence), all sentences
    are translated as a single batch, and the translated sentences are
    joined with single spaces.

    Args:
      aux_sent: Text to be translated.

    Returns:
      Translated text as one string. An empty string is returned when the
      segmenter yields no sentences (for example for empty input).
    """
    max_length = 512
    num_beams = 1

    sentences = chunkstring_spacy(aux_sent)
    if not sentences:
        # Nothing to translate; the tokenizer cannot encode an empty batch.
        return ''

    # Move the model to the selected device and switch to inference mode.
    # Both calls are idempotent, so repeating them on every call is harmless.
    marian_model.to(device)
    marian_model.eval()

    # Call the tokenizer directly: prepare_seq2seq_batch is deprecated in
    # transformers 4.x. padding/truncation reproduce its defaults.
    encoded = marian_tokenizer(sentences,
                               max_length=max_length,
                               padding='longest',
                               truncation=True,
                               return_tensors='pt')

    # Pass the attention mask explicitly so padded positions in shorter
    # sentences of the batch are ignored by the encoder.
    translated = marian_model.generate(input_ids=encoded['input_ids'].to(device),
                                       attention_mask=encoded['attention_mask'].to(device),
                                       max_length=max_length,
                                       num_beams=num_beams,
                                       early_stopping=True,
                                       do_sample=False)

    # Decode every generated sequence and repair mojibake in the output.
    tgt_text = [fix_encoding(marian_tokenizer.decode(t, skip_special_tokens=True)) for t in translated]
    return ' '.join(tgt_text)

def _translate_field(value):
    """Translate one sentence cell; missing values become an empty string."""
    if pd.isna(value):
        return ''
    return translate(str(value))


def MNLI_translate(input, output_path='/content/MNLI-translated.csv'):
    """
    Translate the sentence pairs of an MNLI TSV file and save them as CSV.

    ``sentence1`` and ``sentence2`` are translated independently with
    ``translate``; every other column is copied through unchanged. Each
    input row produces exactly one output row.

    Args:
      input: Path of the tab-separated MNLI file to translate. It must
        contain ``sentence1`` and ``sentence2`` columns.
      output_path: Destination of the translated CSV. Defaults to
        ``/content/MNLI-translated.csv``.

    Returns:
      None. The translated dataset is written to ``output_path``.

    Raises:
      KeyError: If the file has no ``sentence1`` or ``sentence2`` column.
    """
    import csv  # local import: only needed for the quoting constant

    # QUOTE_NONE keeps literal double quotes inside sentences from being
    # interpreted as CSV field delimiters, which would merge or drop rows.
    read_kwargs = dict(sep='\t', header=0, quoting=csv.QUOTE_NONE)
    try:
        # pandas >= 1.3: skip malformed lines and warn about them.
        df = pd.read_csv(input, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # Older pandas does not know on_bad_lines; use the legacy flag.
        df = pd.read_csv(input, error_bad_lines=False, **read_kwargs)
    print('Translating MNLI ...')

    premises = df['sentence1'].tolist()
    hypotheses = df['sentence2'].tolist()

    translated_premises = []
    translated_hypotheses = []
    # Translate premise and hypothesis separately. Concatenating them and
    # splitting the translation on '. ' mixes the two up whenever the
    # premise has more than one sentence and loses rows when the split fails.
    for sent1, sent2 in zip(tqdm(premises), hypotheses):
        translated_premises.append(_translate_field(sent1))
        translated_hypotheses.append(_translate_field(sent2))

    # Replace only the two sentence columns; column order and all remaining
    # columns are kept exactly as read.
    df_final = df.assign(sentence1=translated_premises,
                         sentence2=translated_hypotheses)

    df_final.to_csv(output_path)

### Run code

In [5]:
# Path of the MNLI training split to translate.
# NOTE(review): the name `input` shadows Python's built-in input() for the
# rest of the kernel session - consider renaming once no other cell uses it.
input = '/content/MNLI/train.tsv'
# Translate the file; the result is written to disk by MNLI_translate.
MNLI_translate(input)