In [None]:
# Environment setup: run once per fresh runtime before any other cell.
# torchtext, torch, torchvision and torchaudio are pinned to exact versions.
!pip install torchtext==0.8.0
!pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2
# NOTE(review): ftfy, transformers and sentencepiece are installed without a
# version pin, so the resolved versions may differ between runs - consider pinning.
!pip install ftfy --quiet
!pip install transformers --quiet 
!pip install sentencepiece --quiet
# Clone the repository; the run cell reads its dataset CSV from
# /content/cross-lingual-analysis/data/.
!git clone https://github.com/unicamp-dl/cross-lingual-analysis.git

In [None]:
import pandas as pd
import spacy
from spacy.lang.en import English
from tqdm.notebook import tqdm
from transformers import MarianMTModel, MarianTokenizer
from ftfy import fix_encoding
import ftfy

import torch
# Select the compute device: the first CUDA GPU when one is usable, else the CPU.
if torch.cuda.is_available():
    dev = "cuda:0"
    # Query the GPU name only on this branch: torch.cuda.get_device_name raises
    # when CUDA is unavailable, which would break the CPU fallback below.
    print(dev, torch.cuda.get_device_name(0))
else:
    dev = "cpu"
    print(dev)
device = torch.device(dev)

# Model
# Hugging Face model identifier passed to both from_pretrained calls below.
model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'
# Tokenizer and translation model for that identifier; translate() reads both
# as module-level globals.
marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
marian_model = MarianMTModel.from_pretrained(model_name)

In [6]:
# Blank English pipeline that only needs rule-based sentence splitting.
nlp = English()
# spaCy v3 changed Language.add_pipe to take the registered factory name and
# rejects component objects; spaCy v2 only accepts the object built by
# create_pipe. Pick the call form that matches the installed major version.
if int(spacy.__version__.split('.')[0]) >= 3:
    nlp.add_pipe('sentencizer')
else:
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
def chunkstring_spacy(text):
    """
    Split a text into sentences and tag each one for the translation model.

    Args:
      text: Text to segment; it is converted with str() before processing.

    Returns:
      List of strings, one per detected sentence, each prefixed with '>>en<< '.
    """
    target_prefix = '>>en<<' + ' '
    parsed = nlp(str(text))
    return [target_prefix + sentence.text for sentence in parsed.sents]

def translate(aux_sent):
    """
    Translate a text to English with the Marian model.

    The text is split into sentences by chunkstring_spacy, the sentences are
    translated together as one padded batch, and the decoded outputs are
    joined with single spaces.

    Args:
      aux_sent: Text to be translated.

    Returns:
      Translated text, or an empty string when no sentence is found in the
      input (e.g. an empty string).
    """
    max_length = 512
    num_beams = 1

    sentence = chunkstring_spacy(aux_sent)
    if not sentence:
        # Nothing to translate; avoid handing an empty batch to the tokenizer.
        return ''

    # Move the model to the selected device and switch it to inference mode.
    marian_model.to(device)
    marian_model.eval()

    # Call the tokenizer directly instead of the deprecated
    # prepare_seq2seq_batch; padding and truncation are spelled out so the
    # batch is still padded to its longest sentence and cut at max_length.
    tokenized_text = marian_tokenizer(sentence,
                                      max_length=max_length,
                                      padding='longest',
                                      truncation=True,
                                      return_tensors='pt')

    # Pass the attention mask together with the ids so the padded positions of
    # the shorter sentences in the batch are ignored by the model.
    translated = marian_model.generate(input_ids=tokenized_text['input_ids'].to(device),
                                        attention_mask=tokenized_text['attention_mask'].to(device),
                                        max_length=max_length,
                                        num_beams=num_beams,
                                        early_stopping=True,
                                        do_sample=False)

    # Decode every generated sequence and repair mojibake with ftfy.
    tgt_text = [fix_encoding(marian_tokenizer.decode(t, skip_special_tokens=True)) for t in translated]
    return ' '.join(tgt_text)

def translate_assin(input, output='/content/assin2-translated.csv'):
    """
    Translate the ASSIN2 test set to English and save it as CSV.

    Each 't'/'h' pair is translated in a single call as "<t>. <h>". The
    translated text then has every '.' and '-' removed and is split on a
    double space; the first two parts become the new 't' and 'h'. Pairs whose
    translation does not split into at least two parts are left out of the
    output, and the number of such pairs is printed at the end.

    Args:
      input: Path of the CSV to translate. It must contain the columns
        't', 'h', '_entailment', '_id' and '_similarity'.
      output: Path of the CSV file to write.

    Returns:
      None. The translated dataset is written to `output`.
    """
    print('Translating ASSIN2 ...')
    df = pd.read_csv(input, encoding='utf-8')

    out_columns = ['t', 'h', '_entailment', '_id', '_similarity']
    rows = zip(tqdm(df['t'].tolist()),
               df['h'].tolist(),
               df['_entailment'].tolist(),
               df['_id'].tolist(),
               df['_similarity'].tolist())

    kept = []         # (t, h, label, id, similarity) tuples that were split successfully
    skipped_ids = []  # ids of the pairs that could not be split back in two
    for sent1, sent2, label, pair_id, similarity in rows:
        translated = translate(str(sent1) + '. ' + str(sent2))
        # Dropping '.' and '-' is what leaves the double space used as the
        # separator between the two translated sentences.
        parts = translated.replace('.', '').replace('-', '').split('  ')
        if len(parts) < 2:
            # Explicit check instead of a bare except: only the "no separator
            # found" case is skipped, and it is recorded rather than hidden.
            skipped_ids.append(pair_id)
            continue
        kept.append((parts[0], parts[1], label, pair_id, similarity))

    # Start from the input's column layout so the output keeps the same columns.
    df_final = pd.DataFrame(columns=df.columns)
    for position, column in enumerate(out_columns):
        df_final[column] = [row[position] for row in kept]

    if skipped_ids:
        print('Skipped {} of {} pairs whose translation could not be split: {}'.format(
            len(skipped_ids), len(df), skipped_ids))

    df_final.to_csv(output)

### Run code

In [8]:
# Path of the ASSIN2 test CSV; presumably inside the repository cloned by the
# setup cell (assumes the notebook runs from /content - TODO confirm).
# NOTE(review): this name shadows the built-in input() for the rest of the session.
input = '/content/cross-lingual-analysis/data/assin2-test.csv'
# Translate the whole file; the result is written to /content/assin2-translated.csv.
translate_assin(input)