In [6]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the seq2seq model.
model_name = "unicamp-dl/translation-pt-en-t5"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights first, then move them to the GPU as a separate step.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to("cuda")

# Last expression of the cell: displays the module tree.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [30]:
from transformers import pipeline

task = "text2text-generation"

# Hand the pipeline the `model` object that is already loaded on the GPU.
# Passing the checkpoint name instead makes `pipeline` load a second copy of
# the same weights, leaving the first copy unused in GPU memory.
translator = pipeline(
    task,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2048,  # upper bound on generated tokens per call
    device="cuda",
)

translator

<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline at 0x7c69a85c5fd0>

In [42]:
# Single-sentence smoke test of the pipeline before running it over the dataset.
sample_prompt = "translate Portuguese to English: Nada de chegar o meu pedido."
translator(sample_prompt)

[{'generated_text': "I don't have to arrive at my request."}]

In [9]:
import os

import polars as pl

from minio import Minio

# Credentials are read from the environment so that no secret is stored in the
# notebook. MINIO_ACCESS_KEY and MINIO_SECRET_KEY must be set before running
# this cell; a missing variable raises KeyError.
client = Minio(
    endpoint=os.environ.get("MINIO_ENDPOINT", "storage.io"),
    access_key=os.environ["MINIO_ACCESS_KEY"],
    secret_key=os.environ["MINIO_SECRET_KEY"],
    secure=False,  # plain HTTP, unchanged from the previous configuration
)

response = client.get_object("ecommerce", "raw/order_reviews.csv")
try:
    df = pl.read_csv(response.read())
finally:
    # Always hand the HTTP connection back, even if reading or parsing fails.
    response.close()
    response.release_conn()

df

review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
str,str,i64,str,str,str,str
"""7bc2406110b926393aa56f80a40eba…","""73fc7af87114b39712e6da79b0a377…",4,,,"""2018-01-18 00:00:00""","""2018-01-18 21:46:59"""
"""80e641a11e56f04c1ad469d5645fdf…","""a548910a1c6147796b98fdf73dbeba…",5,,,"""2018-03-10 00:00:00""","""2018-03-11 03:05:13"""
"""228ce5500dc1d8e020d8d1322874b6…","""f9e4b658b201a9f2ecdecbb34bed03…",5,,,"""2018-02-17 00:00:00""","""2018-02-18 14:36:24"""
"""e64fb393e7b32834bb789ff8bb3075…","""658677c97b385a9be170737859d351…",5,,"""Recebi bem antes do prazo esti…","""2017-04-21 00:00:00""","""2017-04-21 22:02:06"""
"""f7c4243c7fe1938f181bec41a392bd…","""8e6bfb81e283fa7e4f11123a3fb894…",5,,"""Parabéns lojas lannister adore…","""2018-03-01 00:00:00""","""2018-03-02 10:26:53"""
…,…,…,…,…,…,…
"""574ed12dd733e5fa530cfd4bbf39d7…","""2a8c23fee101d4d5662fa670396eb8…",5,,,"""2018-07-07 00:00:00""","""2018-07-14 17:18:30"""
"""f3897127253a9592a73be9bdfdf4ed…","""22ec9f0669f784db00fa86d035cf86…",5,,,"""2017-12-09 00:00:00""","""2017-12-11 20:06:42"""
"""b3de70c89b1510c4cd3d0649fd3024…","""55d4004744368f5571d1f590031933…",5,,"""Excelente mochila, entrega sup…","""2018-03-22 00:00:00""","""2018-03-23 09:10:43"""
"""1adeb9d84d72fe4e337617733eb851…","""7725825d039fc1f0ceb7635e3f7d92…",4,,,"""2018-07-01 00:00:00""","""2018-07-02 12:59:13"""


In [10]:
from datasets import load_dataset

# Write the first 200 reviews to a CSV file, then load that file back as a
# Hugging Face dataset.
sample_csv_path = "/tmp/order_reviews.head.csv"
review_sample = df.head(200)
review_sample.write_csv(sample_csv_path)

dataset = load_dataset("csv", data_files=sample_csv_path)

dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['review_id', 'order_id', 'review_score', 'review_comment_title', 'review_comment_message', 'review_creation_date', 'review_answer_timestamp'],
        num_rows: 200
    })
})

In [43]:
def _translate_text(text, translate_fn):
    """Return the translation of ``text``, or ``text`` itself when there is nothing to translate."""
    # Falsy values (None, "") pass through unchanged, exactly as before.
    # Whitespace-only strings are skipped as well, so the model is never
    # prompted with blank input.
    if not text or (isinstance(text, str) and not text.strip()):
        return text
    prompt = "translate Portuguese to English: {text}".format(text=text)
    return translate_fn(prompt)[0]["generated_text"]


def translate(row, translate_fn=None):
    """Add English translations of a review's title and message to ``row``.

    Args:
        row: Mapping with the keys ``review_comment_title`` and
            ``review_comment_message``.
        translate_fn: Optional callable that takes a prompt string and returns
            a list whose first item is a dict with a ``generated_text`` key.
            Defaults to the notebook-level ``translator``.

    Returns:
        The same ``row`` object, with ``translated_review_comment_title`` and
        ``translated_review_comment_message`` set. Missing, empty, or
        whitespace-only inputs are copied through without calling the model.
    """
    if translate_fn is None:
        # Resolved at call time so the function can be defined before the
        # pipeline exists.
        translate_fn = translator

    row["translated_review_comment_title"] = _translate_text(
        row["review_comment_title"], translate_fn
    )
    row["translated_review_comment_message"] = _translate_text(
        row["review_comment_message"], translate_fn
    )

    return row

In [44]:
# Run `translate` over every example of the train split.
train_split = dataset["train"]
translated_dataset = train_split.map(translate)

translated_dataset

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['review_id', 'order_id', 'review_score', 'review_comment_title', 'review_comment_message', 'review_creation_date', 'review_answer_timestamp', 'translated_review_comment_title', 'translated_review_comment_message'],
    num_rows: 200
})

In [45]:
# Raise the string display limit to 2000 so long reviews are shown in full.
pl.Config.set_fmt_str_lengths(2000)

tdf = pl.DataFrame(translated_dataset.to_pandas())

# Show the original message next to its translation, keeping only rows where
# both are present.
message_columns = ["review_comment_message", "translated_review_comment_message"]
tdf.select(message_columns).drop_nulls()

review_comment_message,translated_review_comment_message
str,str
"""Recebi bem antes do prazo estipulado.""","""I received it well before the stipulated deadline.."""
"""Parabéns lojas lannister adorei comprar pela Internet seguro e prático Parabéns a todos feliz Páscoa""","""Congratulations to lannister stores I loved to buy the Internet safe and practical Congratulations to all happy Easter."""
"""aparelho eficiente. no site a marca do aparelho esta impresso como 3desinfector e ao chegar esta com outro nome...atualizar com a marca correta uma vez que é o mesmo aparelho""","""the website the brand of the device is printed as 3 disinfector and arrived with another name... update with the correct brand since it is the same device."""
"""Mas um pouco ,travando...pelo valor ta Boa. ""","""But a little , locking...by the value ta Good.."""
"""Vendedor confiável, produto ok e entrega antes do prazo.""","""Reliable seller, product ok and delivery before deadline.."""
"""GOSTARIA DE SABER O QUE HOUVE, SEMPRE RECEBI E ESSA COMPRA AGORA ME DECPCIONOU""","""I would like to know what I have, If I have any request, and I would get to know what I have now."""
"""Péssimo""","""Very difficult."""
"""Loja nota 10""","""Store note 10."""
"""obrigado pela atençao amim dispensada""","""thank you for the attention given again."""
"""A compra foi realizada facilmente. A entrega foi efetuada muito antes do prazo dado. O produto já começou a ser usado e até o presente, sem problemas.""","""The purchase was easily made. The delivery was made long before the deadline. The product has already begun to be used and until now, it has been all the problems."""
