In [None]:
!pip install transformers[sentencepiece] datasets evaluate sacremoses

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm
import os

In [None]:
!pip install gdown --upgrade --no-cache-dir

In [None]:
!gdown https://drive.google.com/uc?id=1mO1IjTrRCcpELam4LjHybP4vTlCIQtKy

In [None]:
# Read the parallel-text CSV into a DataFrame and preview its first rows.
# The `source_text` and `dest_text` columns are consumed further down.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/kaggle-22m-enfr/kaggle_22mil.csv",
)
df.head()

In [None]:
# Pull the two text columns out as plain Python lists:
#   input_texts      <- dest_text   (fed to the translation model below)
#   original_english <- source_text (kept as the label-0 examples)
# NOTE(review): presumably dest_text holds the French side — confirm against the dataset.
input_texts, original_english = (
    df[column].values.tolist() for column in ("dest_text", "source_text")
)

In [None]:
# Number of texts tokenized and translated together in one generate() call.
BATCH_SIZE: int = 32

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the tokenizer first, then the seq2seq model, from the same
# Helsinki-NLP French->English checkpoint.
tokenizer, model = (
    loader.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

In [None]:
# Place the model's weights on the selected device; Module.to returns the
# module itself, so rebinding keeps `model` pointing at the same object.
model = model.to(device)
print("done")

In [None]:
# Translate `input_texts` in batches of BATCH_SIZE and build a labelled
# dataset in `infos`:
#   label 0 -> the corresponding entry of `original_english`
#   label 1 -> the text generated by the model for that batch
# Ceiling division so a final, shorter batch is still processed.
iterations = (len(input_texts) + BATCH_SIZE - 1) // BATCH_SIZE

infos = []

# `batch_idx` (not `iter`) so the builtin `iter` is not shadowed and left
# rebound to an int in the kernel after the loop finishes.
for batch_idx in tqdm(range(iterations)):
    start = batch_idx * BATCH_SIZE
    end = min(start + BATCH_SIZE, len(input_texts))

    batched_text = input_texts[start:end]
    # Move the inputs to the same device the model was placed on earlier in
    # the notebook; a hard-coded "cuda" crashes on CPU-only machines.
    batched_tokens = tokenizer(batched_text, return_tensors="pt", padding=True).to(device)

    # With padding=True the batch is padded to its longest sample, so a single
    # over-long text makes the whole batch exceed the limit. Such a batch is
    # skipped entirely: neither its label-0 nor its label-1 rows are recorded.
    # NOTE(review): 512 is presumably the model's maximum input length — confirm.
    if batched_tokens['input_ids'].shape[1] > 512:
        print("Large Sample Encountered")
        continue

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(**batched_tokens, num_beams=5, num_return_sequences=1)

    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Originals first, then the generated texts, for each batch.
    infos.extend({"text": txt, "label": 0} for txt in original_english[start:end])
    infos.extend({"text": txt, "label": 1} for txt in generated_texts)

In [None]:
# Turn the collected {"text", "label"} records into a DataFrame in one step.
df_final = pd.DataFrame(data=infos)

In [None]:
# Show how many labelled rows were produced.
df_final.shape[0]

In [None]:
# Write the labelled dataset to the working directory, omitting the index column.
df_final.to_csv(
    path_or_buf="./labelled_data_from_fr.csv",
    index=False,
)