In [None]:
!pip install transformers[sentencepiece] datasets evaluate sacremoses

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm
import os

In [None]:
# Download train.en from the Stanford NLP wmt14.en-de directory and save it
# locally as train_en.txt (read back by the loading cell).
!wget https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en -O train_en.txt

In [None]:
# Download train.de from the Stanford NLP wmt14.en-de directory and save it
# locally as train_de.txt (read back by the loading cell).
!wget https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de -O train_de.txt

In [None]:
# Load both downloaded files into memory, one list entry per line (each entry
# keeps its trailing newline, as returned by readlines()).
#
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8 and can fail
# or mis-decode non-ASCII characters in the corpus.
with open("./train_en.txt", encoding="utf-8") as f:
    lines_en = f.readlines()

with open("./train_de.txt", encoding="utf-8") as f:
    lines_de = f.readlines()

# The two counts should match: later cells pair lines by shared index.
print(len(lines_en), len(lines_de))

In [None]:
# Select sentence pairs from the parallel corpus.
#
# Scans the German lines from `start_index` onwards and keeps a pair when the
# German line has strictly more than `min_len` and strictly fewer than
# `max_len` space-separated tokens. Stops once `total_sample` pairs are kept.
#
# Outputs:
#   ids - absolute line numbers (into lines_de / lines_en) of the kept pairs
#   de  - the kept German lines
#   en  - the English lines at the same line numbers
ids = []
en = []
de = []

count = 0
start_index = 50000
min_len = 20
max_len = 50
total_sample = 70000

for line_no, line in enumerate(lines_de[start_index:], start=start_index):
    # Check the quota before taking another pair so that exactly
    # `total_sample` pairs are collected (a post-append `count > total_sample`
    # test lets one extra pair through).
    if count >= total_sample:
        break

    # Token count is computed once per line; split(" ") keeps the original
    # whitespace-token definition.
    n_tokens = len(line.split(" "))

    if min_len < n_tokens < max_len:
        ids.append(line_no)
        de.append(line)
        en.append(lines_en[line_no])
        count += 1


In [None]:
# Clean the selected sentences:
#   * delete every "##AT##-##AT##" marker
#     (NOTE(review): presumably a tokenizer hyphen marker; removing it can leave
#     a double space where it stood — confirm that is acceptable);
#   * strip leading/trailing whitespace. The lines were read with readlines()
#     and still end in "\n", whereas decoded model output does not; leaving the
#     newline in would make the reference/generated label trivially
#     recoverable from the last character of the text.
en = [e.replace("##AT##-##AT##", "").strip() for e in en]
de = [d.replace("##AT##-##AT##", "").strip() for d in de]

In [None]:
# Role-based aliases for the translation step: the German sentences are what
# gets fed to the model, the English sentences are the reference texts.
input_texts, original_english = de, en

In [None]:
# Number of input sentences tokenized and passed to model.generate() at a time.
BATCH_SIZE = 32

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Show the selected device as the cell output.
device

In [None]:
# Tokenizer and model are loaded from the same checkpoint name so that their
# vocabularies match (the name suggests a German -> English translation model).
MODEL_NAME = "Helsinki-NLP/opus-mt-de-en"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
# Place the model on the selected device; .to() returns the module itself, so
# rebinding the name keeps pointing at the same object.
model = model.to(device)
print("done")

In [None]:
# Translate `input_texts` in batches of BATCH_SIZE and build the labelled
# records in `infos`.
#
# For every processed batch, two groups of records are appended:
#   label 0 - the reference English sentences for the batch
#   label 1 - the model's translations of the batch's German sentences
# Every record carries "source_lan": "de".

# Ceiling division: a final, shorter batch is processed as well.
iterations = (len(input_texts) + BATCH_SIZE - 1) // BATCH_SIZE

infos = []

for batch_idx in tqdm(range(iterations)):
    start = batch_idx * BATCH_SIZE
    # Slicing clamps at the end of the list, so the last batch needs no
    # special-casing.
    end = start + BATCH_SIZE

    batched_text = input_texts[start:end]

    # Move the inputs to the same device the model was placed on; a hard-coded
    # "cuda" here fails when the notebook has fallen back to the CPU.
    batched_tokens = tokenizer(batched_text, return_tensors="pt", padding=True).to(device)

    # Skip the whole batch when its padded length exceeds 512 tokens
    # (presumably the model's maximum input length — TODO confirm). Neither the
    # references nor any translations are recorded for a skipped batch.
    if batched_tokens["input_ids"].shape[1] > 512:
        print("Large Sample Encountered")
        continue

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(**batched_tokens, num_beams=5, num_return_sequences=1)

    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Reference English sentences for this batch.
    infos.extend(
        {"text": txt, "label": 0, "source_lan": "de"}
        for txt in original_english[start:end]
    )

    # Model translations for this batch.
    infos.extend(
        {"text": txt, "label": 1, "source_lan": "de"}
        for txt in generated_texts
    )

In [None]:
# Build the table in one step from the collected record dicts; the dict keys
# ("text", "label", "source_lan") become the columns.
df_final = pd.DataFrame(data=infos)

In [None]:
# Number of rows (records) in the assembled table.
df_final.shape[0]

In [None]:
# Write the labelled records to disk; the row index is not written.
df_final.to_csv(path_or_buf="./labelled_data_from_de.csv", index=False)