In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import os
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Select the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
!pip install -U accelerate>=0.20.1
!pip install --upgrade accelerate

[0m

In [3]:
# Ask the notebook front end to restart the kernel so the freshly upgraded
# packages are picked up. HTML is imported from the public `IPython.display`
# module; importing it from `IPython.core.display` is deprecated.
# NOTE(review): this relies on the front end exposing a `Jupyter` JavaScript
# object; where it does not, the cell has no effect - confirm on the target platform.
from IPython.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")


In [4]:
! pip install datasets transformers[sentencepiece] sacrebleu
!apt install git-lfs

Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.7.0 sacrebleu-2.3.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 67 not upgraded.


In [5]:
from datasets import load_metric

# SacreBLEU metric object; compute_metrics() calls metric.compute() on the
# decoded predictions and references and reports its "score" as BLEU.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package - confirm against the installed version.
metric = load_metric("sacrebleu")

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [6]:
import pandas as pd

# Read the training and validation pairs. Both files are indexed below by the
# columns "eng" (source sentence) and "para" (paraphrase).
df = pd.read_csv("/kaggle/input/para-100/para_100k (1).csv")
df_valid = pd.read_csv("/kaggle/input/para-100/test(1).csv")

# Drop incomplete pairs BEFORE packing them. Once a row is wrapped in a dict the
# "Translate" cell itself is never null, so a dropna() executed afterwards can no
# longer remove rows whose "eng" or "para" is missing, and the NaN would reach
# the tokenizer. The index is rebuilt so it stays a contiguous range.
df = df.dropna(subset=["eng", "para"]).reset_index(drop=True)
df_valid = df_valid.dropna(subset=["eng", "para"]).reset_index(drop=True)

# Pack every pair into one {"eng": ..., "para": ...} dict in a "Translate" column.
df["Translate"] = df[["eng", "para"]].apply(lambda x: dict(zip(["eng", "para"], x)), axis=1)
df_valid["Translate"] = df_valid[["eng", "para"]].apply(lambda x: dict(zip(["eng", "para"], x)), axis=1)
len(df_valid)

100

In [7]:
# Keep only the packed "Translate" column in both frames.
# NOTE: after the drop the only remaining cells are dict objects, so dropna()
# removes a row only when the dict itself is missing, not when a value inside
# the dict is NaN.
_raw_columns = ["eng", "Unnamed: 0", "para"]

df = df.drop(columns=_raw_columns).dropna()
df_valid = df_valid.drop(columns=_raw_columns).dropna()

df

Unnamed: 0,Translate
0,{'eng': 'the second requirement is the one at ...
1,{'eng': 'he even brought his favorite buddy be...
2,{'eng': 'move it line up in front of mrs bowa'...
3,{'eng': 'the priest and the mage are playing g...
4,{'eng': 'more constructs in one place than had...
...,...
99995,{'eng': 'it was night and the lanterns were li...
99996,{'eng': 'in the interests of clarity that regu...
99997,{'eng': 'mara gasped for breath but none came ...
99998,{'eng': 'but you are right mandy bronson does ...


In [8]:
from datasets import load_dataset,Dataset,DatasetDict

# Convert both frames to `datasets` objects and bundle them as named splits.
# `data_train` ends up holding the whole DatasetDict ("train" + "validation"),
# which is what the later cells map over.
data_valid = Dataset.from_pandas(df_valid)
data_train = DatasetDict(
    {
        "train": Dataset.from_pandas(df),
        "validation": data_valid,
    }
)


In [9]:
max_input_length = 512
max_target_length = 512
source_lang = "eng"
target_lang = "para"

def preprocess_function(examples):
    """Tokenize a batch of source/paraphrase pairs for seq2seq training.

    Args:
        examples: a batch whose "Translate" entry is a list of dicts, each
            holding the source text under ``source_lang`` and the target text
            under ``target_lang``.

    Returns:
        The tokenizer output for the sources with an extra "labels" entry: the
        tokenized targets, where every padding position is replaced by -100.

    Relies on the module-level ``tokenizer`` being defined before this function
    is called.
    """
    inputs = [ex[source_lang] for ex in examples["Translate"]]
    targets = [ex[target_lang] for ex in examples["Translate"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding="max_length", truncation=True)

    # Targets are padded to a fixed length here, so the padding has to be masked
    # explicitly: -100 is the label value the loss ignores. Leaving the pad token
    # id in place would train the model on (mostly) padding. compute_metrics()
    # maps -100 back to the pad token id before decoding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer of the checkpoint that is fine-tuned further below. It is a
# module-level global read by preprocess_function() and compute_metrics().
tokenizer = AutoTokenizer.from_pretrained("Tri1/12-18-finetuned-eng-to-para")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

In [11]:
# Run preprocess_function over every split in batches; the tokenizer outputs
# and the "labels" entry it returns are added to each example.
tokenized_datasets = data_train.map(preprocess_function, batched=True)

  0%|          | 0/100 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# The raw text dicts are no longer needed once the examples are tokenized.
tokenized_datasets = tokenized_datasets.remove_columns(["Translate"])
# Switch the output format of the datasets to "torch".
tokenized_datasets.set_format("torch")

In [13]:
# Batch the *training split*. `tokenized_datasets` is a DatasetDict with the
# splits "train" and "validation"; a DataLoader has to wrap one split, not the
# dict of splits.
# NOTE(review): the Seq2SeqTrainer below is given the datasets directly and
# never reads `train_dataloader` - confirm whether this loader is still needed.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=32)

In [None]:
!pip install -U accelerate>=0.20.1
!pip install --upgrade accelerate

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Continue training from this checkpoint - the same identifier the tokenizer
# was loaded from earlier in the notebook.
model_checkpoint = "Tri1/12-18-finetuned-eng-to-para"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Training configuration for the Seq2SeqTrainer built below.
batch_size = 16
model_name = "18-24"
args = Seq2SeqTrainingArguments(
    # Output directory; built from the run name and the language pair.
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=6,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # the notebook fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

In [None]:
# Batch collator handed to the Seq2SeqTrainer below; it is built from the
# tokenizer and is also given the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Trim whitespace and shape the texts for the BLEU metric call.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        A tuple ``(preds, labels)``: the stripped predictions as a flat list,
        and the stripped references each wrapped in its own one-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays. When
            the predictions arrive as a tuple, only its first element is used.

    Returns:
        A dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.

    Relies on the module-level ``tokenizer``, ``metric`` and ``postprocess_text``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a masking value, not a vocabulary id, so it cannot be decoded.
    # Replace it in the labels AND in the predictions: predictions can also
    # carry -100 as padding, which would break decoding and be counted as
    # real tokens in gen_len.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
import os

from huggingface_hub import notebook_login
from huggingface_hub import login
import wandb

# SECURITY: credentials must never be written into the notebook. Any token that
# was ever committed in a notebook should be treated as leaked and revoked.
# Provide the values through the environment (e.g. populated from the platform's
# secret store) before running this cell.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # No token configured: fall back to the interactive login prompt.
    notebook_login()

# With key=None wandb uses its own lookup (WANDB_API_KEY or an interactive prompt).
wandb.login(key=os.environ.get("WANDB_API_KEY"))



In [None]:
# Wire the model, training arguments, tokenized splits, collator and metric
# callback into one trainer. Keyword arguments make each role explicit.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the trainer configured above (6 epochs with an
# evaluation after each epoch, per the training arguments).
trainer.train()

In [None]:
# Upload the result to the Hugging Face Hub; requires the login performed earlier.
trainer.push_to_hub()