In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import transformers
from transformers import AutoTokenizer

In [None]:
# Vectorize the data: read tab-separated sentence pairs and collect the
# character vocabularies of both sides.
batch_size = 64
latent_dim = 256
num_samples = 10000  # upper bound on the number of sentence pairs read

input_texts = []
target_texts = []
input_chars = set()
target_chars = set()

with open('/content/sample_data/fra.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# The final element of `lines` is always skipped (presumably the empty string
# that follows the file's trailing newline).
for line in lines[: min(num_samples, len(lines) - 1)]:
    # Keep only the first two tab-separated fields so that files carrying an
    # extra trailing column (e.g. an attribution field) still load instead of
    # failing the two-name unpacking.
    input_text, target_text = line.split('\t')[:2]
    # Wrap the target in a leading tab and a trailing newline.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character of the string; duplicates are ignored.
    input_chars.update(input_text)
    target_chars.update(target_text)

input_chars = sorted(input_chars)
target_chars = sorted(target_chars)
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
# default=0 keeps an empty data file from raising ValueError here.
max_encoder_seq_length = max((len(txt) for txt in input_texts), default=0)
max_decoder_seq_length = max((len(txt) for txt in target_texts), default=0)

# Print size
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 10000
Number of unique input tokens: 71
Number of unique output tokens: 93
Max sequence length for inputs: 16
Max sequence length for outputs: 59


In [None]:
import re
# Scratch check: remove the tab / newline characters that were wrapped around
# each target sentence when the data was loaded.
regex = "(\\t|\\n)"
re.sub(regex, "","\tVoyez ci-dessus.\n")

'Voyez ci-dessus.'

In [None]:
# Matches a tab, a newline, or a narrow no-break space (U+202F) that is
# immediately followed by "!".
regex = "(\\t|\\n|\\u202f!)"
def rem_tokens(string):
    """Return `string` with every match of the module-level `regex` removed."""
    pattern = re.compile(regex)
    cleaned = pattern.sub("", string)
    return cleaned



In [None]:
# Clean every target sentence with rem_tokens, preserving order.
processed_text = [rem_tokens(text) for text in target_texts]

In [None]:
len(processed_text)

10000

In [None]:
def dict_conversion(inp):
    """Turn an indexable (english, french) pair into an ``{"en", "fr"}`` dict.

    Args:
        inp: any object supporting ``inp[0]`` and ``inp[1]``.

    Returns:
        ``{"en": inp[0], "fr": inp[1]}``.
    """
    english = inp[0]
    french = inp[1]
    record = {"en": english}
    record["fr"] = french
    return record

In [None]:
dict_conversion(("good morning", "bonjour"))

{'en': 'good morning', 'fr': 'bonjour'}

In [None]:
# Pair each English sentence with its cleaned French counterpart, then turn
# every pair into an {"en": ..., "fr": ...} record.
zipped = [(english, french) for english, french in zip(input_texts, processed_text)]
dataset = [dict_conversion(pair) for pair in zipped]



In [None]:
dataset

[{'en': 'Go.', 'fr': 'Va !'},
 {'en': 'Run!', 'fr': 'Cours'},
 {'en': 'Run!', 'fr': 'Courez'},
 {'en': 'Wow!', 'fr': 'Ça alors'},
 {'en': 'Fire!', 'fr': 'Au feu !'},
 {'en': 'Help!', 'fr': "À l'aide"},
 {'en': 'Jump.', 'fr': 'Saute.'},
 {'en': 'Stop!', 'fr': 'Ça suffit'},
 {'en': 'Stop!', 'fr': 'Stop'},
 {'en': 'Stop!', 'fr': 'Arrête-toi !'},
 {'en': 'Wait!', 'fr': 'Attends !'},
 {'en': 'Wait!', 'fr': 'Attendez !'},
 {'en': 'I see.', 'fr': 'Je comprends.'},
 {'en': 'I try.', 'fr': "J'essaye."},
 {'en': 'I won!', 'fr': "J'ai gagné !"},
 {'en': 'I won!', 'fr': "Je l'ai emporté !"},
 {'en': 'Oh no!', 'fr': 'Oh non !'},
 {'en': 'Attack!', 'fr': 'Attaque !'},
 {'en': 'Attack!', 'fr': 'Attaquez !'},
 {'en': 'Cheers!', 'fr': 'Santé !'},
 {'en': 'Cheers!', 'fr': 'À votre santé !'},
 {'en': 'Cheers!', 'fr': 'Merci !'},
 {'en': 'Cheers!', 'fr': 'Tchin-tchin !'},
 {'en': 'Get up.', 'fr': 'Lève-toi.'},
 {'en': 'Got it!', 'fr': "J'ai pigé !"},
 {'en': 'Got it!', 'fr': 'Compris !'},
 {'en': 'Got it?

In [None]:
import transformers
print(transformers.__version__)

4.26.0


In [None]:
# Hub identifier of the pretrained checkpoint used for the tokenizer and model.
# NOTE(review): the name suggests an English->Romanian model, while the data
# and the fine-tuned output name in this notebook are English->French —
# confirm this choice is intentional.
model_checkpoint = "Helsinki-NLP/opus-mt-en-ro"

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sacrebleu
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_metric

# SacreBLEU scorer; `compute_metrics` calls `metric.compute(...)` on it.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm before
# upgrading dependencies.
metric = load_metric("sacrebleu")

  metric = load_metric("sacrebleu")


In [None]:
# Sanity check of the metric call signature: predictions are a list of
# strings, references a list of lists of strings (one inner list per example).
# The recorded output reports score 0.0 even though predictions equal the
# references: the 3-gram and 4-gram counts are zero for two-word sentences.
fake_preds = ["hello there", "general kenobi"]
fake_labels = [["hello there"], ["general kenobi"]]
metric.compute(predictions=fake_preds, references=fake_labels)

{'score': 0.0,
 'counts': [4, 2, 0, 0],
 'totals': [4, 2, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 1.0,
 'sys_len': 4,
 'ref_len': 4}

In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)olve/main/source.spm:   0%|          | 0.00/789k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/817k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



In [None]:
# Only when the checkpoint name contains "mbart": set explicit source/target
# language codes on the tokenizer.
# NOTE(review): the target code here is "ro-RO" although the notebook's data is
# French — adjust if an mBART checkpoint is ever used.
if "mbart" in model_checkpoint:
    tokenizer.src_lang = "en-XX"
    tokenizer.tgt_lang = "ro-RO"

In [None]:
tokenizer("Hello, this one sentence!")

{'input_ids': [125, 778, 3, 63, 141, 9191, 23, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[125, 778, 3, 63, 141, 9191, 23, 0], [187, 32, 716, 9191, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [None]:
# Prepend a task instruction only for the listed T5 checkpoints; every other
# checkpoint gets an empty prefix.
# ("t5-large" was previously misspelled "t5-larg" and could never match.)
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "translate English to French: "
else:
    prefix = ""

In [None]:
max_input_length = 128   # truncation limit (in tokens) for source sentences
max_target_length = 128  # truncation limit (in tokens) for target sentences
source_lang = "en"
target_lang = "fr"

def preprocess_function(examples):
    """Tokenize translation pairs and attach the target ids as ``labels``.

    Args:
        examples: either a sequence of ``{source_lang: str, target_lang: str}``
            dicts, or a mapping of column name -> list of strings (the layout a
            batched ``map`` call passes in).

    Returns:
        The tokenizer output for the prefixed source sentences, with an extra
        ``"labels"`` entry holding the ``input_ids`` of the tokenized targets.
    """
    from collections.abc import Mapping

    if isinstance(examples, Mapping):
        # Column-oriented batch: iterating it would yield column names, so
        # read the two language columns directly.
        sources = list(examples[source_lang])
        targets = list(examples[target_lang])
    else:
        sources = [ex[source_lang] for ex in examples]
        targets = [ex[target_lang] for ex in examples]

    inputs = [prefix + text for text in sources]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets with the target-side vocabulary. `text_target=`
    # replaces the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
source_lang = "en"
target_lang = "fr"
def preprocess_func(examples):
    """Tokenize a sequence of translation-pair dicts and attach ``labels``.

    Same logic as ``preprocess_function``, applied to a plain Python sequence.

    Args:
        examples: iterable of ``{source_lang: str, target_lang: str}`` dicts.

    Returns:
        The tokenizer output for the prefixed source sentences, with an extra
        ``"labels"`` entry holding the ``input_ids`` of the tokenized targets.
    """
    inputs = [prefix + ex[source_lang] for ex in examples]
    targets = [ex[target_lang] for ex in examples]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes with the target-side vocabulary and replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
import numpy as np

In [None]:
np_dataset = np.array(dataset)

In [None]:
np_dataset

array([{'en': 'Go.', 'fr': 'Va !'}, {'en': 'Run!', 'fr': 'Cours'},
       {'en': 'Run!', 'fr': 'Courez'}, ...,
       {'en': "I'm the surgeon.", 'fr': 'Je suis le chirurgien.'},
       {'en': "I'm the teacher.", 'fr': 'Je suis le professeur.'},
       {'en': "I'm the teacher.", 'fr': 'Je suis la professeur.'}],
      dtype=object)

In [None]:
# NOTE(review): `raw_datasets` is not defined in any cell visible in this
# notebook, so this cell presumably raises NameError on a fresh kernel; the
# later cells shown here use `tokenized_dataset` (singular) instead — confirm
# whether this cell is still needed.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [None]:
# Tokenize every sentence pair in `dataset`; the result holds parallel
# "input_ids" / "labels" lists. (The bare `tokenized_dataset =` was a
# SyntaxError.)
tokenized_dataset = preprocess_func(dataset)

In [None]:
tokenized_dataset = preprocess_func(dataset)



In [None]:
len(tokenized_dataset['input_ids'])

10000

In [None]:
len(tokenized_dataset['labels'])

10000

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
# Per-device batch size used for both training and evaluation.
batch_size = 16
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
print(model_name)

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy = "epoch",
    predict_with_generate=True,
    save_total_limit=3,
    #fp16=True,
    push_to_hub=True,
)

opus-mt-en-ro


In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and wrap each label in its own list.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        ``(stripped_preds, wrapped_labels)`` where ``wrapped_labels`` holds a
        one-element list per reference.
    """
    stripped_preds = []
    for pred in preds:
        stripped_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays
            (presumably supplied by the trainer — confirm against the caller).

    Returns:
        ``{"bleu": ..., "gen_len": ...}`` with both values rounded to 4 places.
    """
    preds, labels = eval_preds
    # Keep only the first element when predictions arrive as a tuple.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100 cannot be decoded, so swap it for the pad id before decoding.
    labels = np.where(labels != -100, labels, pad_id)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Light clean-up: strip whitespace, one reference list per example.
    pred_texts, label_texts = postprocess_text(pred_texts, label_texts)

    bleu = metric.compute(predictions=pred_texts, references=label_texts)

    # Generated length = number of non-pad tokens in each prediction.
    gen_lengths = [np.count_nonzero(pred != pad_id) for pred in preds]
    scores = {"bleu": bleu["score"], "gen_len": np.mean(gen_lengths)}

    return {name: round(value, 4) for name, value in scores.items()}

In [None]:
from sklearn.utils import shuffle

# Parallel lists: source token ids (X) and target token ids (y).
X = tokenized_dataset['input_ids']
y = tokenized_dataset['labels']

# Shuffle both lists together so pairs stay aligned; the fixed random_state
# makes the order repeatable across runs.
X,y = shuffle(X,y, random_state = 0)

In [None]:
def _as_examples(input_ids, labels):
    """Zip parallel token-id lists into one feature dict per example."""
    return [{"input_ids": ids, "labels": lab} for ids, lab in zip(input_ids, labels)]

# Each split must be an integer-indexable sequence of per-example feature
# dicts. A single dict of two lists has length 2 and cannot be indexed by
# position, which is why the recorded run showed "Num examples = 2" and then
# raised KeyError.
train_data = _as_examples(X[:8000], y[:8000])          # first 8000 shuffled pairs
val_data = _as_examples(X[8000:9000], y[8000:9000])    # next 1000
test_data = _as_examples(X[9000:], y[9000:])           # remainder

In [None]:
# final_data = {"input_ids": X, "labels":y}
# final_data

In [None]:
!pip install huggingface_hub
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Wire the model, training arguments, data splits, collator, tokenizer and
# metric callback into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/tphuntsho/opus-mt-en-ro-finetuned-en-to-fr into local empty directory.


In [None]:
trainer.train()

***** Running training *****
  Num examples = 2
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1
  Number of trainable parameters = 74624512


KeyError: ignored

In [None]:
def change_transformers_dataset_2_right_format(dataset, label_name):
    """Copy the ``label_name`` column into a ``'label'`` column and drop the original.

    Args:
        dataset: object exposing ``map(function, remove_columns=...)``.
        label_name: name of the column whose values become ``'label'``.

    Returns:
        Whatever ``dataset.map`` returns.
    """
    def _relabel(example):
        # Emit the value under the new key; the old column is dropped by map.
        return {'label': example[label_name]}

    columns_to_drop = [label_name]
    return dataset.map(_relabel, remove_columns=columns_to_drop)

AttributeError: ignored

[[3153, 233, 7, 21, 243, 772, 124, 218, 296, 21965, 145, 2, 0],
 [14239, 532, 28565, 26632, 2, 0],
 [1610, 8, 1040, 25, 1079, 2, 0],
 [453, 1807, 8, 9, 25, 9, 430, 1325, 13619, 5, 11, 5617, 15, 6, 0],
 [162, 7, 1118, 227, 137, 627, 2, 0]]