In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets



In [2]:
!unzip COMBINED.zip

Archive:  COMBINED.zip
replace COMBINED/train.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [3]:
import tensorflow as tf
tf.__version__

'2.8.0'

In [4]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import create_optimizer, AdamWeightDecay
from transformers import TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq


In [7]:
df_test = pd.read_csv('COMBINED/test.tsv', sep = '\t', names=['in', 'expected'])
df_test

Unnamed: 0,in,expected
0,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...
1,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...
2,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
3,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
4,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...
...,...,...
5172,Twice Sparrow sold the island twice to Thomas ...,Sparrow twice sold the island to Thomas Polloc...
5173,The name in Tupi means `` insensitive stone ''...,"The name in Tupi means '' hard stone `` , '' i..."
5174,"The company has branches in Tokyo , based in t...",The company has branches in Tokyo based in Sai...
5175,The modern coat of arms of Bavaria was designe...,The modern coat of arms of Bavaria was designe...


In [8]:
df_train = pd.read_csv('COMBINED/train.tsv', sep = '\t', names=['in', 'expected'])
df_train

Unnamed: 0,in,expected
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,"Referring to him as only ""the witness"", Amrozi...","Amrozi accused his brother, whom he called ""th..."
2,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
3,Yucaipa bought Dominick's in 1995 for $693 mil...,Yucaipa owned Dominick's before selling the ch...
4,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
...,...,...
58613,"Tommy Connolly , who plays Rory Jennings , pla...","Tommy Connolly , who plays Rory Jennings , pla..."
58614,"Monroe Meadows , in Yosemite valley near Brida...","Monroe Meadows , in Yosemite Valley near Brida..."
58615,"Monroe Meadows , in Yosemite Valley near Brida...","Monroe Meadows , in Yosemite valley near Brida..."
58616,In 2014 the site launched iOS and Android appl...,In 2014 launched the site iOS and Android - ap...


In [9]:
dataset_test = Dataset.from_pandas(df_test)
dataset_train = Dataset.from_pandas(df_train)

In [57]:
init_weights = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(init_weights)

def preprocess_function(examples):
    prefix = 'paraphrase: '
    inputs = [prefix + doc for doc in examples["in"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["expected"], max_length=512, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_test = dataset_test.map(preprocess_function, batched=True)
tokenized_train = dataset_train.map(preprocess_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/59 [00:00<?, ?ba/s]

In [58]:
tokenized_train_small = tokenized_train.shuffle(seed=42).select(range(30000))
tokenized_test_small = tokenized_test.shuffle(seed=42).select(range(1500))

In [59]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(init_weights)
optimizer = AdamWeightDecay(learning_rate=1e-4, weight_decay_rate=0.01)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [60]:
tf_train_set = tokenized_train_small.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)
tf_test_set = tokenized_test_small.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [61]:
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [62]:
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=1)




<keras.callbacks.History at 0x7f76d58b5610>

In [67]:
model.save_pretrained('checkpoints/t5savedv4')


In [75]:
!zip -r 'checkpoints.zip' 'checkpoints/'

  adding: checkpoints/ (stored 0%)
  adding: checkpoints/t5savedv4/ (stored 0%)
  adding: checkpoints/t5savedv4/tf_model.h5 (deflated 9%)
  adding: checkpoints/t5savedv4/config.json (deflated 63%)
  adding: checkpoints/t5saved/ (stored 0%)
  adding: checkpoints/t5saved/tf_model.h5 (deflated 8%)
  adding: checkpoints/t5saved/config.json (deflated 63%)
  adding: checkpoints/t5savedv2/ (stored 0%)
  adding: checkpoints/t5savedv2/tf_model.h5 (deflated 8%)
  adding: checkpoints/t5savedv2/config.json (deflated 63%)
  adding: checkpoints/t5savedv3/ (stored 0%)
  adding: checkpoints/t5savedv3/tf_model.h5 (deflated 8%)
  adding: checkpoints/t5savedv3/config.json (deflated 62%)


In [70]:
model = TFAutoModelForSeq2SeqLM.from_pretrained('checkpoints/t5savedv4')


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at checkpoints/t5savedv4.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# inference

In [74]:
for idx, row in df_test.head(15).iterrows():
    print('Input: ', row['in'])
    to_model = 'paraphrase: ' + row['in']

    # sentence = 'paraphrase: We should go to the movies today because it is raining.'
    encoding = tokenizer(row['in'], return_tensors="tf")
    input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]
    out = model.generate(input_ids=input_ids, do_sample=True, attention_mask=attention_masks, max_length=512,
                         top_k=250, top_p=0.99, early_stopping=True, num_return_sequences=5)
    # print(out)
    for p in out:
      result = tokenizer.decode(p, skip_special_tokens=True)
      print('Prediction: ', result)
    print('Expected: ', row['expected'])
    print()

Input:  PCCW's chief operating officer, Mike Butcher, and Alex Arena, the chief financial officer, will report directly to Mr So.
Prediction:  Mike Butcher, chief operating officer of PCCW, and Alex Arena, chief financial officer, will report directly to Mr So.
Prediction:  Mike Butcher, PCCW's chief operating officer, and Alex Arena, the chief financial officer, will head the finance department to him.
Prediction:  Mike Butcher, PCCW's chief operating officer, and Alex Arena, the chief financial officer, will report directly to Mr. So.
Prediction:  The executive of PCCW, Mike Butcher, and Mike Butcher, Chief Financial Officer of PCCW, will report directly to Mr So.
Prediction:  Mike Butcher, chief operating officer of PCCW, and Alex Arena, chief financial officer of PCCW, will erected direct reports for you.
Expected:  Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So.

Input:  The world's two largest automakers said their U.S.