## Install necessary packages

In [1]:
!pip install transformers datasets 

[0m

In [2]:
!pip install rouge.score nltk py7zr

Collecting rouge.score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting py7zr
  Downloading py7zr-0.20.2-py3-none-any.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.8/65.8 kB[0m [31m774.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pycryptodomex>=3.6.6
  Downloading pycryptodomex-3.15.0-cp35-abi3-manylinux2010_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting multivolumefile>=0.2.3
  Downloading multivolumefile-0.2.3-py3-none-any.whl (17 kB)
Collecting pybcj>=0.6.0
  Downloading pybcj-1.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyzstd>=0.14.4
  Downloading pyzstd-0.15.3-cp37-cp37m-manylinux_2_17_x86_6

# Fine tune with transformers

In [3]:
import transformers
from datasets import load_dataset, load_metric, load_from_disk
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading dataset error

Sometimes when loading the dataset wile in GPU enviorment it will give the error that it cannot find the *samsum* dataset. The workaround is to load the dataset while in CPU mode then save it localy or on you drive. After that just switch back to GPU and load the dataset from the local file using *load_from_disk()*

In [4]:
data = load_dataset('multi_news')
#data.save_to_disk('/content/samsum')
#data = load_from_disk("/content/drive/MyDrive/samsum")
metric = load_metric('rouge')
model_checkpoints = 'facebook/bart-large-xsum'

Downloading builder script:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/932 [00:00<?, ?B/s]

Downloading and preparing dataset multi_news/default (download: 245.06 MiB, generated: 667.72 MiB, post-processed: Unknown size, total: 912.78 MiB) to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2e145a8e21361ba4ee46fef70640ab946a3e8d425002f104d2cda99a9efca376...


Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Dataset multi_news downloaded and prepared to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2e145a8e21361ba4ee46fef70640ab946a3e8d425002f104d2cda99a9efca376. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

## Data tokenization

**max_input** and **max_target** can variy depending on the available computing power

In [5]:
max_input = 512
max_target = 128
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [6]:
def preprocess_data(data_to_process):
  #get the dialogue text
  inputs = [dialogue for dialogue in data_to_process['document']]
  #tokenize text
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)

  #tokenize labels
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['summary'], max_length=max_target, padding='max_length', truncation=True)
    
  model_inputs['labels'] = targets['input_ids']
  #reuturns input_ids, attention_masks, labels
  return model_inputs

In [7]:
tokenize_data = data.map(preprocess_data, batched = True, remove_columns=['document', 'summary'])

  0%|          | 0/45 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

## If memory problems

- sample data to smaller sizes

In [8]:
#sample the data
train_sample = tokenize_data['train'].shuffle(seed=123).select(range(1000))
validation_sample = tokenize_data['validation'].shuffle(seed=123).select(range(500))
test_sample = tokenize_data['test'].shuffle(seed=123).select(range(200))

In [9]:
tokenize_data['train'] = train_sample
tokenize_data['validation'] = validation_sample
tokenize_data['test'] = test_sample

In [10]:
tokenize_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

## Training process

In [11]:
#load model
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Depending on computing power, batch size can go as low as 1 if necessary

In [12]:
batch_size = 1

In [13]:
#collator to create batches. It preprocess data with the given tokenizer
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [14]:
#####################
# metrics
# compute rouge for evaluation 
#####################

def compute_rouge(pred):
  predictions, labels = pred
  #decode the predictions
  decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  #decode labels
  decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  #compute results
  res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
  #get %
  res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

  pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  res['gen_len'] = np.mean(pred_lens)

  return {k: round(v, 4) for k, v in res.items()}

In [15]:
args = transformers.Seq2SeqTrainingArguments(
    'conversation-summ',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size= 1,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    eval_accumulation_steps=1,
    fp16=True
    )
#only CUDA available -> fp16=True

In [16]:
trainer = transformers.Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

Using cuda_amp half precision backend


In [17]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 5000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.5438,2.369417,33.6152,12.1644,20.5924,20.607,61.966
2,1.8974,2.393723,34.1313,12.0167,20.6451,20.6264,61.972
3,1.4301,2.533672,33.4087,11.5914,20.3243,20.292,62.0
4,1.0596,2.78959,32.8686,11.105,19.9538,19.9386,62.0
5,0.7787,3.033906,33.252,11.2928,20.1062,20.0829,62.0
6,0.5598,3.272082,32.8982,10.9329,19.6514,19.6237,61.982
7,0.4073,3.484862,32.8605,10.5533,19.4988,19.4989,61.992
8,0.3032,3.679194,33.0673,10.9468,19.7503,19.746,62.0
9,0.2359,3.756907,32.7229,10.6749,19.5511,19.5647,61.972
10,0.1937,3.820121,32.9602,10.7775,19.8253,19.8242,62.0


Saving model checkpoint to conversation-summ/checkpoint-500
Configuration saved in conversation-summ/checkpoint-500/config.json
Model weights saved in conversation-summ/checkpoint-500/pytorch_model.bin
tokenizer config file saved in conversation-summ/checkpoint-500/tokenizer_config.json
Special tokens file saved in conversation-summ/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 500
  Batch size = 1
Saving model checkpoint to conversation-summ/checkpoint-1000
Configuration saved in conversation-summ/checkpoint-1000/config.json
Model weights saved in conversation-summ/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in conversation-summ/checkpoint-1000/tokenizer_config.json
Special tokens file saved in conversation-summ/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 500
  Batch size = 1
Saving model checkpoint to conversation-summ/checkpoint-1500
Configuration saved in conversation-summ/checkpo

TrainOutput(global_step=5000, training_loss=0.9409502334594727, metrics={'train_runtime': 7705.5192, 'train_samples_per_second': 1.298, 'train_steps_per_second': 0.649, 'total_flos': 1.083552301056e+16, 'train_loss': 0.9409502334594727, 'epoch': 10.0})

## Testing the fine tuned model

In [18]:
conversation = """
Rann: Hey Harry, how have you been? Long time no see!
Harry: Hey! What a surprise! 
Harry: Yes, you are right, we haven’t seen each other in a long time. How have you been?
Rann: There is an important campaign next week which is keeping me busy otherwise rest is going good in my life. 
Rann: How about you?
Harry: Oh! I just finished a meeting with a very important client of mine and now I finally have some free time. I feel relieved that I’m done with it.
Rann: Good for you then. Hey! Let’s make a plan and catch up with each other after next week. 
Rann: What do you say?
Harry: Sure, why not? Give me a call when you are done with your project.
Rann: Sure, then. 
Rann: Bye, take care.
Harry: Bye buddy.
"""

In [19]:
model_inputs = tokenizer(conversation,  max_length=max_input, padding='max_length', truncation=True)

In [20]:
model_inputs

{'input_ids': [0, 50118, 500, 2279, 35, 11468, 3268, 6, 141, 33, 47, 57, 116, 2597, 86, 117, 192, 328, 50118, 29345, 35, 11468, 328, 653, 10, 2755, 328, 1437, 50118, 29345, 35, 3216, 6, 47, 32, 235, 6, 52, 2220, 17, 27, 90, 450, 349, 97, 11, 10, 251, 86, 4, 1336, 33, 47, 57, 116, 50118, 500, 2279, 35, 345, 16, 41, 505, 637, 220, 186, 61, 16, 2396, 162, 3610, 3680, 1079, 16, 164, 205, 11, 127, 301, 4, 1437, 50118, 500, 2279, 35, 1336, 59, 47, 116, 50118, 29345, 35, 5534, 328, 38, 95, 1550, 10, 529, 19, 10, 182, 505, 3653, 9, 4318, 8, 122, 38, 1747, 33, 103, 481, 86, 4, 38, 619, 15126, 14, 38, 17, 27, 119, 626, 19, 24, 4, 50118, 500, 2279, 35, 2497, 13, 47, 172, 4, 11468, 328, 2780, 17, 27, 29, 146, 10, 563, 8, 2916, 62, 19, 349, 97, 71, 220, 186, 4, 1437, 50118, 500, 2279, 35, 653, 109, 47, 224, 116, 50118, 29345, 35, 9136, 6, 596, 45, 116, 12192, 162, 10, 486, 77, 47, 32, 626, 19, 110, 695, 4, 50118, 500, 2279, 35, 9136, 6, 172, 4, 1437, 50118, 500, 2279, 35, 36255, 6, 185, 575, 4, 501

In [21]:
raw_pred, _, _ = trainer.predict([model_inputs])

***** Running Prediction *****
  Num examples = 1
  Batch size = 1


In [22]:
raw_pred

array([[    2,     0,     0,     0,  2383,   572,    10,   251,  5171,
            9,    55,    87,    10,   353,     6,    41,   470,  9716,
        17065,    11,     5,  2367,   953,  2312,     7,  1349,   159,
           39,   793, 23279,    11,   188,  3324,     8,  2755,   123,
           19,    10,  1028,   486,     4,  2595,     5, 12400,     6,
            5,  9716,     6,  2006,   129,    25,   248,  2279,     6,
         7173,     5,  1028,    71,  3357,   184,    71,     2]])

In [23]:
tokenizer.decode(raw_pred[0])

'</s><s><s><s>– After a long absence of more than a month, an American soldier stationed in the Middle East managed to track down his old buddy in New Zealand and surprise him with a phone call. Per the Telegraph, the soldier, identified only as Rann, answered the phone after returning home after</s>'

In [24]:
tokenizer.decode(raw_pred[0])

'</s><s><s><s>– After a long absence of more than a month, an American soldier stationed in the Middle East managed to track down his old buddy in New Zealand and surprise him with a phone call. Per the Telegraph, the soldier, identified only as Rann, answered the phone after returning home after</s>'