### SETTING UP COLAB ENVIRONMENT

In [None]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install nltk
!pip install evaluate
!pip install pynvml 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, https://u

In [None]:
import transformers
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import numpy as np
import os
import nltk
import torch
import evaluate
import sys
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

In [None]:
# Fetch the Punkt tokenizer data; nltk.word_tokenize is used further down to measure text lengths
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# define utils functions to facilitate gpu 

def check_gpu_availability():
    """Print whether PyTorch reports CUDA as available."""
    cuda_ready = torch.cuda.is_available()
    print(f"Cuda is available: {cuda_ready}")

def getting_device(gpu_prefence=True) -> torch.device:
    """
    Pick the torch device to run computations on and print the choice.

    Args:
        gpu_prefence: when truthy, CUDA is selected if it is available;
            in every other case the CPU is selected.

    Returns:
        torch.device: the ``cuda`` device when requested and available,
        otherwise the ``cpu`` device.
    """
    use_cuda = gpu_prefence and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    print(f"Selected device: {device}")
    return device

# Report how much memory is currently in use on the first GPU
def print_gpu_utilization():
    """Print the used memory of the GPU at index 0 (bytes floor-divided by 1024**2)."""
    # Initialise NVML before querying the device
    nvmlInit()
    gpu0 = nvmlDeviceGetHandleByIndex(0)
    used_bytes = nvmlDeviceGetMemoryInfo(gpu0).used
    print(f"GPU memory occupied: {used_bytes//1024**2} MB.")

# Summarise a finished training run
def print_summary(result):
    """
    Print runtime, throughput and current GPU memory use for a training run.

    Args:
        result: object exposing a ``metrics`` mapping that contains the keys
            ``train_runtime`` and ``train_samples_per_second``.
    """
    stats = result.metrics
    print(f"Time: {stats['train_runtime']:.2f}")
    print(f"Samples/second: {stats['train_samples_per_second']:.2f}")
    # Finish with the GPU memory currently in use
    print_gpu_utilization()

In [None]:
# Sanity check: on a GPU runtime this should print "Cuda is available: True"
check_gpu_availability()

Cuda is available: True


In [None]:
# Should print "Selected device: cuda" on a GPU runtime; `device` keeps the chosen torch.device
device = getting_device()

Selected device: cuda


In [None]:
# Baseline GPU memory before any model has been loaded in this notebook — should be really small
print_gpu_utilization()

GPU memory occupied: 253 MB.


### GETTING DATA INTO DATASET

In [None]:
# Paths of the train / validation / test JSON files (nothing is read yet)
train_file, dev_file, test_file = (
    '/content/community_train.json',
    '/content/community_val.json',
    '/content/community_test.json',
)

In [None]:
# Read the three JSON files as the 'train', 'valid' and 'test' splits of one dataset object
split_files = {'train': train_file, 'valid': dev_file, 'test': test_file}
dataset = load_dataset('json', data_files=split_files)



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-3ffcb07f41d930c3/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-3ffcb07f41d930c3/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# let's check the data
# `how` is the column-name prefix that selects which "<prefix>Knowledge Graph"
# column is read (e.g. "Instances " -> 'Instances Knowledge Graph').
how = "Instances "
# Story text of the first test example
dataset['test'][0]['story']

'Once upon a time there was an influencer whose name was Margaery Tyrell, which was a Queen Consort of House Tyrell. When Daenerys Targaryen threatened the castle of Margaery Tyrell, she had a happy and peaceful life. Our hero was too weak, but with the help of Mace Tyrell, Margaery Tyrell learned the power of gravity control. On the summer solstice in Castle Black, Margaery Tyrell met Alerie Tyrell and formed an alliance. After traveling throughout all the kingdom, Margaery Tyrell finally found Daenerys Targaryen in the Bear Island with the help of Alerie Tyrell A bloody battle began but Margaery Tyrell got the better of Daenerys Targaryen thanks to gravity control powers. After the victory, the castle was no longer under threat. Margaery Tyrell and Alerie Tyrell celebrated by making fireworks at the Ruby Ford.'

In [None]:
# Knowledge-graph string of the first test example, taken from the column selected by `how`
dataset['test'][0][f'{how}Knowledge Graph']

'Final confrontation between hero and villain - Place5 - Bear Island | Margaery Tyrell - helpedby - Alerie Tyrell | Margaery Tyrell - fights - Daenerys Targaryen | Margaery Tyrell - hasHouse - House Tyrell | Margaery Tyrell - hasOccupation - an influencer | Margaery Tyrell - celebratesvictory - making fireworks | Margaery Tyrell - saves - the castle | Margaery Tyrell - meetsMentor - Mace Tyrell | Margaery Tyrell - hasTitle - Queen consort | The hero meets ally - Time4 - the Summer Solstice | Margaery Tyrell - meetsAlly - Alerie Tyrell | Daenerys Targaryen - Threatens - the castle | Final confrontation between hero and villain - Time5 - the Summer Solstice | Margaery Tyrell - powerLearned - gravity Control | The hero meets ally - Place4 - Castle Black | Margaery Tyrell - usespower - gravity Control | Margaery Tyrell - partywith - Alerie Tyrell | '

In [None]:
# Overview of the splits with their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['story', 'Instances Knowledge Graph', 'Class Knowledge Graph', 'Types Knowledge Graph', 'Range Knowledge Graph', 'Event Knowledge Graph', 'Ontology Knowledge Graph'],
        num_rows: 1000
    })
    valid: Dataset({
        features: ['story', 'Instances Knowledge Graph', 'Class Knowledge Graph', 'Types Knowledge Graph', 'Range Knowledge Graph', 'Event Knowledge Graph', 'Ontology Knowledge Graph'],
        num_rows: 100
    })
    test: Dataset({
        features: ['story', 'Instances Knowledge Graph', 'Class Knowledge Graph', 'Types Knowledge Graph', 'Range Knowledge Graph', 'Event Knowledge Graph', 'Ontology Knowledge Graph'],
        num_rows: 100
    })
})

### TOKENIZATION

In [None]:
# Length budgets: longest of the first 50 test examples, measured in NLTK word
# tokens, plus a fixed margin (+50 for stories, +20 for knowledge graphs).
# NOTE(review): preprocess_data passes these values as `max_length` to the model
# tokenizer, which counts its own token ids rather than NLTK words — confirm
# the margins are large enough to avoid unwanted truncation.
probe_rows = [dataset['test'][i] for i in range(50)]
story_lengths = [len(nltk.word_tokenize(row['story'])) for row in probe_rows]
graph_lengths = [len(nltk.word_tokenize(row[f'{how}Knowledge Graph'])) for row in probe_rows]
max_target = np.max(story_lengths) + 50
max_input = np.max(graph_lengths) + 20
max_target, max_input

(220, 183)

In [None]:
from transformers import LEDTokenizer  # NOTE(review): not used in the cells visible here — confirm it is needed later
# model_checkpoints = 'facebook/bart-base'
# Load the pretrained tokenizer of the "t5-small" checkpoint
tokenizer = AutoTokenizer.from_pretrained("t5-small")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
prompt = "Final confrontation between hero and villain - Place5 - Bear Island | Margaery Tyrell - helpedby - Alerie Tyrell | Margaery Tyrell - fights - Daenerys Targaryen | Margaery Tyrell - hasHouse - House Tyrell | Margaery Tyrell - hasOccupation - an influencer | Margaery Tyrell - celebratesvictory - making fireworks | Margaery Tyrell - saves - the castle | Margaery Tyrell - meetsMentor - Mace Tyrell | Margaery Tyrell - hasTitle - Queen consort | The hero meets ally - Time4 - the Summer Solstice | Margaery Tyrell - meetsAlly - Alerie Tyrell | Daenerys Targaryen - Threatens - the castle | Final confrontation between hero and villain - Time5 - the Summer Solstice | Margaery Tyrell - powerLearned - gravity Control | The hero meets ally - Place4 - Castle Black | Margaery Tyrell - usespower - gravity Control | Margaery Tyrell - partywith - Alerie Tyrell | "
# Encode the whitespace-split prompt; `is_split_into_words=True` marks the list
# as one pre-split sequence. (A bare `prompt.split()` statement that discarded
# its result was removed here.)
a = tokenizer(prompt.split(), is_split_into_words = True)
# Show the resulting token ids
print(a["input_ids"])

[6514, 25704, 344, 160, 32, 11, 23132, 3, 18, 3399, 755, 3, 18, 9034, 2834, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 2139, 969, 3, 18, 901, 4074, 24652, 3820, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 2870, 7, 3, 18, 878, 35, 4203, 7, 5509, 1478, 63, 35, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 65, 4489, 1074, 3, 18, 1384, 24652, 3820, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 65, 667, 75, 4658, 257, 3, 18, 46, 2860, 52, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 4036, 7, 7287, 10972, 3, 18, 492, 23806, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 1097, 7, 3, 18, 8, 13243, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 7864, 329, 295, 127, 3, 18, 2143, 15, 24652, 3820, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 65, 382, 155, 109, 3, 18, 5286, 975, 9309, 1820, 37, 160, 32, 7864, 3, 1427, 3, 18, 2900, 591, 3, 18, 8, 5550, 5175, 7, 1225, 15, 1820, 16409, 9, 4203, 24652, 3820, 3, 18, 7864, 6838, 63, 3, 18, 901, 4074, 24652, 3820, 1820, 878, 35, 4203, 7, 5509, 1478, 63, 35, 3, 18, 30980, 35, 7, 3, 

In [None]:
# Show the text piece behind every token id of the encoded prompt
for token_id in a["input_ids"]:
  piece = tokenizer.decode(token_id)
  print(f"{token_id} becomes {piece}")

6514 becomes Final
25704 becomes confrontation
344 becomes between
160 becomes her
32 becomes o
11 becomes and
23132 becomes villain
3 becomes 
18 becomes -
3399 becomes Place
755 becomes 5
3 becomes 
18 becomes -
9034 becomes Bear
2834 becomes Island
1820 becomes |
16409 becomes Marg
9 becomes a
4203 becomes ery
24652 becomes Tyr
3820 becomes ell
3 becomes 
18 becomes -
2139 becomes helped
969 becomes by
3 becomes 
18 becomes -
901 becomes Al
4074 becomes erie
24652 becomes Tyr
3820 becomes ell
1820 becomes |
16409 becomes Marg
9 becomes a
4203 becomes ery
24652 becomes Tyr
3820 becomes ell
3 becomes 
18 becomes -
2870 becomes fight
7 becomes s
3 becomes 
18 becomes -
878 becomes Da
35 becomes en
4203 becomes ery
7 becomes s
5509 becomes Tar
1478 becomes gar
63 becomes y
35 becomes en
1820 becomes |
16409 becomes Marg
9 becomes a
4203 becomes ery
24652 becomes Tyr
3820 becomes ell
3 becomes 
18 becomes -
65 becomes has
4489 becomes Ho
1074 becomes use
3 becomes 
18 becomes -
1384 beco

In [None]:
def preprocess_data(data_to_process, *, ignore_pad_token_for_loss=False):
  """
  Tokenize a batch of examples for sequence-to-sequence training.

  The knowledge-graph column selected by the global `how` prefix is used as
  the model input and the `story` column as the target. Both are padded and
  truncated to the global `max_input` / `max_target` lengths.

  Args:
    data_to_process: mapping from column name to a list of strings; must
      contain the f'{how}Knowledge Graph' and 'story' columns.
    ignore_pad_token_for_loss: keyword-only. When True, every padding id in
      the labels is replaced by -100, the label value that Hugging Face
      seq2seq models exclude from the loss. Code that decodes such labels
      must map -100 back to `tokenizer.pad_token_id` first. Defaults to
      False, which keeps the raw padding ids in the labels.

  Returns:
    The tokenizer output for the inputs, extended with a 'labels' entry that
    holds the target token ids.
  """
  # knowledge-graph strings are the model inputs
  inputs = list(data_to_process[f'{how}Knowledge Graph'])
  model_inputs = tokenizer(inputs, max_length=max_input, padding='max_length', truncation=True)

  # stories are the targets
  targets = list(data_to_process['story'])
  model_targets = tokenizer(targets, max_length=max_target, padding='max_length', truncation=True)

  labels = model_targets['input_ids']
  if ignore_pad_token_for_loss:
    # padded positions carry no information, so mark them to be skipped by the loss
    pad_id = tokenizer.pad_token_id
    labels = [[(tok if tok != pad_id else -100) for tok in seq] for seq in labels]

  model_inputs['labels'] = labels
  # model_inputs now carries the tokenizer's fields for the inputs plus 'labels'
  return model_inputs