In [1]:
import random
import datasets
import pandas as pd

from pathlib import Path
from IPython.display import display, HTML

from datasets import load_dataset
from frame.framenet import data_paths
from transformers import AutoTokenizer

In [2]:
def show_random_examples(dataset, num_examples=5):
    """Display `num_examples` of the `dataset`
    
    Args:
        dataset: HuggingFace DatasetDict from preprocessed framenet
        num_examples: number of random examples to display
    """
    samples = []
    for _ in range(num_examples):
        example = random.randint(0, len(dataset)-1)
        samples.append(example)
    
    df = pd.DataFrame(dataset[samples])
    display(HTML(df.to_html()))

In [3]:
# root path of json data preprocessed by `frame`
datapath = "/Users/ygx/dat/frames/"

In [4]:
# This requires having proprocessed the data using `frame`'s
# frame.cli:preprocess-framenet
paths = data_paths(datapath)

In [5]:
# picking the right file format is half the battle
dataset = load_dataset('json', data_files=paths)

Using custom data configuration default-759eabcb88cab5b7
Reusing dataset json (/Users/ygx/.cache/huggingface/datasets/json/default-759eabcb88cab5b7/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02)


In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['frame', 'sentence', 'frame_definition'],
        num_rows: 100000
    })
})

In [7]:
dataset.data

{'train': pyarrow.Table
 frame: string
 sentence: string
 frame_definition: string}

In [8]:
show_random_examples(dataset["train"])

Unnamed: 0,frame,sentence,frame_definition
0,Familiarity,It was also the time when the region was the h...,An Entity is presented as having been seen or ...
1,Attack,The goal of the current United States-led offe...,An Assailant physically attacks a Victim (whic...
2,Feigning,"She feigned enthusiasm , but what she was real...",An Agent acts in such a way as to give the inc...
3,Body_parts,Only other white-breasted duck is short-necked...,This frame covers words for Body_part(s) (BP) ...
4,Body_description_holistic,"Stocky and friendly , he had two small flaws i...",This frame covers descriptions of an entire hu...


## Tokenizing Seq2Seq Data with T5

In [9]:
model = "t5-small"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model)

In [11]:
# The tokenizer itself returns input_ids and base attention masks
tokenizer(dataset["train"]['sentence'][0])

{'input_ids': [20439, 2925, 12, 2662, 115, 1152, 550, 3, 6, 112, 1234, 1413, 4339, 3843, 28, 385, 91, 5808, 7, 17, 7, 13, 22496, 3, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize the data for Seq2Seq
    
    Maps over all the examples in the dataset 
    to tokenize both the input framenet sentences
    and the target frame definitions.
    
    Args:
        examples: samples in the dataset
    """
    inputs = [sent for sent in examples["sentence"]]
    
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["frame_definition"], 
            max_length=max_target_length, 
            truncation=True
        )
    
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [13]:
# peek into how preprocessing looks for the first example
preprocess_function(dataset['train'][0])

{'input_ids': [[377, 1], [3, 52, 1], [3, 9, 1], [3, 29, 1], [3, 75, 1], [3, 32, 1], [1], [3, 75, 1], [3, 32, 1], [3, 29, 1], [3, 17, 1], [3, 23, 1], [3, 29, 1], [3, 76, 1], [3, 15, 1], [3, 26, 1], [1], [3, 17, 1], [3, 32, 1], [1], [3, 354, 1], [3, 9, 1], [3, 115, 1], [3, 115, 1], [3, 15, 1], [3, 52, 1], [1], [3, 9, 1], [3, 210, 1], [3, 9, 1], [3, 63, 1], [1], [3, 6, 1], [1], [3, 107, 1], [3, 23, 1], [3, 7, 1], [1], [3, 210, 1], [3, 32, 1], [3, 52, 1], [3, 26, 1], [3, 7, 1], [1], [3, 23, 1], [3, 29, 1], [3, 17, 1], [3, 15, 1], [3, 52, 1], [3, 7, 1], [3, 102, 1], [3, 15, 1], [3, 52, 1], [3, 7, 1], [3, 15, 1], [3, 26, 1], [1], [3, 210, 1], [3, 23, 1], [3, 17, 1], [3, 107, 1], [1], [3, 40, 1], [3, 23, 1], [3, 17, 1], [3, 17, 1], [3, 40, 1], [3, 15, 1], [1], [3, 32, 1], [3, 76, 1], [3, 17, 1], [3, 115, 1], [3, 76, 1], [3, 52, 1], [3, 7, 1], [3, 17, 1], [3, 7, 1], [1], [3, 32, 1], [3, 89, 1], [1], [3, 40, 1], [3, 9, 1], [3, 76, 1], [3, 122, 1], [3, 107, 1], [3, 17, 1], [3, 15, 1], [3, 52, 1]

In [14]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'frame', 'frame_definition', 'input_ids', 'labels', 'sentence'],
        num_rows: 100000
    })
})