In [1]:
!git clone https://gitlab.com/shimorina/webnlg-dataset.git

Cloning into 'webnlg-dataset'...
remote: Enumerating objects: 5112, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 5112 (delta 2), reused 0 (delta 0), pack-reused 5106 (from 1)[K
Receiving objects: 100% (5112/5112), 26.09 MiB | 22.66 MiB/s, done.
Resolving deltas: 100% (4010/4010), done.
Updating files: 100% (1425/1425), done.


In [2]:
!git clone https://github.com/WebNLG/webnlg_toolkit.git
%cd webnlg_toolkit
%ls
!pip install -e .

Cloning into 'webnlg_toolkit'...
remote: Enumerating objects: 230, done.[K
remote: Counting objects: 100% (230/230), done.[K
remote: Compressing objects: 100% (188/188), done.[K
remote: Total 230 (delta 45), reused 211 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (230/230), 16.78 MiB | 16.54 MiB/s, done.
Resolving deltas: 100% (45/45), done.
/content/webnlg_toolkit
LICENSE  README.md  requirements.txt  setup.py  [0m[01;34mwebnlg_toolkit[0m/
Obtaining file:///content/webnlg_toolkit
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: webnlg_toolkit
  Running setup.py develop for webnlg_toolkit
Successfully installed webnlg_toolkit-0.0.1


In [3]:
import os
import glob
import json
import numpy as np
import pandas as pd
import torch
import re
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    AdamW,
    get_linear_schedule_with_warmup
)
!pip install datasets
from datasets import Dataset as HFDataset
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import webnlg_toolkit
from webnlg_toolkit.utils.data import load_webnlg_dataset
from webnlg_toolkit.utils.data import load_webnlg_xml

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [4]:
# Define paths
#dataset_root = "/content/drive/MyDrive/en/train/"
#test_root = "/content/drive/MyDrive/en/dev/"

dataset_root = "/content/webnlg-dataset/release_v3.0/en/train/"
test_root = "/content/webnlg-dataset/release_v3.0/en/dev/"

# Function to find dataset files
def find_dataset_files(root_folder, file_extension=".xml"):
    dataset_files = []
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if file.endswith(file_extension):
                dataset_files.append(os.path.join(subdir, file))
    return dataset_files

# Load training data
def load_all_data(root_folder):
    dataset_files = find_dataset_files(root_folder, file_extension=".xml")
    all_data = []
    for file in dataset_files:
        print(f"Loading: {file}")
        data = load_webnlg_dataset(file, task="rdf2text")
        # print(data)
        all_data.extend(data)
    return all_data

# Load training and test data
train_data = load_all_data(dataset_root)
test_data = load_all_data(test_root)

# Convert to DataFrames
train_df = pd.DataFrame(train_data, columns=["input", "output"])
test_df = pd.DataFrame(test_data, columns=["input", "output"])
print(f"Training examples: {len(train_df)}")
print(f"Test examples: {len(test_df)}")

# Display some examples
print("\nTraining examples:")
print(train_df.head(3))

print("\nTest examples:")
print(test_df.head(3))

Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/CelestialBody_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/MeanOfTransportation_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/SportsTeam_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/Airport_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/Monument_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/Athlete_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/Astronaut_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/Building_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/Artist_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/University_allSolutions.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/1triples/Food_allSolutions.