<a href="https://colab.research.google.com/github/ven-diagram/COMP61332_Coursework/blob/main/DepPath_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RelPath-SVM

This is the notebook for an SVM focused on using dependency paths and ensemble learningfor relation extraction

In [53]:
# Clone the WebNLG dataset

!git clone https://gitlab.com/shimorina/webnlg-dataset.git

Cloning into 'webnlg-dataset'...
remote: Enumerating objects: 5112, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 5112 (delta 2), reused 0 (delta 0), pack-reused 5106 (from 1)[K
Receiving objects: 100% (5112/5112), 26.09 MiB | 16.10 MiB/s, done.
Resolving deltas: 100% (4010/4010), done.
Updating files: 100% (1425/1425), done.


In [54]:
# Clone the WebNLG toolkit

!git clone https://github.com/WebNLG/webnlg_toolkit.git
%cd webnlg_toolkit
!pip install -e .

Cloning into 'webnlg_toolkit'...
remote: Enumerating objects: 230, done.[K
remote: Counting objects: 100% (230/230), done.[K
remote: Compressing objects: 100% (188/188), done.[K
remote: Total 230 (delta 45), reused 211 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (230/230), 16.78 MiB | 6.96 MiB/s, done.
Resolving deltas: 100% (45/45), done.
/content/webnlg_toolkit/webnlg_toolkit/webnlg_toolkit
Obtaining file:///content/webnlg_toolkit/webnlg_toolkit/webnlg_toolkit
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: webnlg_toolkit
  Attempting uninstall: webnlg_toolkit
    Found existing installation: webnlg_toolkit 0.0.1
    Uninstalling webnlg_toolkit-0.0.1:
      Successfully uninstalled webnlg_toolkit-0.0.1
  Running setup.py develop for webnlg_toolkit
Successfully installed webnlg_toolkit-0.0.1


In [55]:
# Useful imports

import os
import glob
import json
import numpy as np
import pandas as pd
import torch
import re
import spacy

from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split

!pip install datasets
from datasets import Dataset as HFDataset
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import webnlg_toolkit
from webnlg_toolkit.utils.data import load_webnlg_dataset
from webnlg_toolkit.utils.data import load_webnlg_xml




# Pre-processing the data

The next section will focus on pre-processing the data and getting it in the form required by extracting the sentences, relations and entities into a training and test dataframe.

In [56]:
# Define paths
dataset_root = "/content/webnlg-dataset/release_v3.0/en/train/"
test_root = "/content/webnlg-dataset/release_v3.0/en/dev/"

# Function to find dataset files
def find_dataset_files(root_folder, file_extension=".xml"):
    dataset_files = []
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if file.endswith(file_extension):
                dataset_files.append(os.path.join(subdir, file))
    return dataset_files

# Load training data
def load_all_data(root_folder):
    dataset_files = find_dataset_files(root_folder, file_extension=".xml")
    all_data = []
    for file in dataset_files:
        print(f"Loading: {file}")
        data = load_webnlg_dataset(file, task="rdf2text")
        # print(data)
        all_data.extend(data)
    return all_data

# Load training and test data
train_data = load_all_data(dataset_root)
test_data = load_all_data(test_root)

# Convert to DataFrames
train_df = pd.DataFrame(train_data, columns=["input", "output"])
test_df = pd.DataFrame(test_data, columns=["input", "output"])
print(f"Training examples: {len(train_df)}")
print(f"Test examples: {len(test_df)}")

# Display some examples
print("\nTraining examples:")
print(train_df.head(3))

print("\nTest examples:")
print(test_df.head(3))

Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/University.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/Company.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/ComicsCharacter.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/SportsTeam.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/Artist.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/Airport.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/CelestialBody.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/Monument.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/Politician.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/MeanOfTransportation.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/Athlete.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/5triples/City.xml
Loading: /content/webnlg-dataset/release_v3.0/en/train/

In [57]:
# Extract the relations from the file in the format <P> relation <O>
def extract_relations(input_text):
    relations = []
    matches = re.findall(r'<P>\s(.*?)\s<O>', input_text)
    return matches

# Apply relation extraction to create a new column with relation types
train_df['relations'] = train_df['input'].apply(extract_relations)
test_df['relations'] = test_df['input'].apply(extract_relations)

print(train_df['relations'])

0                 [nickname, rector, city, country, state]
1                 [nickname, rector, city, country, state]
2                 [nickname, rector, city, country, state]
3        [country, academic staff size, state, has to i...
4        [country, academic staff size, state, has to i...
                               ...                        
35421    [mission, alma mater, birth place, backup pilo...
35422    [mission, alma mater, birth place, backup pilo...
35423    [selected by nasa, nationality, status, birth ...
35424    [selected by nasa, nationality, status, birth ...
35425    [selected by nasa, nationality, status, birth ...
Name: relations, Length: 35426, dtype: object


In [58]:
# Extracts the entities from an input in the format <S> entity1 <P> relation <O> entity2
def extract_entities(input_text):
    entity_pairs = []
    matches = re.findall(r'<S>\s(.*?)\s<P>.*?<O>\s(.*?)$', input_text)
    return matches

# Apply entity extraction to create a new column with entity pairs
train_df['entity_pairs'] = train_df['input'].apply(extract_entities)
test_df['entity_pairs'] = test_df['input'].apply(extract_entities)

print(train_df['entity_pairs'])

0             [(1 Decembrie 1918 University, Alba)]
1             [(1 Decembrie 1918 University, Alba)]
2             [(1 Decembrie 1918 University, Alba)]
3        [(AWH Engineering College, Kuttikkattoor)]
4        [(AWH Engineering College, Kuttikkattoor)]
                            ...                    
35421                      [(William Anders, 1963)]
35422                      [(William Anders, 1963)]
35423           [(William Anders, AFIT, M.S. 1962)]
35424           [(William Anders, AFIT, M.S. 1962)]
35425           [(William Anders, AFIT, M.S. 1962)]
Name: entity_pairs, Length: 35426, dtype: object


In [59]:
# Check we have loaded the correct amount of relations
all_relations = []
for relations in train_df['relations']:
    all_relations.extend(relations)
unique_relations = sorted(list(set(all_relations)))

print(f"\nNumber of unique relation types: {len(unique_relations)}")
print("Example relation types:", unique_relations[:5])


Number of unique relation types: 372
Example relation types: [' l c c n_number', ' national register of historic places reference number', '1st runway length feet', '1st runway length metre', '1st runway number']


# Extracting dependency paths

We extract the dependency path of the entities using spaCys English NER, parser, tagger and tokeniser.

In [60]:
# Load a natural language processor using spaCy
nlp = spacy.load("en_core_web_sm")

# Extract the dependency path of entity1 and entity2
def extract_dependency_path(input_text, entity1, entity2):
    doc = nlp(input_text)
    dep_path = []

    for token in doc:
        if token.text in [entity1, entity2]:
            dep_path.append(token.dep_)

    return " ".join(dep_path)

train_df["dep_path"] = train_df.apply(lambda row: extract_dependency_path(row["output"], row["entity_pairs"][0][0], row["entity_pairs"][0][1]), axis=1)
test_df['dep_path'] = test_df.apply(lambda row: extract_dependency_path(row["input"], row["entity_pairs"][0][0], row["entity_pairs"][0][1]), axis=1)

# Prepare the training and test datasets

We now add context to the input sentences such as the position of their entities and the dependency paths calculated. The relations are their corresponding labels.

In [61]:
# Create a simplified dataset for SVM
def prepare_data(df):
    X = []
    y = []

    # Add the sentences their respective relations to the datasets X and y
    for _, row in df.iterrows():
        if len(row['entity_pairs']) > 0 and len(row['relations']) > 0:
            entity1, entity2 = row['entity_pairs'][0]
            relation = row['relations'][0]
            sentence = row['input']

            # Highlight entities to preserve context
            sentence = sentence.replace(entity1, f"<E1> {entity1} </E1>")
            sentence = sentence.replace(entity2, f"<E2> {entity2} </E2>")

            # Concatenate on the dependency path to the end of the sentence to add further context to the sentence
            sentence += row["dep_path"]

            X.append(sentence)
            y.append(relation)

    return X, y

# Prepare data
X_train, y_train = prepare_data(train_df)
X_test, y_test = prepare_data(test_df)

print(f"SVM training examples: {len(X_train)}")
print(f"SVM test examples: {len(X_test)}")

SVM training examples: 35426
SVM test examples: 4464


# Fit the pipeline

We use a voting classifer to find the best svm given the context of the sentence. We use a TfidVectorizer to vectorize the main sentence that is fed, and use the countVectorizer to target the dependency paths.

In [62]:
# Define three seperate support vector classifiers with different kernels
spatial_svm = SVC(kernel='linear')
temporal_svm = SVC(kernel='rbf')
causal_svm = SVC(kernel='poly')

# Define a voting Classifier with hard voting that will output the class with the highest majority of votes
ensemble_svm = VotingClassifier(estimators=[
    ('spatial', spatial_svm),
    ('temporal', temporal_svm),
    ('causal', causal_svm)
], voting='hard')

# Create SVM pipeline
svm_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tfidf_text', TfidfVectorizer(max_features=5000)),  # Main sentence text
        ('count_dep', CountVectorizer()),  # Dependency paths (TF-IDF)
    ])),
    ('svm_ensemble', ensemble_svm)
])

In [63]:
# Train SVM
print("Training SVM model...")
svm_pipeline.fit(X_train, y_train)

Training SVM model...


# Evaluate the SVM

We use the f1-score to evaluate the SVM. other statistics are provided to verify the model is running as expected.

In [64]:
# Evaluate SVM
svm_preds = svm_pipeline.predict(X_test)
svm_accuracy = (svm_preds == y_test).mean()
svm_f1 = f1_score(y_test, svm_preds, average='weighted')

print(f"SVM Accuracy: {svm_accuracy:.4f}")
print(f"SVM F1 Score: {svm_f1:.4f}")

SVM Accuracy: 0.8324
SVM F1 Score: 0.8155


In [65]:
# Classification report for SVM
print("\nSVM Classification Report:")
print(classification_report(y_test, svm_preds))


SVM Classification Report:
                                                   precision    recall  f1-score   support

                                   l c c n_number       0.80      1.00      0.89         8
                          1st runway surface type       0.60      1.00      0.75         3
                          2nd runway surface type       1.00      1.00      1.00         3
                          3rd runway surface type       1.00      1.00      1.00         8
                          4th runway surface type       0.00      0.00      0.00         2
                                     abbreviation       0.65      1.00      0.78        20
                              academic discipline       0.73      0.62      0.67        13
                              academic staff size       0.85      1.00      0.92        17
                            active years end date       1.00      1.00      1.00         1
                          active years start date       1.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Real-time inference for SVM relation extraction

A user may input two entities and the output will be the the given relation that the svm has extracted. The user may pick a random entity pairs from the WebNLG test set, or choose their own.

In [66]:
# Function for SVM real-time inference
def svm_extract_relation(entity1, entity2):
    # Create input text
    text = f"{entity1} {entity2}"

    # Make prediction
    prediction = svm_pipeline.predict([text])

    return prediction

In [67]:
# Load example dataset
example_data = load_webnlg_dataset("/content/webnlg-dataset/release_v3.0/en/test/rdf-to-text-generation-test-data-without-refs-en.xml", task="rdf2text")
example_df = pd.DataFrame(example_data, columns=["input", "output"])
example_df['entity_pairs'] = test_df['input'].apply(extract_entities)

# Collect all unique enitity pairs
example_df_entity_pairs = set(tuple(x) for x in example_df['entity_pairs'])
example_df_entity_pairs = list(example_df_entity_pairs)

# Function that lets user choosea pair of entities from example dataset
def example_entity_pair(num):
    if num < 1 or num > len(example_df_entity_pairs):
        raise ValueError(f"Invalid index: {num}. Must be between 1 and {len(test_df['entity_pairs'])}.")

    entity1, entity2 = example_df_entity_pairs[num - 1][0]
    return entity1, entity2

# Total number of possible example entity pairs
print(len(example_df_entity_pairs))

1127


In [69]:
# User may input a number here to get entity pairs
entity1, entity2 = example_entity_pair(8)

# Or uncomment here to manually input their own enitity pairs
# entity1 = "MotorSport Vision"
# entity2 = "Fawkham"

print(f"Entity 1: {entity1}\nEntity 2: {entity2}")
svm_relation = svm_extract_relation(entity1, entity2)
print(f"Extracted relation: {svm_relation}")

Entity 1: Antares (rocket)
Entity 2: 2014-10-28
Extracted relation: ['maiden flight']
