# List of Beneficiaries, Addresses, Predicate

In [12]:
import json

DATASET_NAME = "AI_Act"
# DATASET_NAME = "DSA"
# DATASET_NAME = "GDPR"
FEATURE_TO_ANALYSE = "Addressees"
FEATURE_TO_ANALYSE = "Targets"
FEATURE_TO_ANALYSE = "ObligationTypeClassification"
# FEATURE_TO_ANALYSE = "Predicate"

DATASET_PATH = f"../data/processed/obligations_analysis/{DATASET_NAME}/obligation_analysis_system/{DATASET_NAME}.json"
dataset = json.load(open(DATASET_PATH))

print(f"Dataset size: {len(dataset)}")

Dataset size: 509


In [13]:
import re
from nltk.corpus import stopwords
from nltk import word_tokenize, WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_value(target: str):
    if not target:
        return None

    target = target.lower()  # Convert to lowercase
    words = word_tokenize(target)  # Tokenize words

    # Remove stopwords at the start and end of the phrase
    while words and words[0] in stop_words:
        words.pop(0)
    while words and words[-1] in stop_words:
        words.pop()

    # Apply lemmatization to each word
    words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words) if words else None

def split_string(s):
    # Use regex to split by ",", "or", and "and" with optional spaces
    return [part.strip() for part in re.split(r',|\bor\b|\band\b', s) if part.strip()]

In [14]:
values_list = {}

for paragraph in dataset:
    potential_deontic = paragraph["potential_deontic"]

    # No potential deontic sentences
    if len(potential_deontic) == 0:
        continue

    # Retrieve analysed sentences
    for sentence in potential_deontic:

        if "analysis" not in sentence:
            continue

        analysed_sentence = sentence["analysis"]
        llm_output = analysed_sentence["output"]

        # For each obligation found in the sentence.
        for obligation_extracted in llm_output:

            extracted_feature_list = obligation_extracted[FEATURE_TO_ANALYSE]

            if extracted_feature_list is not list:
                extracted_feature_list = [extracted_feature_list]

            if len(extracted_feature_list) > 1:
                print(f"Multiple values found for feature '{FEATURE_TO_ANALYSE}': {extracted_feature_list}'")
            # For each value in the extracted feature list.
            for feature_structure in extracted_feature_list:

                if type(feature_structure) is list:
                    feature_structure = feature_structure[0]

                # Extract the extraction method.
                extraction_method = feature_structure["extraction_method"]
                extracted_value = feature_structure["value"]
                if extracted_value is None:
                    extracted_value = "unknown"

                split_values = split_string(extracted_value.lower())
                #print(f"Extracted value: '{extracted_value}' | Split values: '{split_values}'")

                for split_value in split_values:

                    split_value = preprocess_value(split_value)
                    # When it's the first time we encounter this value, initialize its extraction methods.
                    if split_value not in values_list:
                        values_list[split_value] = {
                            "extraction_methods": {
                                "Background-Knowledge": 0,
                                "Citation": 0,
                                "Context": 0,
                                "Stated": 0,
                                "None": 0
                            }
                        }

                    if extraction_method in values_list[split_value]["extraction_methods"]:
                        values_list[split_value]["extraction_methods"][extraction_method] += 1
                    else:
                        values_list[split_value]["extraction_methods"][extraction_method] = 1

# print(json.dumps(values_list, indent=4))


TypeError: string indices must be integers

In [10]:
import pandas as pd

# Transforming JSON into a DataFrame
df = pd.DataFrame.from_dict(
    {key: value["extraction_methods"] for key, value in values_list.items()},
    orient='index'
).reset_index()


# Renaming columns
df = df.rename(columns={
    "index": FEATURE_TO_ANALYSE,
    "Background-Knowledge": "Background-Knowledge",
    "Citation": "Citation",
    "Context": "Context",
    "Stated": "Stated",
    "None": "Not found"
})

# Add "Total" column
df["Total"] = df.iloc[:, 1:].sum(axis=1)

df.sample(n=20)

Unnamed: 0,Targets,Background-Knowledge,Citation,Context,Stated,Not found,Total
828,authorisation referred to in paragraph 1,0,0,0,1,0,1
175,single,0,0,0,1,0,1
331,conformity assessment module,0,0,1,1,0,2
210,deliverable on reporting,0,0,0,1,0,1
634,testing in real world condition is in accordan...,0,0,0,1,0,1
671,subcontractor,0,0,0,2,0,2
860,independence of the provider of a high-risk ai...,0,0,0,1,0,1
541,obligation referred to in article 53,0,0,0,1,0,1
479,reason therefor,0,0,0,4,0,4
840,correction,0,0,0,1,0,1


In [11]:
import os

output_folder = f"../data/analysis/distinct_entities_listing/{DATASET_NAME}"
os.makedirs(output_folder, exist_ok=True)
output_path = f"{output_folder}/{FEATURE_TO_ANALYSE}.xlsx"

df.to_excel(output_path, index=False)

## Semantic similarity

In [6]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('all-mpnet-base-v2')

In [34]:
# Input string to compare
input_string = "provider"
input_embedding = model.encode(input_string, convert_to_tensor=True)
value_embeddings = model.encode(df[FEATURE_TO_ANALYSE].tolist(), convert_to_tensor=True)
# Compute cosine similarities
cosine_scores = util.pytorch_cos_sim(input_embedding, value_embeddings).squeeze(0)

# Attach scores to the DataFrame and sort by similarity
df["Similarity"] = cosine_scores.cpu().numpy()
top_50 = df.sort_values(by="Similarity", ascending=False)

# Print the results
top_50[[FEATURE_TO_ANALYSE, "Similarity"]]

Unnamed: 0,Addressees,Similarity
4,provider,1.000000
46,service provider,0.813695
69,provider concerned,0.720373
2,provider of intermediary service,0.625407
24,provider of hosting service,0.623186
...,...,...
79,legal personality,0.121515
66,online search engine concerned,0.118420
81,constitution,0.114118
55,head of the compliance function,0.076397
