In [1]:
from datasets import load_dataset
from dotenv import load_dotenv

import os

# Read environment variables via python-dotenv before checking for the token.
load_dotenv()

# Stop early when the Hugging Face token is missing from the environment.
hf_token_is_set = "HF_TOKEN" in os.environ
assert hf_token_is_set, "HF_TOKEN is not set"

dataset_id = "ai4privacy/open-pii-masking-500k-ai4privacy"
ds = load_dataset(dataset_id)

In [2]:
# Display the loaded dataset object (splits, features, row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 464150
    })
    validation: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 116077
    })
})

In [3]:
# Inspect the first training record to see the raw fields of one sample.
ds["train"][0]

{'source_text': '20:10:26 Venanzius Höttermann Revés యొక్క వివాహం July/95 నాడు జరిగింది, Tadaలో Faizabad Road వద్ద.',
 'masked_text': '[TIME_1] [GIVENNAME_1] [SURNAME_1] యొక్క వివాహం [DATE_1] నాడు జరిగింది, [CITY_1]లో [STREET_1] వద్ద.',
 'privacy_mask': [{'label': 'TIME',
   'start': 0,
   'end': 8,
   'value': '20:10:26',
   'label_index': 1},
  {'label': 'GIVENNAME',
   'start': 9,
   'end': 18,
   'value': 'Venanzius',
   'label_index': 1},
  {'label': 'SURNAME',
   'start': 19,
   'end': 35,
   'value': 'Höttermann Revés',
   'label_index': 1},
  {'label': 'DATE',
   'start': 49,
   'end': 56,
   'value': 'July/95',
   'label_index': 1},
  {'label': 'CITY', 'start': 72, 'end': 76, 'value': 'Tada', 'label_index': 1},
  {'label': 'STREET',
   'start': 79,
   'end': 92,
   'value': 'Faizabad Road',
   'label_index': 1}],
 'split': 'train',
 'uid': 5387382,
 'language': 'te',
 'region': 'IN',
 'script': 'Telu',
 'mbert_tokens': ['20',
  ':',
  '10',
  ':',
  '26',
  'Ve',
  '##nan',
  

In [4]:
# Keep only English samples, then drop the two mBERT token columns.
english_only = ds.filter(lambda row: row["language"] == "en")
ds = english_only.remove_columns(["mbert_tokens", "mbert_token_classes"])

ds

DatasetDict({
    train: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script'],
        num_rows: 120533
    })
    validation: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script'],
        num_rows: 30160
    })
})

In [5]:
# Inspect one record of the filtered training split.
ds["train"][2000]

{'source_text': 'For verification purposes, please provide a copy of your 757861235 and LIEVE.752280.L9.175.',
 'masked_text': 'For verification purposes, please provide a copy of your [PASSPORTNUM_1] and [DRIVERLICENSENUM_1].',
 'privacy_mask': [{'label': 'PASSPORTNUM',
   'start': 57,
   'end': 66,
   'value': '757861235',
   'label_index': 1},
  {'label': 'DRIVERLICENSENUM',
   'start': 71,
   'end': 90,
   'value': 'LIEVE.752280.L9.175',
   'label_index': 1}],
 'split': 'train',
 'uid': 5956358,
 'language': 'en',
 'region': 'GB',
 'script': 'Latn'}

In [6]:
# Person-name labels of interest: SURNAME, GIVENNAME

In [7]:
# Keep samples whose masked text contains a surname or given-name placeholder.
name_markers = ("SURNAME", "GIVENNAME")
ds = ds.filter(lambda item: any(marker in item["masked_text"] for marker in name_markers))

ds

DatasetDict({
    train: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script'],
        num_rows: 85753
    })
    validation: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script'],
        num_rows: 21412
    })
})

In [8]:
import random


# Print a random sample of training examples: raw text followed by masked text.
# A dedicated, seeded generator makes the sample reproducible across re-runs
# without touching the global `random` state.
sample_rng = random.Random(42)
train_split = ds["train"]
sample_size = min(20, len(train_split))  # sampling more rows than exist would raise

for ii in sample_rng.sample(range(len(train_split)), sample_size):
    example = train_split[ii]  # fetch the row once instead of once per field
    print(example["source_text"])
    print(example["masked_text"])
    print()

Hello Kaleah Mrika, we are excited to share our new collection of handcrafted jewelry designs with you. Please join us at County Road 407 473, Willow Park to explore our latest pieces.
Hello [GIVENNAME_1], we are excited to share our new collection of handcrafted jewelry designs with you. Please join us at [STREET_1] [BUILDINGNUM_1], [CITY_1] to explore our latest pieces.

Join our Aquaponics Project and learn how to grow your own food! 
For more information, please contact Liran Stefanák at +58-78073 1585 or tamanamasarwy@hotmail.com.
Join our Aquaponics Project and learn how to grow your own food! 
For more information, please contact [GIVENNAME_1] [SURNAME_1] at [TELEPHONENUM_1] or [EMAIL_1].

Hi Oktawian Raid, could you please confirm the address Coplow Lane 4861 for delivery of your package?
Hi [GIVENNAME_1], could you please confirm the address [STREET_1] [BUILDINGNUM_1] for delivery of your package?

Esengjul is so passionate about their art, it's infectious and inspiring.
[GIVE

In [9]:
# Keep samples whose masked text contains a "SURNAME_2" or "GIVENNAME_2" placeholder.
second_person_markers = ("SURNAME_2", "GIVENNAME_2")
ds_with_two_persons = ds.filter(
    lambda item: any(marker in item["masked_text"] for marker in second_person_markers)
)

ds_with_two_persons

DatasetDict({
    train: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script'],
        num_rows: 13312
    })
    validation: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script'],
        num_rows: 3311
    })
})

In [10]:
# Keep samples whose masked text contains a "SURNAME_3" or "GIVENNAME_3" placeholder.
third_person_markers = ("SURNAME_3", "GIVENNAME_3")
ds_with_three_persons = ds.filter(
    lambda item: any(marker in item["masked_text"] for marker in third_person_markers)
)

ds_with_three_persons

DatasetDict({
    train: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script'],
        num_rows: 1634
    })
    validation: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script'],
        num_rows: 416
    })
})

In [11]:
# Inspect one training record from the three-person subset.
ds_with_three_persons["train"][130]

{'source_text': "6:41:30 PM Yurii: Rüben got anxious when Tianze revealed their new phone number. Now everyone knows it's 0170 240-8307",
 'masked_text': "[TIME_1] [GIVENNAME_3]: [GIVENNAME_2] got anxious when [GIVENNAME_1] revealed their new phone number. Now everyone knows it's [TELEPHONENUM_1]",
 'privacy_mask': [{'label': 'TIME',
   'start': 0,
   'end': 10,
   'value': '6:41:30 PM',
   'label_index': 1},
  {'label': 'GIVENNAME',
   'start': 11,
   'end': 16,
   'value': 'Yurii',
   'label_index': 3},
  {'label': 'GIVENNAME',
   'start': 18,
   'end': 23,
   'value': 'Rüben',
   'label_index': 2},
  {'label': 'GIVENNAME',
   'start': 41,
   'end': 47,
   'value': 'Tianze',
   'label_index': 1},
  {'label': 'TELEPHONENUM',
   'start': 105,
   'end': 118,
   'value': '0170 240-8307',
   'label_index': 1}],
 'split': 'train',
 'uid': 5633848,
 'language': 'en',
 'region': 'US',
 'script': 'Latn'}

In [12]:
# Check the concrete type of a single split.
type(ds_with_three_persons["validation"])

datasets.arrow_dataset.Dataset

In [13]:
from datasets import Dataset
import pandas as pd


def prepare_dataset(data: Dataset) -> pd.DataFrame:
    """Collect the person names annotated in each sample into a flat table.

    For every record, the ``SURNAME`` and ``GIVENNAME`` entries of its
    ``privacy_mask`` are grouped by ``label_index``. Each group becomes one
    name string formatted as ``"<surname> <given name>"``, using whichever
    parts are present. Names keep the order in which their index first
    appears in the mask; if a label repeats for the same index, the last
    value wins.

    Args:
        data: Iterable of records exposing ``source_text``, ``masked_text``
            and ``privacy_mask`` (a list of dicts with ``label``,
            ``label_index`` and ``value`` keys).

    Returns:
        DataFrame with columns ``text``, ``masked_text`` and ``names``
        (list of str), one row per input record. The columns are present
        even when ``data`` is empty.
    """
    # Tuple order doubles as the order of the parts inside a full name.
    name_labels = ("SURNAME", "GIVENNAME")
    columns = ["text", "masked_text", "names"]

    records = []
    for item in data:
        # label_index -> {label: value}; dicts keep first-seen order.
        persons = {}
        for mask_item in item["privacy_mask"] or []:  # tolerate a missing mask (None)
            label_class = mask_item["label"]
            if label_class not in name_labels:
                continue
            persons.setdefault(mask_item["label_index"], {})[label_class] = mask_item["value"]

        names_list = []
        for parts in persons.values():
            full_name = " ".join(parts[label] for label in name_labels if parts.get(label))
            if full_name:  # skip entries whose values are all empty
                names_list.append(full_name)

        records.append(
            {"text": item["source_text"], "masked_text": item["masked_text"], "names": names_list}
        )

    # Explicit columns keep the schema stable for empty input.
    return pd.DataFrame(records, columns=columns)


# NOTE: despite the `train_` prefix, this frame is built from the validation split.
validation_three_persons = ds_with_three_persons["validation"]
train_df_with_three_persons = prepare_dataset(validation_three_persons)

n_rows = len(train_df_with_three_persons)
print(f"Length of train_df_with_three_persons: {n_rows}")
train_df_with_three_persons.head()

Length of train_df_with_three_persons: 416


Unnamed: 0,text,masked_text,names
0,"The PalindromesGroup chat includes Salomée, Qé...",The PalindromesGroup chat includes [GIVENNAME_...,"[Salomée, Qéndresa Adrina, Vithursan]"
1,Narcis and Aiza visited the Princes Risborough...,[GIVENNAME_3] and [GIVENNAME_2] visited the [C...,"[Narcis, Aiza, Faverjon Patrick-Marc]"
2,"07:51:08 Menelaos Iouri to Chayanne: 'Hey, I n...","[TIME_1] [GIVENNAME_3] to [GIVENNAME_2]: 'Hey,...","[Menelaos Iouri, Chayanne, Sivalogini]"
3,The following people are attending the Stourbr...,The following people are attending the [CITY_1...,"[Norell, Visanu, Tancev Eduard-Ionut]"
4,Thevanesan: Hey Saliba! I need your help in dr...,[GIVENNAME_3]: Hey [GIVENNAME_2]! I need your ...,"[Thevanesan, Saliba, Dionisa Magbule]"


In [14]:
from pyrootutils import find_root

# Resolve the project root once and make sure the target folder exists, so the
# export also works when the `data/` directory has not been created yet.
project_root = find_root()
output_data_path = project_root / "data" / "ai4privacy-many-persons-validation.csv"
output_data_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: the `names` column holds lists; pandas writes each as its string representation.
train_df_with_three_persons.to_csv(output_data_path, index=False)

print(f"Saved dataset to {output_data_path.relative_to(project_root)}")

Saved dataset to data/ai4privacy-many-persons-validation.csv


In [15]:
# Compare raw and masked text of a single row.
index = 97
sample_row = train_df_with_three_persons.iloc[index]
print(sample_row.text)
print(sample_row.masked_text)

The participant list for the Scotland Yard workshop includes Sathyanarayanan Jawid, Vergil, and Andryn.
The participant list for the Scotland Yard workshop includes [GIVENNAME_3], [GIVENNAME_2], and [GIVENNAME_1].


In [20]:
version = "v1"
# Derive the folder name from `version` instead of hard-coding it, so the
# variable actually controls where the file is written.
first_iter_path = find_root() / "data" / f"ai4privacy-{version}" / "raw.md"

data_dict = train_df_with_three_persons.iloc[:50].to_dict(orient="records")

result_texts = []
for item in data_dict:
    result_texts.append(item["text"])
    result_texts.extend(f"- {name}" for name in item["names"])
    # A lone "\n" element, once joined with "\n", leaves two blank lines after
    # each record, i.e. records are separated by "\n\n\n".
    result_texts.append("\n")

out_text = "\n".join(result_texts)

# The target folder may not exist yet.
first_iter_path.parent.mkdir(parents=True, exist_ok=True)
# Names contain non-ASCII characters, so do not rely on the platform default encoding.
first_iter_path.write_text(out_text, encoding="utf-8")

11796

In [15]:
### Convert preprocessed data from markdown to csv

import pandas as pd
from pyrootutils import find_root
from pathlib import Path

# Location of the manually cleaned markdown file for this dataset version.
version = "v1"
data_dir = find_root() / "data"
cleaned_path = data_dir / f"ai4privacy-{version}" / "cleaned.md"


def parse_md_to_csv(md_path: Path) -> pd.DataFrame:
    """Parse the manually cleaned markdown file back into a tabular dataset.

    Expected layout: records are separated by two blank lines. Within a
    record, every line starting with ``-`` is a name bullet (``- <name>``)
    and all other non-empty lines form the (possibly multiline) source text.
    On a bullet line, everything from the first ``" +"`` onward is ignored.

    Note that a source-text line that itself starts with ``-`` is
    indistinguishable from a name bullet and is treated as a name.

    Args:
        md_path: Path to the markdown file to read.

    Returns:
        DataFrame with columns ``text`` (str) and ``names`` (list of str),
        one row per non-empty record. The columns are present even when the
        file contains no records.
    """
    # Names contain non-ASCII characters; read as UTF-8 regardless of the
    # platform default encoding.
    data = md_path.read_text(encoding="utf-8")

    records = []
    for item in data.split("\n\n\n"):
        text_lines = []  # source text could be multiline
        extracted_names = []

        for line in item.split("\n"):
            if not line:
                continue

            if line.startswith("-"):
                # Drop the marker and any padding after it, so "-Name" and
                # "-  Name" left over from manual editing parse like "- Name".
                bullet_body = line[1:].lstrip()
                name = bullet_body.split(" +")[0].rstrip()
                if name:
                    extracted_names.append(name)
            else:
                text_lines.append(line)

        # A trailing separator or extra blank lines produce empty chunks;
        # skip them so they do not become empty rows.
        if not text_lines and not extracted_names:
            continue

        records.append({"text": "\n".join(text_lines), "names": extracted_names})

    # Explicit columns keep the schema stable for an empty file.
    return pd.DataFrame(records, columns=["text", "names"])


# Parse the cleaned markdown and store the result next to it as a CSV file.
df = parse_md_to_csv(cleaned_path)
csv_path = cleaned_path.with_suffix(".csv")
df.to_csv(csv_path, index=False)
df

Unnamed: 0,text,names
0,"The PalindromesGroup chat includes Salomée, Qé...","[Salomée, Qéndresa Adrina, Vithursan]"
1,Narcis and Aiza visited the Princes Risborough...,"[Narcis, Aiza, Faverjon Patrick-Marc]"
2,"07:51:08 Menelaos Iouri to Chayanne: 'Hey, I n...","[Menelaos Iouri, Chayanne, Sivalogini]"
3,The following people are attending the Stourbr...,"[Norell, Visanu, Tancev Eduard-Ionut]"
4,Thevanesan: Hey Saliba! I need your help in dr...,"[Thevanesan, Saliba, Magbule Dionisa]"
5,The garden bench memorial for Yushan Marie-Edi...,"[Zaharievska Yushan Marie-Edith, Borde Kely, C..."
6,"Event attendees list: Tanea, Leondrit Olivier,...","[Tanea, Leondrit Olivier, Baschir Radomir]"
7,"Blue Planet volunteer list: Pathmajothy, Mohsi...","[Pathmajothy, Mohsine Nouriel, Miao]"
8,The following individuals have contributed to ...,"[Stoia Blenda, Kirchmayr Losmaz Elycia, Dutran..."
9,From: Aberash Bajrishe Cosmadopoulos\nTo: Sula...,"[Cosmadopoulos Aberash Bajrishe, Lorenzon Sula..."
