In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

filename = "raw_data/train.json"

# Load the raw training documents into `data`.
# The encoding is pinned to UTF-8 (the standard JSON interchange encoding)
# instead of relying on the platform default, which is not UTF-8 on every OS
# and would garble or reject non-ASCII tokens.
with open(filename, "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# Accumulators filled while scanning every document:
#   pii_data      -- one (token, label) pair per PII token; the label has its
#                    first two characters (the tag prefix) stripped
#   pii_positions -- one [document, label, token index, document length] row
#                    per run of consecutive PII tokens (index of the run start)
#   pii_ratio     -- PII tokens per 10,000 tokens, one value per document
#   spacing       -- initialised here but not populated by this cell
pii_data = []
pii_positions = []
spacing = []
pii_ratio = []

# Iterate through the elements
for item in data:
    tokens = item["tokens"]
    labels = item["labels"]

    # State of the run of consecutive PII tokens currently being read
    current_pii = []
    current_pii_positions = []
    item_pii_data = []

    for i, (token, label) in enumerate(zip(tokens, labels)):
        if label != "O":  # Any label other than "O" marks a PII token
            current_pii.append((token, label[2:]))
            current_pii_positions.append([label[2:], i, len(tokens)])
        elif current_pii:
            # An "O" token closes the open run: keep all of its tokens, but
            # record only the position of the first token of the run.
            item_pii_data.extend(current_pii)
            pii_positions.append([item["document"]] + current_pii_positions[0])
            current_pii = []
            current_pii_positions = []

    # A run that reaches the last token is never followed by an "O", so it
    # must be flushed here; otherwise PII at the end of a document is dropped.
    if current_pii:
        item_pii_data.extend(current_pii)
        pii_positions.append([item["document"]] + current_pii_positions[0])

    # Calculate ratio of PII Tokens / 10000 Tokens.
    # A document without labels contributes 0 instead of dividing by zero.
    if labels:
        pii_ratio.append((len(item_pii_data) / len(labels)) * 10000)
    else:
        pii_ratio.append(0.0)
    pii_data.extend(item_pii_data)

In [None]:
# Histogram of the per-document PII density (PII tokens per 10,000 tokens).
# The y axis is logarithmic so sparsely populated bins remain visible.
fig, ax = plt.subplots(figsize=(4, 3))
sns.histplot(x=pii_ratio, bins=12, color="skyblue", ax=ax)
ax.set_title("Ratio of PII Tokens in each Document")
ax.set_xlabel("Number of PII Tokens / 10,000 Tokens")
ax.set_ylabel("Logarithmic Frequency")
ax.set_yscale("log")

In [None]:
# One row per PII token, then a frequency table of the labels
# (most frequent label first).
pii_df = pd.DataFrame(pii_data, columns=["Token", "Label"])
pii_count = (
    pii_df["Label"]
    .value_counts()
    .reset_index()
    .set_axis(["Label", "Frequency"], axis=1)
    .sort_values(by="Frequency", ascending=False)
)

In [None]:
# Horizontal bar chart: one bar per PII label, bar length = number of tokens
# carrying that label, drawn on a logarithmic x axis.
fig, ax = plt.subplots(figsize=(4, 3))
sns.barplot(
    data=pii_count,
    x="Frequency",
    y="Label",
    hue="Label",
    palette="pastel",
    ax=ax,
)
ax.set_xlabel("Logarithmic Frequency")
ax.set_ylabel("PII Label")
ax.set_title("Frequency of PII Labels")
plt.xticks(rotation=0)
ax.set_xscale("log")
plt.show()

In [None]:
# Relative location of each PII run: start index divided by the document
# length, so every position is comparable across documents of different size.
pii_position_normalized = [
    (pii_label, start_index / doc_length)
    for _doc_id, pii_label, start_index, doc_length in pii_positions
]
pii_position_df = pd.DataFrame(
    data=pii_position_normalized,
    columns=["Label", "Position"],
)

In [None]:
# Stacked histogram of where PII runs start inside their document
# (normalised position), split by label, with a logarithmic count axis.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(
    data=pii_position_df,
    x="Position",
    hue="Label",
    multiple="stack",
    bins=20,
    palette="pastel",
    ax=ax,
)
ax.set_title("Position of PII Token in a Document")
ax.set_xlabel("Location")
ax.set_ylabel("Frequency")
ax.set_yscale("log")
plt.show()

In [None]:
# PII Label Token Distance
# For every ordered pair of labels, collect the token distance between each
# two PII runs that occur in the same document, then average per pair.
labels = set([pii[1] for pii in pii_positions])

# Iterate the labels in sorted order: the iteration order of a set of strings
# changes between kernel restarts, which would shuffle the rows/columns of the
# resulting table (and of any plot drawn from it) from run to run.
label_distance = {
    la1: {la2: [] for la2 in sorted(labels)} for la1 in sorted(labels)
}

# Group the runs by document first so each run is only compared with runs of
# its own document, instead of scanning every run of the whole corpus.
# NOTE(review): assumes the document identifier is hashable (e.g. int/str).
positions_by_doc = {}
for doc_id, pii_label, start_index, _ in pii_positions:
    positions_by_doc.setdefault(doc_id, []).append((pii_label, start_index))

# Calculate distances between PII tokens in the same document
for doc1, label1, pos1, _ in pii_positions:
    for label2, pos2 in positions_by_doc[doc1]:
        if pos1 != pos2:  # skips the run itself
            label_distance[label1][label2].append(abs(pos1 - pos2))


label_distance_df = pd.DataFrame(label_distance)
# Label pairs that never share a document have no distances; return NaN for
# them directly instead of letting np.mean warn about an empty list.
label_avg_distance = label_distance_df.map(
    lambda x: np.mean(x) if x else np.nan
)

In [None]:
# Heatmap of the mean token distance for every pair of PII labels;
# each cell is annotated with the value rounded to a whole number.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    label_avg_distance,
    cmap="crest",
    annot=True,
    fmt=".0f",
    linewidth=0.5,
    ax=ax,
)
ax.set_title("Average Distance (Number of Tokens) Between PII Labels")
plt.show()

In [None]:
from wordcloud import WordCloud

# Build a word cloud from all PII tokens joined into one space-separated text.
# NOTE(review): the rendered image shows the raw PII tokens themselves;
# consider clearing this output before sharing the notebook.
pii_text = " ".join(pii_df["Token"])
cloud_builder = WordCloud(width=900, height=600, background_color="white")
wordcloud = cloud_builder.generate(pii_text)

# Draw the cloud as an image without axis decorations.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Length of every PII token as returned by len(), stored as a new column of
# pii_df, followed by a histogram with a logarithmic count axis.
pii_df["Token Length"] = pii_df["Token"].map(len)

fig, ax = plt.subplots(figsize=(4, 3))
sns.histplot(pii_df["Token Length"], bins=15, color="skyblue", ax=ax)
ax.set_xlabel("Token Length (in Alphabetic Characters)")
ax.set_ylabel("Frequency")
ax.set_yscale("log")
ax.set_title("Distribution of PII Token Lengths")
plt.show()