## Anomalies and filtering

In [None]:
import json
import pandas as pd
import seaborn as sns
from pathlib import Path
from collections import namedtuple

# Small record pairing a human-readable label with a DataFrame shape, used to
# track how the dataset shrinks after each filtering step.
Stat = namedtuple("Stat", ["label", "value"])
stats = []

# Load the dataset, forcing the column dtypes listed in the JSON file.
dtypes = json.loads(Path("data/dtypes.json").read_text())
df = pd.read_csv("data/vcdb.csv", dtype=dtypes)
stats.append(Stat("Original Dataset Shape", df.shape))

# Discard every column that does not hold a single value.
df.dropna(axis=1, how="all", inplace=True)
stats.append(Stat("Removed Columns Shape", df.shape))

# Restrict the incidents to the [since, until] year window. A row is dropped
# only when one of the two comparisons holds for it.
since = 2013
until = 2024
incident_year = df["timeline.incident.year"]
outside_window = (incident_year < since) | (incident_year > until)
df.drop(index=df.loc[outside_window].index, inplace=True)
stats.append(Stat("Ten Years Shape", df.shape))

# Report the shape recorded after each step.
for label, shape in stats:
    print(f"{label}:\t\t\t {shape}")


In [None]:
# Histogram of incidents per year, with log-scaled counts, after the date
# filter. The x-axis range and ticks are derived from the `since`/`until`
# filter bounds so that every retained year, including the last one, gets a
# tick and a fully visible bar.
plot = sns.histplot(data=df, x="timeline.incident.year", discrete=True)
plot.set(
    xlabel="Year",
    ylabel="Events count ($\\log_{10}$)",
    title="Year distribution last ten years",
)
# With discrete=True the bars are centred on the integer year, so leave one
# year of room on both sides instead of cutting the last bar in half.
plot.set_xlim(since - 1, until + 1)
xticks = list(range(since, until + 1))
plot.set_xticks(xticks)
plot.set_xticklabels(xticks)
plot.set_yscale("log")
# Label the log axis with plain powers of ten.
yticks = [1, 10, 100, 1000]
plot.set_yticks(yticks)
plot.set_yticklabels(yticks)
plot.margins(x=0.05)
display(plot)

In [None]:
# Drop the rows whose "confirmed security incident" flag equals False.
not_confirmed = df["security_incident.Confirmed"].eq(False)
df.drop(index=df.loc[not_confirmed].index, inplace=True)
stats.append(Stat("Only Confirmed Shape", df.shape))

# Of the remaining rows, drop those whose "data disclosure: yes" flag equals
# False.
no_disclosure = df["attribute.confidentiality.data_disclosure.Yes"].eq(False)
df.drop(index=df.loc[no_disclosure].index, inplace=True)
stats.append(Stat("Only Disclosure Shape", df.shape))

# Emit one LaTeX table row (label & rows & columns) per recorded step.
for label, (n_rows, n_cols) in stats:
    print(f"{label}\t\t\t & {n_rows} & {n_cols} \\\\")
