## Dataset overview

In [4]:
import json
import pandas as pd
import seaborn as sns
from pathlib import Path

# Read the per-column dtypes stored next to the CSV and pass them to pandas
# explicitly instead of letting read_csv infer a type for every column.
dtypes_text = Path("data/dtypes.json").read_text()
dtypes = json.loads(dtypes_text)
df = pd.read_csv("data/vcdb.csv", dtype=dtypes)
print(df.shape)

# Keep only the columns that hold at least one non-missing value; `df` itself
# is left untouched.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index written out with the CSV -- confirm whether it should be kept.
df_clean = df.dropna(axis=1, how="all")
print(df_clean.shape)
df_clean.head(10)


(10262, 2671)
(10262, 2603)


Unnamed: 0,action.environmental.notes,action.environmental.variety.Deterioration,action.environmental.variety.Earthquake,action.environmental.variety.EMI,action.environmental.variety.ESD,action.environmental.variety.Fire,action.environmental.variety.Flood,action.environmental.variety.Hazmat,action.environmental.variety.Humidity,action.environmental.variety.Hurricane,...,pattern.Basic Web Application Attacks,pattern.Social Engineering,pattern.System Intrusion,pattern.Denial of Service,pattern.Everything Else,actor.Multiple,Actor,victim.industry2.31_33,victim.industry2.44_45,victim.industry2.48_49
0,,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,Internal,False,False,False
1,,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,Internal,False,False,False
2,,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,External,False,False,False
3,,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,Partner,False,False,False
4,,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,External,False,False,False
5,,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,Internal,False,False,False
6,,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,Internal,False,False,False
7,,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,External,False,False,False
8,,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,External,False,False,False
9,,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,External,False,False,False


### Missing Data

### Event Counts

#### Year

In [None]:
# Histogram of incidents per year.
# discrete=True gives every year its own unit-wide bar centred on the value.
# With the default automatic binning the bin edges need not line up with whole
# years, so a single bar can merge two years or leave artificial gaps.
# Assumes "timeline.incident.year" holds whole-number years -- TODO confirm
# against data/dtypes.json.
plot = sns.histplot(data=df, x="timeline.incident.year", discrete=True)
plot.set(
    xlabel="Year",
    ylabel="Events count ($\\log_{10}$)",
    title="Yearly event count",
)

# Logarithmic y-axis so that years with few events stay visible next to years
# with many; the ticks are pinned to powers of ten and labelled with plain
# integers rather than the default exponent notation.
plot.set_yscale("log")
ticks = [1, 10, 100, 1000]
plot.set_yticks(ticks)
plot.set_yticklabels(ticks)

# Small horizontal padding so the first and last bars do not touch the frame.
plot.margins(x=0.05)
display(plot)


#### Threat actions

In [None]:
# Top-level threat-action flags are the columns named exactly "action.<Category>"
# (one dot). Deeper columns such as "action.environmental.variety.Fire" are
# excluded.
action_names = [
    column
    for column in df.columns
    if column.startswith("action.") and column.count(".") == 1
]
df_interesting = df[["timeline.incident.year"] + action_names]

# Count the rows flagged True in each action column.
# `.eq(True).sum()` yields 0 for a column that never contains True, whereas
# `value_counts()[True]` raises KeyError in that case; it also replaces the
# per-column Python loop with a single vectorised pass.
action_counts = df_interesting[action_names].eq(True).sum().tolist()

# Drop the "action." prefix for display.
short_action_names = [name.split(".")[1] for name in action_names]
results = pd.DataFrame(
    data=action_counts,
    index=short_action_names,
    columns=["count"],
).sort_values(by="count", ascending=False)

# NOTE: set_theme changes seaborn/matplotlib defaults globally, so the figure
# size also applies to every plot drawn after this cell.
sns.set_theme(rc={"figure.figsize": (8, 6)})
plot = sns.barplot(data=results, x=results.index, y="count")
plot.set(
    xlabel="Categories",
    ylabel="Events count",
    title="Category occurrence count",
)
# Tilt the category labels so neighbouring names do not overlap.
plot.tick_params(axis="x", labelrotation=45)
display(plot)