## Descriptive Statistics 

In [None]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from helper import add_units_dates, concatenate_actions

# Read the per-column dtypes from JSON and load the dataset with them
with Path("data/dtypes.json").open() as dtypes_file:
    dtypes = json.load(dtypes_file)
df = pd.read_csv("data/vcdb.csv", dtype=dtypes)

# Columns that contain no values at all carry no information; drop them
df.dropna(axis="columns", how="all", inplace=True)

# Keep only incidents whose year lies within [since, until].
# Rows with a missing year compare as False on both sides and are kept.
since = 2013
until = 2024
incident_year = df["timeline.incident.year"]
outside_range = (incident_year < since) | (incident_year > until)
df.drop(df.loc[outside_range].index, inplace=True)

# Assemble the incident date from its year/month/day parts;
# combinations that cannot be parsed become NaT (errors="coerce")
date_parts = {
    "timeline.incident.year": "year",
    "timeline.incident.month": "month",
    "timeline.incident.day": "day",
}
renamed = df.rename(columns=date_parts)
df["timeline.incident.date"] = pd.to_datetime(
    renamed[list(date_parts.values())], errors="coerce"
)

# One date column per timeline milestone, computed row by row by the
# project helper add_units_dates(row, stage)
stages = ["discovery", "compromise", "exfiltration", "containment"]
for stage in stages:
    df[f"timeline.{stage}.date"] = df.apply(
        lambda row: add_units_dates(row, stage), axis=1
    )

# Top-level action columns are named exactly "action.<name>" (a single dot);
# the helper concatenate_actions(row, action_names) builds the combined value
action_names = [
    column
    for column in df.columns
    if column.startswith("action.") and column.count(".") == 1
]
df["Actions"] = df.apply(
    lambda row: concatenate_actions(row, action_names), axis=1
)


#### Year

In [None]:
# Plot the count of incidents per year
# value_counts() yields one row per distinct year with its number of incidents
frequency_distribution = df["timeline.incident.year"].value_counts().reset_index()
frequency_distribution.columns = ["Year", "Count"]

plot = sns.barplot(x="Year", y="Count", data=frequency_distribution)
# Axis labels must match the plotted variables: years on x, counts on y
plot.set(
    xlabel="Year",
    ylabel="Count",
    title="Incidents Count per Year",
)

plt.show()

### Frequency distribution

#### Threat Action

In [None]:
# Summary statistics (DataFrame.describe) restricted to the columns in action_names
df.loc[:, action_names].describe()

In [None]:
# Calculate frequency distribution of actions for each column
# (column-wise sum; assumes the action columns are boolean/0-1 flags — confirm
# against data/dtypes.json)
action_frequency = df[action_names].sum()

# Convert to DataFrame for better readability, most frequent action first
df_action_frequency = action_frequency.sort_values(ascending=False).reset_index()
df_action_frequency.columns = ["Action", "Count"]

# Horizontal bars: counts on x, one bar per action column on y
plot = sns.barplot(x="Count", y="Action", data=df_action_frequency)
plot.set(
    xlabel="Count",
    ylabel="Threat Action",
    title="Frequency Distribution of Threat Actions",
)

plt.show()

In [None]:
# Count how often each combined "Actions" value occurs (most frequent first)
frequency_distribution = (
    df["Actions"]
    .value_counts()
    .rename_axis("Actions")
    .reset_index(name="count")
)

# Only the 20 most frequent combinations are plotted
top_actions = frequency_distribution.head(20)
plot = sns.barplot(data=top_actions, x="count", y="Actions")
plot.set_xlabel("Count")
plot.set_ylabel("Threat Action (Top 20)")
plot.set_title("Top 20 Threat Actions by Frequency")

plt.show()

#### Industry Names

In [None]:
# Count the incidents recorded for each victim industry name (most frequent first)
frequency_distribution = (
    df["victim.industry.name"]
    .value_counts()
    .rename_axis("Industry")
    .reset_index(name="count")
)

# Horizontal bars: counts on x, one bar per industry on y
plot = sns.barplot(data=frequency_distribution, x="count", y="Industry")
plot.set_xlabel("Count")
plot.set_ylabel("Industry Name")
plot.set_title("Frequency Distribution of Industry")

plt.show()