## Anomalies and temporal filtering

In [None]:
import json
import pandas as pd
import seaborn as sns
from pathlib import Path

# Load the dataset using the per-column dtypes stored alongside it.
dtypes = json.loads(Path("data/dtypes.json").read_text(encoding="utf-8"))
df = pd.read_csv("data/vcdb.csv", dtype=dtypes)
original_shape = df.shape

# Remove columns with only empty values.
# how="all" is required: the default (how="any") would discard every column
# that contains at least one missing value, not just the fully empty ones.
df.dropna(axis="columns", how="all", inplace=True)
no_NAN_columns_shape = df.shape

# Keep only incidents whose year lies in [since, until], both bounds included.
since = 2013
until = 2024
incident_year = df['timeline.incident.year']
# A single mask covers both bounds; rows matched by neither comparison
# (including, presumably, rows with a missing year) are left in place.
out_of_range = (incident_year < since) | (incident_year > until)
df.drop(df.loc[out_of_range].index, inplace=True)
print(f"""
      Original Shape:\t{original_shape}
      No NaN Shape:\t{no_NAN_columns_shape}
      Final Shape:\t{df.shape}""")


In [None]:
# Histogram of incidents per year, drawn on a log-scaled y axis.
# discrete=True gives one unit-wide bar centred on each year; the automatic
# binning used otherwise is not aligned to integer values, so a bar could
# merge two years or leave an artificial gap.
plot = sns.histplot(data=df, x="timeline.incident.year", discrete=True)
# NOTE(review): the title says "last ten years"; confirm it still matches the
# year range applied when the data was filtered.
plot.set(
    xlabel="Year",
    ylabel="Events count ($\\log_{10}$)",
    title="Year distribution last ten years",
)
plot.set_yscale("log")
# Label the log axis with plain counts at each power of ten.
ticks = [1, 10, 100, 1000]
plot.set_yticks(ticks)
plot.set_yticklabels(ticks)
# Small horizontal padding so the outermost bars do not touch the frame.
plot.margins(x=0.05)
display(plot)

In [None]:
# Bar chart with one bar per incident year, using a 20x8 figure size.
sns.set_theme(rc={"figure.figsize": (20, 8)})

plot = sns.countplot(data=df, x="timeline.incident.year")
plot.set_xlabel("Year")
plot.set_ylabel("Events count")
plot.set_title("Yearly event count")

# Re-apply the current tick positions before relabelling so the rotated
# labels are attached to fixed tick locations.
tick_positions = plot.get_xticks()
plot.set_xticks(tick_positions)
tick_labels = plot.get_xticklabels()
plot.set_xticklabels(tick_labels, rotation=45)
display(plot)