# EDA and Feature Engineering for BeaconHunter

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.features import add_derived_features

# Apply matplotlib's "ggplot" style sheet to the figures drawn in this notebook.
plt.style.use("ggplot")

# 1) Load training data

In [None]:
# Read the training events CSV (path is relative to the notebook's working
# directory), report its dimensions, and preview the first rows.
train_csv_path = "../data/beacon_events_train.csv"
df = pd.read_csv(train_csv_path)
print("Raw shape:", df.shape)
first_rows = df.head()
display(first_rows)

# 2) Basic EDA

In [None]:
# Column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" under the
# report.
print("\nData info:")
df.info()

# Number of missing entries in each column.
print("\nMissing values per column:")
print(df.isna().sum())

# Row count for each distinct value of the 'label' column.
print("\nLabel counts:")
print(df['label'].value_counts())

# Distribution plots

In [None]:
# Side-by-side histograms of the two raw numeric columns.
# Missing values are dropped rather than filled with 0: filling would add
# artificial zero-valued observations and inflate the first bin, distorting
# the plotted distribution.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
for ax, col in zip(axs, ('bytes_out', 'inter_event_seconds')):
    sns.histplot(df[col].dropna(), bins=50, ax=ax)
    ax.set_title(col)
plt.show()

# Boxplots grouped by label

In [None]:
# One boxplot per raw numeric column, grouped by the 'label' column.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
raw_panels = [
    ('inter_event_seconds', "inter_event_seconds by label"),
    ('bytes_out', "bytes_out by label"),
]
for ax, (col, title) in zip(axs, raw_panels):
    sns.boxplot(data=df, x='label', y=col, ax=ax)
    ax.set_title(title)
plt.show()

# Process frequency

In [None]:
# Bar chart of the 20 most frequent 'proc_name' values.
plt.figure(figsize=(12, 4))
top_procs = df['proc_name'].value_counts().head(20)
top_procs.plot.bar()
plt.title("Top proc_name values")
plt.show()

# 3) Add derived features (uses src.features.add_derived_features)

In [None]:
# Build the feature-augmented frame with the project helper imported from
# src.features, then report its dimensions and preview the first rows.
# NOTE(review): whether add_derived_features mutates `df` in place is not
# visible from this notebook — confirm against src/features.py.
df_feat = add_derived_features(df)
feat_shape = df_feat.shape
print("\nAfter adding derived features:", feat_shape)
feat_preview = df_feat.head()
display(feat_preview)


# Show descriptive statistics for the derived columns

In [None]:
# Columns expected in df_feat after feature derivation; summarise each one.
derived_cols = [
    'inter_event_seconds_filled',
    'iev_group_var',
    'port_rarity_score',
    'process_risk_score',
    'geo_risk',
]
derived_summary = df_feat[derived_cols].describe()
display(derived_summary)

# Visualize derived features vs label

In [None]:
# One boxplot per derived feature, grouped by the 'label' column.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
derived_panels = [
    ('iev_group_var', "Inter-event variability vs label"),
    ('port_rarity_score', "Port rarity vs label"),
]
for ax, (col, title) in zip(axs, derived_panels):
    sns.boxplot(data=df_feat, x='label', y=col, ax=ax)
    ax.set_title(title)
plt.show()

# Save dataset with features for later use

In [None]:
# Write the feature-augmented frame to CSV without the index column, then
# confirm the output location.
df_feat.to_csv(
    path_or_buf="../data/train_with_features.csv",
    index=False,
)
print("Saved ../data/train_with_features.csv")