# Visualization & Diagnostics

This notebook reproduces the visualization portion of the project: data inspection, distribution plots, correlation, dimensionality reduction (PCA/t-SNE/UMAP), feature importance and model evaluation visualizations.

Notes: run the cells top-to-bottom. Heavy operations (PCA/t-SNE/UMAP/SHAP) run on sampled data so the notebook stays interactive. Figures are saved to `results/figs/`.

In [None]:
# Imports and setup
import os
from pathlib import Path
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix, RocCurveDisplay, PrecisionRecallDisplay
import joblib

# Optional imports for UMAP/plotly/SHAP.
# These packages only enable extra figures. Any import failure (package
# missing, or present but broken) is deliberately downgraded to "feature
# unavailable" instead of aborting the notebook, hence the broad except.
try:
    import umap
except Exception:
    umap = None
try:
    import plotly.express as px
    import plotly.graph_objects as go
except Exception:
    px = None
    go = None

# Project root. Defaults to the original hard-coded location; set the
# FRUTY_ROOT environment variable to run the notebook from another checkout.
ROOT = Path(os.environ.get('FRUTY_ROOT', 'c:/ECHO/Projects/Personal_Projects/Fruty'))
DATA_PATH = ROOT / 'combine.csv'        # input dataset
RESULTS = ROOT / 'results'              # artefacts written by earlier steps
FIG_DIR = RESULTS / 'figs'              # every figure in this notebook goes here
FIG_DIR.mkdir(parents=True, exist_ok=True)
sns.set(style='whitegrid')

In [None]:
# Load permutation checks results (if available) and print top features.
# After this cell `perm_top` is always a list (possibly empty) of feature
# names; later cells fall back to other heuristics when it is empty.
checks_path = RESULTS / 'catboost_checks.json'
perm_top = []
if checks_path.exists():
    js = json.loads(checks_path.read_text())
    if not isinstance(js, dict):
        # Unexpected top-level JSON type: treat as "no information".
        js = {}
    # The key name varied between runs; take the first one holding a
    # non-empty value (a key that is present but null/empty is skipped).
    for key in ('top_perm_features', 'top_permutation_features',
                'top_features', 'top_features_perm'):
        candidate = js.get(key)
        if candidate:
            perm_top = candidate
            break
    # catboost_checks from earlier stored 'Top perm features' in summary; try fallback
    perm = js.get('permutation_importance')
    if not perm_top and perm:
        if isinstance(perm, dict):
            # assumes {feature: numeric score} -- TODO confirm against the
            # script that writes catboost_checks.json
            ranked = sorted(perm.items(), key=lambda kv: kv[1], reverse=True)
        else:
            # expect list of (feature, score), already ordered best-first
            ranked = perm
        perm_top = [pair[0] for pair in ranked][:10]
    # Normalise to a list so the slicing below and in later cells is safe.
    perm_top = list(perm_top) if isinstance(perm_top, (list, tuple)) else []
    print('Loaded catboost_checks.json, top perm features (sample):', perm_top[:10])
else:
    print('No catboost_checks.json found; top features will be inferred from model or variance.')

In [None]:
# Load final model bundle
model_path = ROOT / 'models' / 'final_detector.joblib'
if not model_path.exists():
    raise FileNotFoundError(f'Final model not found at {model_path}')
# NOTE: joblib.load unpickles arbitrary Python objects -- only load model
# files that come from a trusted source.
bundle = joblib.load(model_path)
if isinstance(bundle, dict):
    # Compare against None explicitly: some estimator objects define
    # __len__/__bool__, so an `or` chain could skip a valid model.
    clf = next(
        (bundle[k] for k in ('catboost', 'model', 'clf') if bundle.get(k) is not None),
        None,
    )
    cat_features = bundle.get('cat_features') or []
    _thr = bundle.get('threshold')
    threshold = 0.5 if _thr is None else float(_thr)
else:
    # The file holds a bare estimator rather than a dict bundle.
    clf = bundle
    cat_features = []
    threshold = 0.5
if clf is None:
    # Fail here with a clear message instead of later at predict time.
    raise ValueError(
        f"No model found in {model_path}; expected one of the keys 'catboost', 'model', 'clf'"
    )
print('Loaded model:', type(clf).__name__, 'threshold=', threshold)

In [None]:
# Utility: load a CSV and down-sample it to at most n rows
def load_sample_csv(path, n=100000, random_state=42):
    """Read the CSV at ``path`` and return at most ``n`` rows of it.

    The whole file is parsed first (no chunked or partial read). If it holds
    more than ``n`` rows, ``n`` rows are drawn with ``DataFrame.sample`` using
    ``random_state`` and the index is reset to 0..n-1; otherwise the frame is
    returned as parsed.

    Parameters:
        path: anything ``pandas.read_csv`` accepts as a file location.
        n: maximum number of rows to return.
        random_state: seed forwarded to ``DataFrame.sample``.

    Returns:
        pandas.DataFrame with ``min(n, number of rows in the file)`` rows.
    """
    df = pd.read_csv(path, low_memory=False)
    if len(df) > n:
        return df.sample(n, random_state=random_state).reset_index(drop=True)
    return df

# Load the working sample once and report how long the read took.
print('Loading sample (may take a moment)')
t0 = time.time()
df_sample = load_sample_csv(DATA_PATH, n=100000)
elapsed = round(time.time() - t0, 2)
print('Loaded sample rows=', len(df_sample), 'in', elapsed, 's')

In [None]:
# Quick data checks
# Pick the label column: an exact match on a known name wins; otherwise a
# match that ignores case and surrounding whitespace (e.g. ' Label');
# otherwise the last column of the frame.
label_candidates = ['label','target','class','attack']
label_col = next((c for c in label_candidates if c in df_sample.columns), None)
if label_col is None:
    normalized = {}
    for col in df_sample.columns:
        # setdefault keeps the first column when several normalise alike
        normalized.setdefault(str(col).strip().lower(), col)
    label_col = next((normalized[c] for c in label_candidates if c in normalized), None)
if label_col is None:
    label_col = df_sample.columns[-1]
print('Using label column:', label_col)
print('Shape:', df_sample.shape)
display(df_sample.head())
# missing value summary
print('\nMissing values (top cols):')
print(df_sample.isna().sum().sort_values(ascending=False).head(10))
# class distribution
print('\nClass counts:')
print(df_sample[label_col].value_counts().head(20))

In [None]:
# Map labels to binary (re-implement small mapping here)
def map_to_binary(yarr):
    """Collapse an arbitrary 1-D label sequence into a 0/1 integer array.

    Rules, tried in order:
      1. Text labels: if any label contains 'normal' (case-insensitive),
         those labels become 0 and every other label becomes 1. All labels
         are inspected, so a 'normal' that first appears late in the data
         is still recognised.
      2. Labels convertible to non-negative integers: the most frequent
         value becomes 0, everything else 1.
      3. Labels convertible to float: exactly 0.0 becomes 0, everything
         else 1.
      4. Otherwise: labels equal to the first element become 0, the rest 1.

    Parameters:
        yarr: 1-D array-like of labels.

    Returns:
        numpy.ndarray of ints (0/1) with one entry per input label.
    """
    y = np.array(yarr)
    # Only text-like arrays can contain the word 'normal'; the string form
    # of a number never does, so numeric arrays skip this scan.
    if y.dtype.kind in ('O', 'U', 'S'):
        is_normal = np.array(['normal' in str(v).lower() for v in y], dtype=bool)
        if is_normal.any():
            return np.where(is_normal, 0, 1).astype(int)
    try:
        yi = y.astype(int)
        # bincount rejects negative values and argmax rejects empty input;
        # both cases fall through to the float rule below.
        maj = int(np.argmax(np.bincount(yi)))
        return (yi != maj).astype(int)
    except Exception:
        # Best-effort cascade: any failure above means "not usable as
        # non-negative integers", so try the next interpretation.
        try:
            yf = y.astype(float)
            return (yf != 0.0).astype(int)
        except Exception:
            first = y[0]
            return np.array([0 if v==first else 1 for v in y], dtype=int)

# Derive the binary target for every sampled row and keep it on the frame
# under the helper column '__y', which the later cells read.
raw_labels = df_sample[label_col].values
y_sample = map_to_binary(raw_labels)
df_sample['__y'] = y_sample
distinct_targets = np.unique(y_sample)
print('Binary mapping: unique values ->', distinct_targets)

## Class / attack distribution visualizations

In [None]:
# Class balance of the binary target: a static bar chart (always written)
# plus an interactive pie chart when plotly was importable.
plt.figure(figsize=(8,4))
count_ax = sns.countplot(data=df_sample, x='__y')
count_ax.set_title('Binary class counts (sample)')
plt.savefig(FIG_DIR / 'class_counts.png', dpi=150, bbox_inches='tight')
plt.show()

if px is not None:
    pie_path = FIG_DIR / 'class_distribution_pie.html'
    try:
        fig = px.pie(df_sample, names='__y', title='Class distribution (sample)')
        fig.write_html(str(pie_path))
        print('Wrote interactive pie to', pie_path)
    except Exception as e:
        # The interactive export is optional; report and carry on.
        print('Plotly pie failed:', e)

## Feature distributions for top permutation features

In [None]:
# Choose the features to plot: the permutation-importance ranking when it is
# available, otherwise the numeric columns with the highest variance.
import re  # used only in this cell, to build safe file names

# Keep only ranked names that really exist in the sample, so a stale or
# differently-named ranking cannot raise KeyError further down.
_available_cols = set(df_sample.columns)
top_feats = [f for f in perm_top if isinstance(f, str) and f in _available_cols][:10]
if not top_feats:
    # fallback: pick numeric cols with highest variance; the target columns
    # are not features and must not be selected
    _targets = {'__y', label_col}
    numcols = [c for c in df_sample.select_dtypes(include=[np.number]).columns
               if c not in _targets]
    varr = df_sample[numcols].var().sort_values(ascending=False)
    top_feats = varr.index[:10].tolist()

print('Top features for distribution plots:', top_feats)

for f in top_feats:
    plt.figure(figsize=(8,4))
    if pd.api.types.is_numeric_dtype(df_sample[f]):
        # Infinite values cannot be used for density estimation; mask them
        # for this plot only (df_sample itself is left untouched).
        plot_df = df_sample[[f, '__y']].replace([np.inf, -np.inf], np.nan)
        sns.kdeplot(data=plot_df, x=f, hue='__y', common_norm=False, fill=True)
        plt.title(f'Density plot of {f} by class')
    else:
        sns.countplot(y=f, data=df_sample, order=df_sample[f].value_counts().index[:20])
        plt.title(f'Value counts for {f}')
    plt.tight_layout()
    # Feature names may contain spaces, slashes or other characters that are
    # not valid in a file name; collapse every such run to one underscore.
    safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', str(f).strip()).strip('_') or 'feature'
    outp = FIG_DIR / f'feature_dist_{safe_name}.png'
    plt.savefig(outp, dpi=150, bbox_inches='tight')
    plt.show()
    plt.close()  # release the figure; this loop creates one per feature

## Correlation matrix

In [None]:
# Pairwise correlation of all numeric columns (the '__y' target is kept on
# purpose so its correlation with each feature shows up in the map).
numcols = df_sample.select_dtypes(include=[np.number]).columns.tolist()
corr = df_sample[numcols].corr()
# Columns whose correlation is undefined everywhere (e.g. constant columns)
# come back as all-NaN rows/columns. The clustering behind clustermap needs
# finite values, so drop those and treat remaining gaps as "no correlation".
corr = corr.dropna(axis=0, how='all').dropna(axis=1, how='all').fillna(0.0)
if corr.shape[0] >= 2:
    # clustermap creates its own figure, so no plt.figure() beforehand.
    cluster_grid = sns.clustermap(corr, cmap='vlag', linewidths=.5, figsize=(12,12))
    cluster_fig = cluster_grid.ax_heatmap.figure
    # A figure-level title avoids landing on whichever axes was drawn last.
    cluster_fig.suptitle('Feature correlation clustermap', y=1.02)
    cluster_fig.savefig(FIG_DIR / 'correlation_clustermap.png', dpi=150, bbox_inches='tight')
else:
    print('Not enough numeric columns with defined correlation for a clustermap.')
# save top correlations to CSV
# Keep only the strict upper triangle: this removes self-correlations and
# mirrored duplicates by position, so distinct pairs that happen to share a
# value, and perfectly correlated pairs, are both retained.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_pairs = corr.abs().where(upper_mask).unstack().dropna().sort_values(ascending=False)
corr_pairs.head(20).to_csv(FIG_DIR / 'top_correlations.csv')
print('Saved top correlations to', FIG_DIR / 'top_correlations.csv')

## Pairplot of top features (small sample)

In [None]:
# Pairwise grid of the five leading features, coloured by class and capped
# at 2000 rows of the sample.
if len(df_sample) > 2000:
    pair_sample = df_sample.sample(n=2000, random_state=42)
else:
    pair_sample = df_sample
pair_columns = top_feats[:5] + ['__y']
sns.pairplot(pair_sample[pair_columns], hue='__y', plot_kws={'alpha':0.5})
plt.savefig(FIG_DIR / 'pairplot_top_features.png', dpi=150, bbox_inches='tight')
plt.show()

## Feature importance (model-based)

In [None]:
# Train a simple RandomForest on a sample to compute feature importances (fast)
feat_sample = df_sample.sample(n=50000, random_state=42) if len(df_sample)>50000 else df_sample
# The target must not be among the predictors: '__y' is the binary target
# itself and label_col is the raw label it was derived from. Leaving either
# in X lets the forest read the answer and dominate the importance ranking.
X = (
    feat_sample.select_dtypes(include=[np.number])
    .drop(columns=['__y', label_col], errors='ignore')
    .replace([np.inf, -np.inf], np.nan)  # the forest rejects infinite inputs
    .fillna(0)
)
y = feat_sample['__y']
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X, y)
imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(8,6))
imp.head(30).plot(kind='bar')
plt.title('RandomForest feature importances (sample)')
plt.tight_layout()
plt.savefig(FIG_DIR / 'rf_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

## PCA projection (2D & 3D)

In [None]:
# PCA on scaled numeric features
scaler = StandardScaler()
# Project features only: the target columns ('__y' and the raw label) would
# otherwise contribute to the components and make the classes look more
# separable than the features alone justify. Infinite values are masked
# because the scaler cannot handle them.
Xnum = (
    df_sample.select_dtypes(include=[np.number])
    .drop(columns=['__y', label_col], errors='ignore')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
Xscaled = scaler.fit_transform(Xnum)
pca = PCA(n_components=3, random_state=42)
Xp = pca.fit_transform(Xscaled)
print('Explained variance ratios (3 comps):', pca.explained_variance_ratio_)
plt.figure(figsize=(8,6))
sns.scatterplot(x=Xp[:,0], y=Xp[:,1], hue=df_sample['__y'], alpha=0.6, palette='Set1')
plt.title(f'PCA 2D (explained var: {pca.explained_variance_ratio_[:2]})')
plt.savefig(FIG_DIR / 'pca_2d.png', dpi=150, bbox_inches='tight')
plt.show()
# interactive Plotly version (if available)
if px is not None:
    try:
        fig = px.scatter_3d(x=Xp[:,0], y=Xp[:,1], z=Xp[:,2], color=df_sample['__y'].astype(str), title='PCA 3D')
        fig.write_html(str(FIG_DIR / 'pca_3d.html'))
        print('Wrote PCA 3D interactive to', FIG_DIR / 'pca_3d.html')
    except Exception as e:
        # The interactive export is optional, same policy as the pie chart.
        print('Plotly PCA 3D failed:', e)

## t-SNE and UMAP projections (subsample)

In [None]:
# t-SNE / UMAP embeddings of a subsample (at most 10k rows)
sub = df_sample.sample(n=10000, random_state=42) if len(df_sample)>10000 else df_sample
# Embed features only (drop '__y' and the raw label) and mask infinite
# values, which the scaler cannot handle.
Xsub = (
    sub.select_dtypes(include=[np.number])
    .drop(columns=['__y', label_col], errors='ignore')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
Xs = StandardScaler().fit_transform(Xsub)
t0 = time.time()
# The iteration count is left at the library default (1000, the value this
# cell used to pass) because the keyword was renamed from `n_iter` to
# `max_iter` across scikit-learn releases. Perplexity must stay below the
# number of samples.
tsne_perplexity = min(30, max(1, len(Xs) - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity, verbose=1)
emb = tsne.fit_transform(Xs)
print('t-SNE took', round(time.time()-t0,2),'s')
plt.figure(figsize=(8,6))
sns.scatterplot(x=emb[:,0], y=emb[:,1], hue=sub['__y'], alpha=0.6, palette='Set1')
plt.title('t-SNE (10k sample)')
plt.savefig(FIG_DIR / 'tsne_2d.png', dpi=150, bbox_inches='tight')
plt.show()
if umap is not None:
    reducer = umap.UMAP(n_components=2, random_state=42)
    emb_u = reducer.fit_transform(Xs)
    plt.figure(figsize=(8,6))
    sns.scatterplot(x=emb_u[:,0], y=emb_u[:,1], hue=sub['__y'], alpha=0.6, palette='Set1')
    plt.title('UMAP (10k sample)')
    plt.savefig(FIG_DIR / 'umap_2d.png', dpi=150, bbox_inches='tight')
    plt.show()

## Confusion matrix, ROC and Precision-Recall curves

In [None]:
# Use the model to predict probabilities on the sample test split (or full if available)
# Here we reuse df_sample as a proxy; in production use reserved test set.
# Build the model input. Preferred: exactly the columns the model reports it
# was trained on, in that order. Fallback: every numeric column except the
# targets ('__y' and the raw label are answers, not features).
model_feats = None
for _attr in ('feature_names_', 'feature_names_in_'):
    _names = getattr(clf, _attr, None)
    if _names is not None and len(_names) > 0:
        model_feats = list(_names)
        break
if model_feats and all(f in df_sample.columns for f in model_feats):
    X_eval = df_sample[model_feats].copy()
    _num = X_eval.select_dtypes(include=[np.number]).columns
    X_eval[_num] = X_eval[_num].fillna(0)
    # NOTE(review): non-numeric (categorical) model features are passed
    # through unchanged -- confirm this matches the training preprocessing.
else:
    X_eval = (
        df_sample.select_dtypes(include=[np.number])
        .drop(columns=['__y', label_col], errors='ignore')
        .fillna(0)
    )
try:
    proba = clf.predict_proba(X_eval)[:,1]
except Exception:
    # Some models reject DataFrames or mismatched column names; retry with
    # the bare array.
    proba = clf.predict_proba(X_eval.values)[:,1]
pred = (proba >= threshold).astype(int)
# Fix the label order so the matrix is always 2x2, even when only one class
# is present or predicted in the sample.
cm = confusion_matrix(df_sample['__y'], pred, labels=[0, 1])
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=[0, 1], yticklabels=[0, 1])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion matrix (sample)')
plt.savefig(FIG_DIR / 'confusion_matrix_sample.png', dpi=150, bbox_inches='tight')
plt.show()
fpr, tpr, _ = roc_curve(df_sample['__y'], proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
plt.plot([0,1],[0,1],'k--')  # chance diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (sample)')
plt.legend()
plt.savefig(FIG_DIR / 'roc_curve_sample.png', dpi=150, bbox_inches='tight')
plt.show()
precision, recall, _ = precision_recall_curve(df_sample['__y'], proba)
plt.figure(figsize=(6,5))
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall (sample)')
plt.savefig(FIG_DIR / 'pr_curve_sample.png', dpi=150, bbox_inches='tight')
plt.show()

## SHAP explanations (optional)

In [None]:
# SHAP is optional: any import failure just skips this section.
try:
    import shap
    shap_available = True
except Exception:
    shap_available = False
    print('shap not available. To install: conda activate fruty-catboost && pip install shap')

if shap_available:
    # compute shap on a small sample
    sample_shap = df_sample.sample(n=10000, random_state=42) if len(df_sample)>10000 else df_sample
    # Explain the model on its own inputs: use the columns the model reports
    # it was trained on when they are all present; otherwise all numeric
    # columns except the targets ('__y' and the raw label).
    shap_feats = None
    for _attr in ('feature_names_', 'feature_names_in_'):
        _names = getattr(clf, _attr, None)
        if _names is not None and len(_names) > 0:
            shap_feats = list(_names)
            break
    if shap_feats and all(f in sample_shap.columns for f in shap_feats):
        Xsh = sample_shap[shap_feats].copy()
        _num = Xsh.select_dtypes(include=[np.number]).columns
        Xsh[_num] = Xsh[_num].fillna(0)
    else:
        Xsh = (
            sample_shap.select_dtypes(include=[np.number])
            .drop(columns=['__y', label_col], errors='ignore')
            .fillna(0)
        )
    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(Xsh)
    # show=False keeps the figure open so it can be written to disk.
    shap.summary_plot(shap_values, Xsh, show=False)
    plt.savefig(FIG_DIR / 'shap_summary.png', dpi=150, bbox_inches='tight')
    plt.close()
    print('Saved SHAP summary to', FIG_DIR / 'shap_summary.png')

## Export & wrap-up
Figures saved under `results/figs/`.
Next steps: inspect top feature distributions (e.g., Destination Port) for leakage, run SHAP if available, and run full K-fold CV for final validation.