<a href="https://colab.research.google.com/github/tam1444AH/UH-Insure-NSA/blob/main/notebooks/Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Absolute directory of the machine that produced the metrics CSV. It is
# stripped from "filename" here; later cells pass the same directory (without
# the trailing slash) as root_dir.
LOCAL_ROOT_PREFIX = "/Users/josh/SecurityAnalytics/development/"

df = pd.read_csv("output/file_metrics.csv")

# Strip the local directory only when it is a leading prefix. A plain
# substring replace would also delete the text if it happened to appear
# again deeper inside a path.
df["filename"] = df["filename"].str.removeprefix(LOCAL_ROOT_PREFIX)
df.info()

In [None]:
# Upper cut-offs: the 95th percentile of each size metric.
bytes_cutoff = df["bytes"].quantile(0.95)
lines_cutoff = df["lines"].quantile(0.95)

# Keep only rows that sit at or below both cut-offs.
within_cutoffs = df["bytes"].le(bytes_cutoff) & df["lines"].le(lines_cutoff)
df_filtered = df.loc[within_cutoffs]

fig, (ax_bytes, ax_lines) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: distribution of file size in bytes.
sns.violinplot(y=df_filtered["bytes"], color="steelblue", inner="quartile", ax=ax_bytes)
ax_bytes.set_title("Violin Plot of File Sizes (bytes, outliers removed)")
ax_bytes.set_ylabel("Bytes")

# Right panel: distribution of file size in lines.
sns.violinplot(y=df_filtered["lines"], color="darkorange", inner="quartile", ax=ax_lines)
ax_lines.set_title("Violin Plot of File Sizes (lines, outliers removed)")
ax_lines.set_ylabel("Lines")

fig.tight_layout()
plt.show()

In [None]:
# 95th-percentile cut-offs for the three line-shape metrics.
lines_cut = df["lines"].quantile(0.95)
avg_cut = df["avg_line_len"].quantile(0.95)
max_cut = df["max_line_len"].quantile(0.95)

# A row is kept only if it is at or below all three cut-offs.
within_cuts = (
    df["lines"].le(lines_cut)
    & df["avg_line_len"].le(avg_cut)
    & df["max_line_len"].le(max_cut)
)
df_filtered = df.loc[within_cuts]

# One (column, colour, title) entry per panel, drawn left to right.
violin_panels = [
    ("lines", "skyblue", "Violin Plot: Lines (95 percentile)"),
    ("avg_line_len", "lightgreen", "Violin Plot: Avg Line Length (95 percentile)"),
    ("max_line_len", "salmon", "Violin Plot: Max Line Length (95 percentile)"),
]

fig, axes = plt.subplots(1, 3, figsize=(14, 5))
for ax, (column, color, title) in zip(axes, violin_panels):
    sns.violinplot(y=df_filtered[column], color=color, inner="quartile", ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# Columns excluded from the correlation matrix.
drop_columns = ['filename','sha1','num_tokens_model', 'binary_like', 'enc_hits_unicode', 'k_shingle']

# Drop only the listed columns that actually exist. "num_tokens_model" is
# treated as optional by the filtering cell further down, so a CSV without it
# must not abort this plot with a KeyError.
present_drop_columns = [col for col in drop_columns if col in df.columns]
corr = df.drop(columns=present_drop_columns).corr(numeric_only=True)

# Create the figure only after the matrix has been computed, so a failure
# above does not leave an empty figure behind.
plt.figure(figsize=(10,8))
sns.heatmap(corr, cmap="coolwarm", annot=True, fmt=".2f", annot_kws={"size": 8}, center=0)
plt.title("Correlation Heatmap of Metrics")
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.yticks(fontsize=10)
plt.show()

In [None]:
# Display pandas' summary statistics for the metrics frame.
df.describe()

In [None]:
# Hard-filter thresholds, StarCoder-style; tune as needed. Each constant is
# compared against a like-named column of the metrics frame when the drop
# masks are built.

# --- overall size ---
MAX_BYTES        = 200_000
MAX_LINES_TOTAL  = 100_000

# --- line shape ---
MAX_LINE_AVG_LEN = 100
MAX_LINE_MAX_LEN = 1_000

# --- character content ---
MAX_NONASCII     = 0.20
MAX_HEXNUM_RATIO = 0.20

# --- encoded-data detection ---
ENC_MAX_RUN_CHARS = 1024
ENC_MAX_FRACTION  = 0.50

# --- token counts ---
MIN_TOKENS_LANG  = 40       # lower bound on language tokens (Cryptol tokenizer)
MAX_TOKENS_LANG  = 10_000   # optional upper bound on language tokens
MIN_TOKENS_MODEL = 32       # only meaningful when num_tokens_model is populated

In [None]:
# Build one boolean mask per hard-drop rule. Every mask is aligned to
# df.index and True means "this row violates the rule".
#
# Missing-value policy, as implemented below:
#   * upper-bound checks on ratios/sizes treat NaN as 0, so NaN never trips them;
#   * lower-bound checks on num_tokens_lang / num_shingles treat NaN as 0,
#     so NaN DOES trip them;
#   * the model-token gate treats NaN as +inf, so NaN never trips it.

# --- exact dedup: the first row of each sha1 is kept, later ones are flagged ---
dup_mask = df.duplicated(subset=["sha1"], keep="first")

# --- encoded data (StarCoder): long encoded run OR high encoded fraction ---
# No fillna here: comparisons against NaN evaluate to False.
enc_mask = df["enc_max_run"].gt(ENC_MAX_RUN_CHARS) | df["enc_fraction"].gt(ENC_MAX_FRACTION)

# --- long-line filters (StarCoder): any one of the three limits exceeded ---
longline_mask = (
    df["lines"].gt(MAX_LINES_TOTAL)
    | df["avg_line_len"].gt(MAX_LINE_AVG_LEN)
    | df["max_line_len"].gt(MAX_LINE_MAX_LEN)
)

# --- binary-like content ---
# fillna() alone can leave a non-boolean dtype (object, or float for a
# 0/1/NaN column, depending on how the CSV was parsed and on the pandas
# version). Cast explicitly so this mask is boolean like all the others.
binary_mask = df["binary_like"].fillna(False).astype(bool)

# --- non-ascii density ---
nonascii_mask = df["non_ascii_ratio"].fillna(0) > MAX_NONASCII

# --- size guardrail (bytes) ---
bytes_mask = df["bytes"].fillna(0) > MAX_BYTES

# --- language-token bounds ---
lang_tokens = df["num_tokens_lang"].fillna(0)
lang_small_mask = lang_tokens < MIN_TOKENS_LANG
lang_large_mask = lang_tokens > MAX_TOKENS_LANG

# --- shingles exist (needed for Jaccard) ---
no_shingles_mask = df["num_shingles"].fillna(0) <= 0

# --- numeric/hex blob concentration ---
hexnum_mask = df["hexnum_ratio"].fillna(0) > MAX_HEXNUM_RATIO

# --- model-token gate (applied only where a count is available) ---
if "num_tokens_model" in df.columns:
    # NaN -> +inf so rows without a model-token count are never flagged.
    model_small_mask = df["num_tokens_model"].fillna(np.inf) < MIN_TOKENS_MODEL
else:
    model_small_mask = pd.Series(False, index=df.index)

In [None]:
# Ordered (reason, mask) pairs. This table is the single source of truth for
# both the combined drop mask and the per-row fail reason. Order matters: when
# a row trips several rules, the earliest entry is the one reported.
REASON_MASKS = [
    ("exact_duplicate",      dup_mask),
    ("encoded_data",         enc_mask),
    ("long_lines",           longline_mask),
    ("binary_like",          binary_mask),
    ("too_much_nonascii",    nonascii_mask),
    ("too_large_bytes",      bytes_mask),
    ("too_few_lang_tokens",  lang_small_mask),
    ("too_many_lang_tokens", lang_large_mask),
    ("no_shingles",          no_shingles_mask),
    ("hexnum_blob",          hexnum_mask),
    ("too_few_model_tokens", model_small_mask),
]


def first_reason(i):
    """Return the label of the first rule that the row at position ``i`` trips.

    Kept for interactive inspection of a single row; the ``fail_reason``
    column itself is computed in bulk below.

    Args:
        i: Zero-based row position (not an index label).

    Returns:
        The reason string of the earliest ``REASON_MASKS`` entry whose mask is
        true at position ``i``, or ``"ok"`` when no rule is tripped.
    """
    for reason, mask in REASON_MASKS:
        if mask.iat[i]:
            return reason
    return "ok"


# Plain boolean arrays: np.select requires boolean conditions, and this also
# normalises any mask that arrived with a non-bool dtype.
reason_labels = [reason for reason, _ in REASON_MASKS]
reason_conditions = [mask.to_numpy(dtype=bool) for _, mask in REASON_MASKS]

# Combine all hard-drop reasons: a row is dropped if any rule is tripped.
drop_mask = pd.Series(np.logical_or.reduce(reason_conditions), index=df.index)

df = df.copy()
df["quality_ok"] = ~drop_mask
# np.select picks, per row, the label of the first true condition. This is
# the same "first rule wins" semantics as first_reason(), without a Python
# loop over every row.
df["fail_reason"] = np.select(reason_conditions, reason_labels, default="ok")

In [None]:
# Columns carried into the near-duplicate stage, grouped by kind.
dedup_cols = [
    "filename", "sha1",
    # size/lines
    "bytes", "lines", "avg_line_len", "max_line_len",
    # content/encoding
    "non_ascii_ratio", "binary_like",
    "enc_total_matched", "enc_max_run", "enc_fraction",
    "enc_hits_base64", "enc_hits_hexbytes", "enc_hits_unicode",
    # tokens/shingles
    "num_tokens_lang", "k_shingle", "num_shingles", "hexnum_ratio",
    # model tokens (optional)
    "num_tokens_model",
    # path heuristic & status
    "junk_path", "quality_ok", "fail_reason",
]

# "num_tokens_model" is optional: the model-token gate is only applied when
# the column exists. Skip it here when absent instead of failing with a
# KeyError. A missing required column still raises.
optional_cols = {"num_tokens_model"}
dedup_cols = [
    col for col in dedup_cols
    if col in df.columns or col not in optional_cols
]

# Rows that passed every hard filter, re-indexed from 0.
candidate_df = df.loc[df["quality_ok"], dedup_cols].reset_index(drop=True)

In [None]:
# Tally the outcome of the hard filters.
kept_flags = df["quality_ok"]
n_total = len(df)
n_kept = int(kept_flags.sum())
n_dropped = int((~kept_flags).sum())

print("[summary] total:", n_total)
print("[summary] kept :", n_kept)
print("[summary] dropped:", n_dropped)

# Count the dropped rows per fail reason.
print("[summary] drop reasons:")
print(df.loc[~kept_flags, "fail_reason"].value_counts())

In [None]:
# Preview the first rows of the frame that survived the hard filters.
candidate_df.head()

In [None]:
from preprocessing.similiar_process import run_from_dataframe

# Run the similarity step over the surviving candidates.
# NOTE(review): this cell used to say 'filename' must hold absolute paths.
# The load cell strips the local root from 'filename', and root_dir is passed
# here to be prepended when opening files, so relative paths appear to be
# what is expected. Confirm against run_from_dataframe.
minhash_result = run_from_dataframe(
    candidate_df,
    filename_col="filename",
    root_dir="/Users/josh/SecurityAnalytics/development",
    out_dir="minhash_outputs",
)
df_files, df_pairs, similar_files = minhash_result

In [None]:
# Preview the first value returned by run_from_dataframe.
df_files.head()

In [None]:
# Preview the second value returned by run_from_dataframe.
df_pairs.head()

In [None]:
# Display the third value returned by run_from_dataframe.
# NOTE(review): its type and size are not visible from this notebook —
# consider showing only a slice if it turns out to be large.
similar_files

In [None]:
from preprocessing.cluster_process import run_clustering

# Both frames come straight from the similiar_process step above and are
# handed over in memory.
clustering_inputs = {
    "df_files": df_files,
    "df_pairs": df_pairs,
}

df_keep, df_drop, df_clusters = run_clustering(
    **clustering_inputs,
    jaccard_keep_threshold=0.70,
    out_dir="minhash_outputs",
    # None here; pass {filename: raw_text} to enable text-derived penalties.
    content_lookup=None,
    save_outputs=True,
)

In [None]:
from preprocessing.dataset_builder import build_datasets_from_sources

# Where to read from and where to write to.
# NOTE(review): dedup_keep.csv is presumably written by run_clustering
# (save_outputs=True, same out_dir) — confirm. minhash_files.csv is the
# alternative input named in the original note.
source_options = dict(
    metrics_csv="minhash_outputs/dedup_keep.csv",
    filename_col="filename",
    root_dir="/Users/josh/SecurityAnalytics/development",  # prepended to relative filenames
    out_dir="out_datasets",
    variants="with_comments,without_comments,hybrid",
)

# Agent robustness.
agent_options = dict(
    agent_batch_size=8,     # smaller batches help on tough files
    agent_timeout_s=45,     # fail fast if a batch hangs
    max_comment_len=4000,
    decision_cache_path="out_datasets/comment_decisions_cache.jsonl",
)

# Chunking for Qwen2.5-Coder-7B (4096 ctx) with a prompt reserve.
chunking_options = dict(
    context_window_tokens=4096,
    prompt_reserve_tokens=600,  # adjust to your FT prompt template
    chunk_overlap_tokens=64,
    chars_per_token=4.0,        # conservative heuristic
)

# Progress cadence and output format.
reporting_options = dict(
    file_progress_every=20,
    save_parquet=True,
)

results = build_datasets_from_sources(
    **source_options,
    **agent_options,
    **chunking_options,
    **reporting_options,
)