In [18]:
#imports
import joblib
import pandas as pd
import numpy as np
import torch
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer

In [19]:
# Load the augmented e-mail dataset and verify that both columns used below exist
# before selecting the text feature and the target.
df = pd.read_csv('../../data/analysis/emails_augmented.csv')
assert {'body_no_stopwords', 'label'}.issubset(df.columns), "Missing required columns."
X, y = df['body_no_stopwords'], df['label']

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Environment sanity check: print the NLTK version, the path it was imported from,
# and whether the package exposes a `data` attribute.
import nltk
print(nltk.__version__, nltk.__file__, hasattr(nltk, "data"))

3.9.1 /projectnb/rise-phishing/andrewhl/.conda/envs/phishfence/lib/python3.13/site-packages/nltk/__init__.py True


In [22]:
# Set the number of CPU threads PyTorch uses to 16, then print the value back
# to confirm the setting took effect.
# NOTE(review): 16 is hard-coded — confirm it matches the cores allotted to this job.
import torch   
torch.set_num_threads(16)
print(torch.get_num_threads())   

16


In [23]:
# Download the `punkt_tab` NLTK resource if it is not already present.
# The download call is deliberately the last expression of the cell, so its
# boolean return value is what the notebook displays as the cell result.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /usr4/spclpgm/andrewhl/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [28]:
from tqdm.auto import tqdm

In [30]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Fetch the Punkt resources needed by NLTK's sentence tokenizer; quiet=True
# suppresses the download log.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

class SBERTChunkAverageTransformer(BaseEstimator, TransformerMixin):
    """
    Embed documents with SBERT by averaging the embeddings of sentence chunks.

    Each document is split into sentences, grouped into blocks of ``chunk_size``
    sentences, and every block is split further so that it fits inside the
    encoder's effective input limit (special tokens included).  The resulting
    sub-chunks are encoded with the SentenceTransformer model and their
    embeddings are averaged into a single vector per document.

    The effective limit is the smaller of the tokenizer's ``model_max_length``
    and the SentenceTransformer's own ``max_seq_length``; the latter is what
    ``encode`` truncates to, so ignoring it would silently drop text.

    Parameters
    ----------
    model_name : str
        Name or path of the SentenceTransformer model (also used to load the
        matching Hugging Face tokenizer).
    chunk_size : int
        Number of sentences grouped into one initial chunk.
    batch_size : int
        Batch size passed to ``SentenceTransformer.encode``.
    show_progress_bar : bool
        Whether ``transform`` shows a per-document progress bar.
    device : str or None
        Device passed to ``SentenceTransformer``; ``None`` lets the library pick.

    Notes
    -----
    The model and tokenizer are loaded lazily on first use, so constructing the
    transformer is cheap.  Non-string or blank documents map to a zero vector.
    """

    def __init__(self,
                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                 chunk_size: int = 5,
                 batch_size: int = 64,
                 show_progress_bar: bool = True,
                 device: str = None):

        self.model_name        = model_name
        self.chunk_size        = chunk_size
        self.batch_size        = batch_size
        self.show_progress_bar = show_progress_bar
        self.device            = device

        # Populated lazily by _ensure_model_and_tokenizer().
        self.model             = None
        self.tokenizer         = None
        self._body_max_length  = None

    def _ensure_model_and_tokenizer(self):
        """Load the model/tokenizer on first use and compute the per-chunk token budget."""
        if self.model is None:
            self.model = SentenceTransformer(self.model_name,
                                             device=self.device)
        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        if self._body_max_length is None:
            # Number of special tokens the tokenizer wraps around any input.
            n_special = len(self.tokenizer.encode("", add_special_tokens=True))

            # encode() truncates to the model's max_seq_length, which can be
            # smaller than the tokenizer's model_max_length; honour whichever
            # is tighter so no chunk is silently cut off.
            limit = self.tokenizer.model_max_length
            model_limit = getattr(self.model, "max_seq_length", None)
            if model_limit is not None:
                limit = min(limit, model_limit)

            # Keep the budget strictly positive so range(..., step) stays valid.
            self._body_max_length = max(1, limit - n_special)

    def _zero_vector(self) -> np.ndarray:
        """Return the all-zero embedding used for documents with no usable text."""
        return np.zeros(self.model.get_sentence_embedding_dimension(),
                        dtype=np.float32)

    def fit(self, X, y=None):
        """Load the model and tokenizer; nothing is learned from ``X``. Returns ``self``."""
        self._ensure_model_and_tokenizer()
        return self

    def _split_into_token_safe_subchunks(self, text_chunk: str):
        """
        Split ``text_chunk`` into pieces that fit the token budget.

        Sentences are packed greedily: consecutive sentences are joined with a
        space while their combined token count stays within the budget.  A
        single sentence that exceeds the budget on its own is cut into
        fixed-size token windows and decoded back to text.

        Returns a list of strings (empty if the chunk has no non-blank sentence).
        """
        self._ensure_model_and_tokenizer()
        max_body = self._body_max_length

        sents = [s for s in sent_tokenize(text_chunk) if s.strip()]
        if not sents:
            return []

        out_subchunks = []
        curr_sents    = []
        curr_tokens   = 0

        for sent in sents:
            # Only the token count / ids are needed here.  For an over-long
            # sentence the tokenizer may log a "sequence length is longer than
            # the specified maximum" warning; the ids are re-split below and
            # never fed to the model in one piece.
            tok_ids  = self.tokenizer.encode(sent, add_special_tokens=False)
            n_tokens = len(tok_ids)

            if n_tokens > max_body:
                # Flush whatever was being accumulated so ordering is preserved.
                if curr_sents:
                    out_subchunks.append(" ".join(curr_sents))
                    curr_sents  = []
                    curr_tokens = 0

                # Hard-split the long sentence into windows of max_body tokens.
                for start in range(0, n_tokens, max_body):
                    piece_ids  = tok_ids[start : start + max_body]
                    piece_text = self.tokenizer.decode(
                        piece_ids, clean_up_tokenization_spaces=True
                    )
                    # Skip windows that decode to nothing; they carry no content.
                    if piece_text.strip():
                        out_subchunks.append(piece_text)
            elif curr_tokens + n_tokens <= max_body:
                curr_sents.append(sent)
                curr_tokens += n_tokens
            else:
                # Budget exhausted: emit the current group and start a new one.
                out_subchunks.append(" ".join(curr_sents))
                curr_sents  = [sent]
                curr_tokens = n_tokens

        if curr_sents:
            out_subchunks.append(" ".join(curr_sents))

        return out_subchunks

    def _chunk_and_average(self, doc: str) -> np.ndarray:
        """
        Embed one document: chunk it, encode every sub-chunk, return the mean vector.

        Non-string or blank input, and documents that yield no sub-chunks,
        return a zero vector of the model's embedding dimension.
        """
        self._ensure_model_and_tokenizer()

        # Covers NaN / None cells coming from a DataFrame column as well as "".
        if not isinstance(doc, str) or not doc.strip():
            return self._zero_vector()

        sentences = [s for s in sent_tokenize(doc) if s.strip()]
        if not sentences:
            return self._zero_vector()

        # Group sentences into blocks of chunk_size, then make each block token-safe.
        safe_chunks = []
        for i in range(0, len(sentences), self.chunk_size):
            block = " ".join(sentences[i : i + self.chunk_size]).strip()
            if block:
                safe_chunks.extend(self._split_into_token_safe_subchunks(block))

        if not safe_chunks:
            return self._zero_vector()

        embs = self.model.encode(
            safe_chunks,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            show_progress_bar=False  # per-document bar would flood the output
        )
        return embs.mean(axis=0)

    def transform(self, X):
        """
        Embed every document in ``X``.

        Returns a 2-D array with one row per document.  An empty ``X`` yields
        an array of shape ``(0, embedding_dim)`` instead of raising.
        """
        self._ensure_model_and_tokenizer()

        total = len(X) if hasattr(X, "__len__") else None
        docs_iter = tqdm(X,
                         total=total,
                         desc="Embedding documents",
                         disable=not self.show_progress_bar)
        all_embs = [self._chunk_and_average(doc) for doc in docs_iter]

        if not all_embs:
            # np.vstack([]) raises; return a correctly shaped empty matrix.
            return np.empty((0, self.model.get_sentence_embedding_dimension()),
                            dtype=np.float32)
        return np.vstack(all_embs)


# Build the embedder and run fit() once (it receives the training split),
# then embed the training and held-out splits with the same fitted instance.
sbert_chunker = SBERTChunkAverageTransformer(batch_size=64).fit(X_train)

X_train_emb = sbert_chunker.transform(X_train)
X_test_emb = sbert_chunker.transform(X_test)

Embedding documents:   0%|          | 22/65644 [00:00<24:45, 44.16it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors
Embedding documents: 100%|██████████| 65644/65644 [30:47<00:00, 35.53it/s]   
Embedding documents: 100%|██████████| 16411/16411 [07:35<00:00, 36.03it/s]


In [32]:
# Persist the embedding matrices and their label vectors with joblib so later
# steps can reload them instead of re-running the embedding cell.
import os
from joblib import dump

output_dir = '../../output/embeddings'
os.makedirs(output_dir, exist_ok=True)

for _filename, _payload in (
    ('X_train_emb.joblib', X_train_emb),
    ('X_test_emb.joblib', X_test_emb),
    ('y_train.joblib', y_train),
    ('y_test.joblib', y_test),
):
    dump(_payload, os.path.join(output_dir, _filename))

print("Embeddings saved successfully!")


Embeddings saved successfully!
