In [2]:
!pip install --upgrade pip
!pip install openai-whisper fasttext torch torchvision torchaudio nltk scikit-learn

import os
import whisper
import fasttext
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook downloads up front.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Use CUDA when it is available; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')
print("Using device:", device)


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m9.6 MB/s[0m  [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25h

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cpu


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Dataset root on Google Drive (Drive must already be mounted at /content/drive).
# It must contain one sub-directory per key of `labels_map`: 'NonPhishing/' and 'Phishing/'.
data_dir = '/content/drive/MyDrive/PhishingVoiceDataset'
labels_map = {'NonPhishing': 0, 'Phishing': 1}

# Collect every .mp3 in each class folder together with its integer label.
# os.listdir() returns entries in arbitrary order, so the listing is sorted: the
# seeded train/test split further down is only repeatable if the sample order is.
filepaths, labels = [], []
for cls, idx in labels_map.items():
    folder = os.path.join(data_dir, cls)
    for f in sorted(os.listdir(folder)):
        if f.lower().endswith('.mp3'):
            filepaths.append(os.path.join(folder, f))
            labels.append(idx)

# Fail here with a clear message instead of on an empty list in a later cell.
if not filepaths:
    raise FileNotFoundError(f"No .mp3 files found under {data_dir}")

# Transcribe via Whisper: one transcript string per audio file, in `filepaths` order.
model_whisper = whisper.load_model("base")
texts = []
for path in filepaths:
    res = model_whisper.transcribe(path)
    texts.append(res['text'])

print(f"Transcribed {len(texts)} audio files.")


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 105MiB/s]


Transcribed 80 audio files.


In [8]:
import nltk
# Cell 3: Text Cleaning & Tokenization
# Extra tokenizer resource; presumably what word_tokenize needs in the installed
# NLTK version — TODO confirm.
nltk.download('punkt_tab')

# English stop words, held in a set for fast membership tests inside clean().
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean(text):
    """Normalise a transcript into a space-separated string of content words.

    The text is lower-cased and split with ``word_tokenize``; tokens that are not
    purely alphabetic, or that appear in the module-level ``stop_words`` set, are
    dropped. The surviving tokens are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Clean every transcript (same order as `texts`) and turn the labels into a NumPy array.
clean_texts = list(map(clean, texts))
labels = np.array(labels)
print("Sample cleaned text:", clean_texts[0])

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Sample cleaned text: received request update zenra account details please allow hours changes take effect


In [11]:
# Cell 4: FastText Embedding Matrix (fixed)
from huggingface_hub import hf_hub_download

# Download the official English FastText .bin from Hugging Face
# Fetch the English FastText binary from the Hugging Face Hub; hf_hub_download
# returns the local path of the file (about 7 GB according to the recorded output).
hf_repo, hf_file = "facebook/fasttext-en-vectors", "model.bin"
fasttext_bin = hf_hub_download(repo_id=hf_repo, filename=hf_file)

# Load the binary model and report how many words it knows.
ft_model = fasttext.load_model(fasttext_bin)
ft_vocab_count = len(ft_model.words)
print("Loaded FastText with vocab size:", ft_vocab_count)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.bin:   0%|          | 0.00/7.24G [00:00<?, ?B/s]

Loaded FastText with vocab size: 2000000


In [9]:
# Cell 5: Stratified train / validation / test split
# Both splits share one seed so the partition is repeatable for a given sample order.
split_seed = 42

# Step 1: hold out 30% of all samples as the test set, stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    clean_texts, labels, test_size=0.30, random_state=split_seed, stratify=labels
)

# Step 2: carve a validation set out of the remaining 70%.
# test_size=0.1765 applies to that 70%, i.e. 0.1765 * 0.70 ≈ 12.4% of the full
# dataset (10 of 80 samples in the recorded run), not 15%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1765, random_state=split_seed, stratify=y_train
)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")


Train: 46, Val: 10, Test: 24


In [12]:
# Cell 6: Vocabulary & Embedding Matrix Construction
from collections import Counter

# Count tokens over the training split only (validation/test words are not added).
counter = Counter(token for txt in X_train for token in txt.split())

# Word -> row index, numbered from 1 in first-seen order. Index 0 is left free:
# PhishDataset uses it both for padding and for words missing from this vocab.
vocab = {word: row for row, word in enumerate(counter, start=1)}
vocab_size = 1 + len(vocab)
emb_dim = 300  # assumes the loaded FastText vectors are 300-dimensional — TODO confirm

# Row i holds the FastText vector of the word with index i; row 0 stays all zeros.
emb_matrix = np.zeros((vocab_size, emb_dim), dtype=np.float32)
for word, row in vocab.items():
    emb_matrix[row] = ft_model.get_word_vector(word)

print(f"Vocab size: {vocab_size}")


Vocab size: 153


In [13]:
# Cell 7: Dataset & DataLoader Definitions
# Fixed token-sequence length: PhishDataset pads with 0 or truncates every sample
# to exactly this many ids.
max_len = 100

class PhishDataset(Dataset):
    """Map-style dataset that turns cleaned transcripts into fixed-length id tensors.

    Relies on the module-level ``vocab`` (word -> index) and ``max_len``; both are
    looked up when an item is fetched.

    Args:
        texts: sequence of whitespace-separated token strings.
        labels: sequence of numeric labels aligned with ``texts``.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        """Return ``(ids, label)``: a long tensor of ``max_len`` ids and a float32 scalar."""
        # Words that are not in the vocabulary share index 0 with the padding.
        ids = [vocab.get(word, 0) for word in self.texts[i].split()][:max_len]
        ids.extend([0] * (max_len - len(ids)))
        tokens = torch.tensor(ids, dtype=torch.long)
        target = torch.tensor(self.labels[i], dtype=torch.float32)
        return tokens, target

# One dataset per split, built from the matching text/label pair.
train_ds, val_ds, test_ds = (
    PhishDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Only the training loader shuffles; validation and test keep dataset order.
batch_size = 8
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, batch_size=batch_size)
test_loader = DataLoader(test_ds, batch_size=batch_size)


In [14]:
# Cell 8: Model Definition (CNN–BiLSTM + Attention)
class Attention(nn.Module):
    """Softmax-weighted pooling over the time axis with one learned scoring vector.

    Args:
        dim: size of the feature axis of the inputs (length of the scoring vector).
    """

    def __init__(self, dim):
        super().__init__()
        # Scoring vector, initialised from a standard normal distribution.
        self.w = nn.Parameter(torch.randn(dim))

    def forward(self, x):
        """Pool ``x`` of shape [B, T, F] into [B, F]."""
        logits = torch.matmul(x, self.w)                 # [B, T] score per time step
        weights = logits.softmax(dim=1)                  # [B, T] normalised over time
        return (x * weights.unsqueeze(-1)).sum(dim=1)    # [B, F] weighted sum

class PhishModel(nn.Module):
    """Binary classifier over token ids: embedding -> Conv1d -> BiLSTM -> attention -> MLP.

    Args:
        emb_matrix: NumPy array of shape (num_embeddings, embedding_dim). Its rows
            are copied into the embedding table, which is then frozen.
    """

    def __init__(self, emb_matrix):
        super().__init__()
        table_rows, vec_dim = emb_matrix.shape
        conv_channels, lstm_hidden, dense_width = 64, 128, 128

        # Embedding table copied from emb_matrix and excluded from training.
        self.embed = nn.Embedding(table_rows, vec_dim, padding_idx=0)
        with torch.no_grad():
            self.embed.weight.copy_(torch.from_numpy(emb_matrix))
        self.embed.weight.requires_grad_(False)

        # Local n-gram features, halved in time, then a bidirectional LSTM.
        self.conv = nn.Conv1d(vec_dim, conv_channels, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)
        self.lstm = nn.LSTM(conv_channels, lstm_hidden, bidirectional=True, batch_first=True)

        # Both LSTM directions are concatenated, hence 2 * lstm_hidden features.
        self.attn = Attention(lstm_hidden * 2)
        self.fc1 = nn.Linear(lstm_hidden * 2, dense_width)
        self.drop = nn.Dropout(0.05)
        self.fc2 = nn.Linear(dense_width, 1)

    def forward(self, x):
        """Map a [B, T] batch of token ids to a [B] tensor of sigmoid probabilities."""
        emb = self.embed(x)                               # [B, T, E]
        feats = self.conv(emb.permute(0, 2, 1)).relu()    # [B, 64, T]
        feats = self.pool(feats).permute(0, 2, 1)         # [B, T/2, 64]
        seq, _ = self.lstm(feats)                         # [B, T/2, 256]
        pooled = self.attn(seq)                           # [B, 256]
        hidden = self.drop(self.fc1(pooled).relu())
        return self.fc2(hidden).sigmoid().squeeze(1)

# Build the classifier from the embedding matrix, place it on the selected
# device, and print its layer summary.
model = PhishModel(emb_matrix)
model = model.to(device)
print(model)


PhishModel(
  (embed): Embedding(153, 300, padding_idx=0)
  (conv): Conv1d(300, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(64, 128, batch_first=True, bidirectional=True)
  (attn): Attention()
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (drop): Dropout(p=0.05, inplace=False)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)


In [15]:
# Cell 9: Optimizer, Loss, and Training Loop
# Hyper-parameters
learning_rate = 5e-4
weight_decay  = 1e-5
epochs        = 20
patience      = 3   # consecutive epochs without validation improvement before stopping

optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.BCELoss()   # the model already returns sigmoid probabilities


def _run_epoch(loader, train):
    """Run one pass over `loader` and return the sample-weighted mean loss.

    With train=True the model is in training mode and the optimizer steps after
    every batch; with train=False the model is in eval mode and gradients are off.
    """
    model.train(train)
    running = 0
    with torch.set_grad_enabled(train):
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            if train:
                optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            if train:
                loss.backward()
                optimizer.step()
            # Weight by batch size so a smaller final batch does not skew the mean.
            running += loss.item() * xb.size(0)
    return running / len(loader.dataset)


best_val_loss = float('inf')
trials = 0   # epochs since the last validation improvement

for epoch in range(1, epochs+1):
    train_loss = _run_epoch(train_loader, train=True)
    val_loss = _run_epoch(val_loader, train=False)

    print(f"Epoch {epoch:02d}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

    # Early stopping: checkpoint on every improvement, stop after `patience` misses.
    if val_loss < best_val_loss:
        best_val_loss, trials = val_loss, 0
        torch.save(model.state_dict(), 'best_model.pt')
        continue
    trials += 1
    if trials >= patience:
        print("Early stopping at epoch", epoch)
        break

# Restore the weights of the best validation epoch
model.load_state_dict(torch.load('best_model.pt'))


Epoch 01: Train Loss = 0.6940, Val Loss = 0.6931
Epoch 02: Train Loss = 0.6933, Val Loss = 0.6931
Epoch 03: Train Loss = 0.6933, Val Loss = 0.6930
Epoch 04: Train Loss = 0.6935, Val Loss = 0.6926
Epoch 05: Train Loss = 0.6920, Val Loss = 0.6910
Epoch 06: Train Loss = 0.6870, Val Loss = 0.6856
Epoch 07: Train Loss = 0.6754, Val Loss = 0.6702
Epoch 08: Train Loss = 0.6498, Val Loss = 0.6419
Epoch 09: Train Loss = 0.6015, Val Loss = 0.6000
Epoch 10: Train Loss = 0.5147, Val Loss = 0.5144
Epoch 11: Train Loss = 0.3698, Val Loss = 0.3986
Epoch 12: Train Loss = 0.2018, Val Loss = 0.2806
Epoch 13: Train Loss = 0.0723, Val Loss = 0.5780
Epoch 14: Train Loss = 0.0090, Val Loss = 0.5937
Epoch 15: Train Loss = 0.0479, Val Loss = 0.6081
Early stopping at epoch 15


<All keys matched successfully>

In [16]:
# Cell 10: Testing & Classification Report
# Score the test split: probabilities above 0.5 are counted as class 1 (Phishing).
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        probs = model(xb.to(device)).cpu().numpy()
        all_preds += (probs > 0.5).astype(int).tolist()
        all_labels += yb.tolist()

# 'AntiPhishing' is the display name for label 0 ('NonPhishing' in labels_map).
print(classification_report(all_labels, all_preds, target_names=['AntiPhishing','Phishing']))


              precision    recall  f1-score   support

AntiPhishing       0.80      1.00      0.89        12
    Phishing       1.00      0.75      0.86        12

    accuracy                           0.88        24
   macro avg       0.90      0.88      0.87        24
weighted avg       0.90      0.88      0.87        24



In [17]:
# Cell 11: Save Model & Report
weights_file = 'voice_phishing_pytorch.pt'
report_file = 'classification_report.txt'

# Persist the current model weights in the working directory.
torch.save(model.state_dict(), weights_file)

# Optional: write the test-set classification report next to the weights.
report = classification_report(all_labels, all_preds, target_names=['AntiPhishing','Phishing'])
with open(report_file,'w') as f:
    f.write(report)

# Offer both artifacts as downloads (Colab only).
from google.colab import files
for artifact in (weights_file, report_file):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
# === Reload the exported weights before evaluating ===

# 1. Read the checkpoint from the working directory, mapping tensors onto `device`,
#    and load it into the existing `model`.
checkpoint_path = 'voice_phishing_pytorch.pt'
state_dict = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(state_dict)

# 2. Make sure the model is on `device` and in eval mode. Both calls return the
#    module, so the cell still displays the model summary.
model.to(device).eval()


PhishModel(
  (embed): Embedding(153, 300, padding_idx=0)
  (conv): Conv1d(300, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(64, 128, batch_first=True, bidirectional=True)
  (attn): Attention()
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (drop): Dropout(p=0.05, inplace=False)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

In [19]:
# Repeat the test-set evaluation with the reloaded weights.
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        is_phishing = model(xb).cpu().numpy() > 0.5   # boolean decision per sample
        all_preds.extend(is_phishing.astype(int).tolist())
        all_labels.extend(yb.tolist())

print(classification_report(all_labels, all_preds, target_names=['AntiPhishing','Phishing']))


              precision    recall  f1-score   support

AntiPhishing       0.80      1.00      0.89        12
    Phishing       1.00      0.75      0.86        12

    accuracy                           0.88        24
   macro avg       0.90      0.88      0.87        24
weighted avg       0.90      0.88      0.87        24



In [5]:
# Mount Google Drive at /content/drive. The dataset directory and the model export
# directory used by other cells both live under /content/drive/MyDrive, so this
# cell has to run before them (its execution count, 5, precedes theirs even though
# it appears late in the notebook).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
# Save vocabulary, embedding matrix and weights for production use
import pickle
import shutil
import numpy as np
import torch
from pathlib import Path

# Create directory in Google Drive (Drive must already be mounted)
model_dir = '/content/drive/MyDrive/phishing_model'
Path(model_dir).mkdir(parents=True, exist_ok=True)

# Save vocabulary (word -> index dict)
vocab_path = f'{model_dir}/vocab.pkl'
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
print(f"Vocabulary saved to {vocab_path}")

# Save embedding matrix
emb_matrix_path = f'{model_dir}/emb_matrix.npy'
np.save(emb_matrix_path, emb_matrix)
print(f"Embedding matrix saved to {emb_matrix_path}")

# Model weights are already saved as voice_phishing_pytorch.pt in the working
# directory. Copy rather than move them: the local file is still there for the
# cell that reloads 'voice_phishing_pytorch.pt', and this cell can be re-run
# without failing on a missing source file.
shutil.copy2('voice_phishing_pytorch.pt', f'{model_dir}/voice_phishing_pytorch.pt')
print(f"Model weights saved to {model_dir}/voice_phishing_pytorch.pt")

Vocabulary saved to /content/drive/MyDrive/phishing_model/vocab.pkl
Embedding matrix saved to /content/drive/MyDrive/phishing_model/emb_matrix.npy
Model weights saved to /content/drive/MyDrive/phishing_model/voice_phishing_pytorch.pt


In [21]:
# Save the exact model architecture for reproduction
import json

# Every value is read off the trained `model` (and `max_len`) instead of being
# repeated as a literal, so this metadata cannot drift from the checkpoint it
# describes if the model definition is changed.
model_architecture = {
    "name": type(model).__name__,
    "embedding_dim": model.embed.embedding_dim,
    "conv_filters": model.conv.out_channels,
    "lstm_units": model.lstm.hidden_size,
    "attention_dim": model.attn.w.shape[0],   # length of the attention scoring vector
    "dense_units": model.fc1.out_features,
    "dropout_rate": model.drop.p,
    "max_length": max_len,
    "vocab_size": model.embed.num_embeddings  # rows in the embedding table (incl. index 0)
}

architecture_path = f'{model_dir}/model_architecture.json'
with open(architecture_path, 'w') as f:
    json.dump(model_architecture, f, indent=2)
print(f"Model architecture saved to {architecture_path}")

Model architecture saved to /content/drive/MyDrive/phishing_model/model_architecture.json


In [22]:
# Test that everything loads correctly
print("Testing model artifact loading...")

# Load vocabulary
# NOTE: pickle.load can execute arbitrary code; only load vocab files written by
# this notebook, never ones from an untrusted source.
with open(vocab_path, 'rb') as f:
    loaded_vocab = pickle.load(f)
print(f"✓ Vocabulary loaded: {len(loaded_vocab)} words")

# Load embedding matrix
loaded_emb_matrix = np.load(emb_matrix_path)
print(f"✓ Embedding matrix loaded: {loaded_emb_matrix.shape}")

# Load model. test_model is never moved off the CPU, so map the checkpoint to the
# CPU as well; this keeps the load working for weights exported from a GPU session.
test_model = PhishModel(loaded_emb_matrix)
test_model.load_state_dict(
    torch.load(f'{model_dir}/voice_phishing_pytorch.pt', map_location='cpu')
)
# eval() switches dropout off. torch.no_grad() alone does not, so without this
# call the smoke-test prediction below would vary from run to run.
test_model.eval()
print("✓ Model weights loaded successfully")

# Test prediction: encode the sentence exactly like PhishDataset does
# (unknown words -> 0, pad/truncate to max_len).
test_text = "urgent bank account verification required immediately"
test_tokens = [loaded_vocab.get(w, 0) for w in test_text.split()]
test_tokens = (test_tokens + [0]*max_len)[:max_len]
test_input = torch.tensor(test_tokens, dtype=torch.long).unsqueeze(0)   # [1, max_len]

with torch.no_grad():
    prediction = test_model(test_input)
    print(f"✓ Test prediction: {prediction.item():.3f}")

Testing model artifact loading...
✓ Vocabulary loaded: 152 words
✓ Embedding matrix loaded: (153, 300)


ModuleNotFoundError: No module named 'src'