In [None]:
!pip install "modin[all]" 
!pip install swifter
!pip install swifter[notebook]
!pip install swifter[groupby]

In [None]:
import modin.pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
# df = pd.read_csv("/kaggle/input/phishing-email-dataset/phishing_email.csv")
df = pd.read_excel("/kaggle/input/phishing-email-vietnamese-v2-excel/phishing_email_with_translation_clean_small.xlsx")
df.info()

# Word count per email. Non-string cells (e.g. NaN from an empty spreadsheet
# cell) count as zero words instead of raising AttributeError on .split().
df['length'] = df['translation'].apply(
    lambda text: len(text.split()) if isinstance(text, str) else 0
)

sns.set_style("whitegrid")

# Display names, listed in ascending order of the label value.
# NOTE(review): assumes exactly two label values that sort as ham first and
# spam second (e.g. 0/1) — confirm against the dataset.
CLASS_NAMES = ['ham (not spam)', 'spam']


def _save_and_show(filename):
    """Lay out the current figure, write it to `filename`, display it, then close it."""
    plt.tight_layout()
    # Save before show(): some backends discard the figure once it is shown.
    plt.savefig(filename)
    plt.show()
    plt.close()


def plot_label_share(data, filename):
    """Pie chart of the share of each label; saved to `filename` and displayed."""
    # value_counts() orders by frequency, so sort by label value to keep the
    # wedges aligned with CLASS_NAMES regardless of which class is larger.
    counts = data['label'].value_counts().sort_index()
    plt.figure(figsize=(8, 6))
    plt.pie(counts, labels=CLASS_NAMES, autopct='%1.1f%%')
    plt.title('Tỷ lệ các loại Email', fontsize=14)
    _save_and_show(filename)


def plot_length_by_label(data, filename):
    """Violin plot of the per-email word count grouped by label."""
    plt.figure(figsize=(8, 6))
    sns.violinplot(x='label', y='length', data=data, palette='viridis')
    plt.title('Phân bố số lượng từ trong Email theo nhãn', fontsize=14)
    plt.xlabel('Nhãn', fontsize=12)
    plt.ylabel('Số lượng từ trong Email', fontsize=12)
    _save_and_show(filename)


def plot_label_counts(data, filename):
    """Bar chart of the number of emails per label, with the count printed on each bar."""
    plt.figure(figsize=(8, 6))
    ax = sns.countplot(x='label', data=data, palette='viridis')
    for container in ax.containers:
        ax.bar_label(container)
    plt.title('Phân bố các loại Email', fontsize=14)
    plt.xlabel('Nhãn', fontsize=12)
    plt.ylabel('Số lượng Email', fontsize=12)
    plt.xticks(ticks=[0, 1], labels=CLASS_NAMES)
    _save_and_show(filename)


# Each chart is drawn once, saved as a PNG and displayed inline.
plot_label_share(df, 'email_piechart.png')
plot_length_by_label(df, 'email_length_distribution.png')
plot_label_counts(df, 'email_distribution.png')

In [None]:
import os
import re
import unicodedata
import gc
import json
import random
import joblib

from joblib import Parallel, delayed
from collections import Counter

import modin.pandas as pd
import swifter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# ---- Model / training hyperparameters ----
EMBEDDING_DIM     = 200      # size of the Word2Vec vectors and of each nn.Embedding row
HIDDEN_DIM        = 128      # LSTM hidden size per direction
NUM_HEADS         = 4        # heads of the multi-head attention layer
NUM_FILTERS       = 200      # Conv1d output channels
KERNEL_SIZES      = [2]      # Conv1d kernel widths (the model's forward() only applies the first)
DROPOUT           = 0.2
LR                = 1e-3
BATCH_SIZE        = 16
NUM_EPOCHS        = 30
MAX_LEN           = 256      # token ids per example after padding / truncation
PAD_TOKEN         = "<pad>"
UNK_TOKEN         = "<unk>"
TEST_SPLIT_RATIO  = 0.2      # fraction of examples held out by random_split
FREEZE_EMBEDDINGS = False
PATIENCE          = 5        # epochs without validation-F1 improvement before early stopping
SEED              = 42

# Seed every RNG the pipeline draws from (random_split, DataLoader shuffling,
# weight initialisation) so the split and the training run can be repeated.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

BEST_MODEL_FILENAME = 'best_email_classifier.pt'
ARTIFACTS_DIR       = 'artifacts'
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# ---- Word2Vec settings ----
W2V_WINDOW    = 15
W2V_MIN_COUNT = 10
W2V_WORKERS   = os.cpu_count()   # used as the joblib n_jobs for tokenization
W2V_RUNS      = 1                # NOTE(review): not referenced below; the script passes runs=2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# Recent NLTK releases load the tokenizer models from 'punkt_tab'; on older
# releases this request simply fails without raising.
nltk.download('punkt_tab', quiet=True)

def normalize_text(text):
    """Lower-case `text` and reduce it to space-separated ASCII letters.

    The text is NFKC-normalized and lower-cased, every character that is not
    a-z or whitespace becomes a space, and whitespace runs collapse to a
    single space with the ends trimmed.

    Args:
        text: value to clean; anything that is not a `str` yields "".

    Returns:
        The cleaned string (possibly empty).
    """
    if isinstance(text, str):
        folded = unicodedata.normalize('NFKC', text).lower()
        letters_only = re.sub(r'[^a-z\s]', ' ', folded)
        return re.sub(r'\s+', ' ', letters_only).strip()
    return ""

# Shared WordNet lemmatizer instance (not referenced by the functions visible in this cell).
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words extended with email header, URL and greeting
# boilerplate tokens.
stop_words = {
    *stopwords.words('english'),
    'subject', 're', 'fw', 'fwd', 'sent', 'from', 'to', 'cc', 'bcc',
    'http', 'https', 'www', 'com', 'org', 'net', 'am', 'pm',
    'dear', 'hello', 'hi', 'thanks', 'best', 'regards',
    'please', 'mail', 'email', 'message', 'list', 'address', 'nbsp',
}

def preprocess_data(filepath, max_len):
    """Read the CSV at `filepath` and encode it as fixed-length token-id sequences.

    Rows with a missing 'text_combined' or 'label' value are dropped. Each
    remaining text is tokenized with NLTK's `word_tokenize` (no lower-casing,
    stop-word removal or lemmatization is applied), mapped to integer ids and
    padded or truncated to exactly `max_len` ids.

    Args:
        filepath: path of a CSV file with 'text_combined' and 'label' columns.
        max_len: number of ids in every encoded sequence.

    Returns:
        A 6-tuple `(X, Y, toks_for_w2v, vocab, label_map, idx_to_label)`:
        X is a list of id lists, Y the matching list of class indices,
        toks_for_w2v the tokenized texts, vocab a token -> id dict
        (PAD_TOKEN is 0, UNK_TOKEN is 1), label_map a label-string -> index
        dict built from the sorted label strings, and idx_to_label its inverse.
    """
    print("Start preprocessing data...")
    df = pd.read_csv(filepath)
    # Also drop rows without a label: astype(str) below would otherwise turn
    # them into a spurious 'nan' class.
    df = df.dropna(subset=['text_combined', 'label']).reset_index(drop=True)
    gc.collect()

    def tokenize_and_lemmatize(text):
        # Name kept as in the original; only tokenization happens here.
        # str() guards against non-string cells (e.g. a purely numeric body).
        return word_tokenize(str(text))

    print(df['text_combined'])
    texts = df['text_combined'].tolist()
    sentences = Parallel(n_jobs=W2V_WORKERS, backend="threading")(
        delayed(tokenize_and_lemmatize)(text)
        for text in tqdm(texts, desc="Tokenizing texts", total=len(texts))
    )
    toks_for_w2v = sentences
    raw_labels = df['label'].astype(str).str.strip().tolist()
    print("Preprocessing completed.")

    # Reserve ids 0 and 1 first, then number the remaining tokens in order of
    # first appearance. Taking len(vocab) as the next id keeps the ids
    # contiguous even if a corpus token happens to equal a special token.
    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    for sent in sentences:
        for tok in sent:
            if tok not in vocab:
                vocab[tok] = len(vocab)
    pad_id, unk_id = vocab[PAD_TOKEN], vocab[UNK_TOKEN]

    unique_labels = sorted(set(raw_labels))
    label_map    = {lbl: i for i, lbl in enumerate(unique_labels)}
    idx_to_label = {i: lbl for lbl, i in label_map.items()}

    X, Y = [], []
    for toks, lbl in zip(sentences, raw_labels):
        ids = [vocab.get(t, unk_id) for t in toks[:max_len]]
        # A non-positive multiplier yields an empty list, so full-length
        # sequences are left untouched.
        ids += [pad_id] * (max_len - len(ids))
        X.append(ids)
        Y.append(label_map[lbl])
    return X, Y, toks_for_w2v, vocab, label_map, idx_to_label

def train_word2vec_avg(tokenized_sentences, embedding_dim, window, min_count, runs=1):
    """Train skip-gram Word2Vec `runs` times and return the averaged vectors.

    Args:
        tokenized_sentences: iterable of token lists (materialized to a list).
        embedding_dim: dimensionality of the word vectors.
        window: Word2Vec context window size.
        min_count: tokens seen fewer times than this are left out of the model.
        runs: number of independent trainings to average; must be >= 1.

    Returns:
        A gensim `KeyedVectors` holding one averaged vector per token kept by
        the first run.

    Raises:
        ValueError: if `runs` is smaller than 1.

    NOTE(review): each run learns its own embedding space, so averaging runs
    is only meaningful if those spaces are comparable — confirm this is the
    intended behaviour.
    """
    if runs < 1:
        raise ValueError(f"runs must be >= 1, got {runs}")
    print("\nStart Word2Vec training...")
    tokenized_sentences = list(tokenized_sentences)
    sum_vec, keys = None, None
    # os.cpu_count() may return None when the count cannot be determined.
    cpu_cores = os.cpu_count() or 1
    for _ in tqdm(range(runs), desc="Word2Vec runs"):
        model = Word2Vec(
            sentences=tokenized_sentences,
            vector_size=embedding_dim,
            window=window,
            min_count=min_count,
            workers=cpu_cores,
            sg=1,
            epochs=5,
            batch_words=10000
        )
        if sum_vec is None:
            keys = list(model.wv.index_to_key)
            sum_vec = model.wv.vectors.astype(np.float32)
        else:
            # Look the rows up by key instead of relying on row position, so
            # the sum stays aligned even if a later run orders its vocabulary
            # differently.
            sum_vec += model.wv[keys]
    avg_vec = sum_vec / runs
    kv = KeyedVectors(vector_size=embedding_dim)
    kv.add_vectors(keys, avg_vec)
    print("Word2Vec training completed.")
    return kv

def create_embedding_matrix(kv, vocab, embedding_dim):
    """Build the initial embedding weights for `vocab` from the vectors in `kv`.

    Args:
        kv: mapping-like object supporting `token in kv` and `kv[token]`.
        vocab: token -> row index dict; indices are used as row numbers of a
            `len(vocab)`-row matrix.
        embedding_dim: number of columns of the matrix.

    Returns:
        A float32 tensor of shape (len(vocab), embedding_dim). Rows of tokens
        that are absent from `kv` stay all-zero.
    """
    print("Creating embedding matrix...")
    weights = np.zeros((len(vocab), embedding_dim))
    covered = [(row, token) for token, row in vocab.items() if token in kv]
    for row, token in covered:
        weights[row] = kv[token]
    print(f"Initialized {len(covered)}/{len(vocab)} tokens.")
    return torch.tensor(weights, dtype=torch.float)

class SentimentDataset(Dataset):
    """Map-style dataset pairing encoded sequences `X` with class indices `Y`."""

    def __init__(self, X, Y):
        # Both containers are stored as given and indexed positionally.
        self.X, self.Y = X, Y

    def __len__(self):
        """Number of examples (the length of `X`)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return `(ids, label)` for example `idx`, both as int64 tensors."""
        ids = torch.tensor(self.X[idx], dtype=torch.long)
        label = torch.tensor(self.Y[idx], dtype=torch.long)
        return ids, label

class CNNBiLSTM_MHA(nn.Module):
    """Embedding -> Conv1d -> BiLSTM -> multi-head self-attention -> masked mean -> linear.

    One Conv1d is created per entry of `kernel_sizes`, but `forward` only
    applies the first of them.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: embedding width (input channels of the convolutions).
        num_filters: output channels of each convolution (LSTM input size).
        kernel_sizes: kernel widths of the convolutions.
        hidden_dim: LSTM hidden size per direction.
        num_heads: attention heads over the 2 * hidden_dim LSTM output.
        output_dim: number of output logits.
        dropout: dropout probability (also used inside the attention layer).
        pad_idx: token id treated as padding.
        pretrained_matrix: optional tensor copied into the embedding weights.
        freeze_embeddings: when a pretrained matrix is given, True disables
            gradient updates of the embedding weights.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, kernel_sizes,
                 hidden_dim, num_heads, output_dim, dropout, pad_idx,
                 pretrained_matrix=None, freeze_embeddings=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        if pretrained_matrix is None:
            print("Training embeddings from scratch.")
        else:
            print("Loading pretrained embeddings...")
            self.embedding.weight.data.copy_(pretrained_matrix)
            self.embedding.weight.requires_grad = not freeze_embeddings

        bidirectional_dim = hidden_dim * 2
        self.convs = nn.ModuleList(
            [nn.Conv1d(embedding_dim, num_filters, width) for width in kernel_sizes]
        )
        self.lstm = nn.LSTM(num_filters, hidden_dim, batch_first=True, bidirectional=True)
        self.attn = nn.MultiheadAttention(
            embed_dim=bidirectional_dim, num_heads=num_heads,
            dropout=dropout, batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(bidirectional_dim, output_dim)
        self.pad_idx = pad_idx

    def forward(self, text):
        """Return logits of shape (batch, output_dim) for a (batch, seq) tensor of token ids."""
        pad_mask = text == self.pad_idx
        embedded = self.dropout(self.embedding(text))
        # Conv1d expects channels before the sequence axis; swap back afterwards.
        conv_in = embedded.permute(0, 2, 1)
        conv_out = torch.relu(self.convs[0](conv_in)).permute(0, 2, 1)
        seq, _ = self.lstm(self.dropout(conv_out))

        # The convolution can change the sequence length, so bring the padding
        # mask to the same length: extend with "padded" entries or cut the tail.
        gap = seq.size(1) - pad_mask.size(1)
        if gap > 0:
            filler = pad_mask.new_ones(pad_mask.size(0), gap)
            pad_mask = torch.cat([pad_mask, filler], dim=1)
        elif gap < 0:
            pad_mask = pad_mask[:, :seq.size(1)]

        attended, _ = self.attn(seq, seq, seq, key_padding_mask=pad_mask)
        attended = attended.masked_fill(pad_mask.unsqueeze(-1), 0.0)
        # Mean over non-padded positions; clamp avoids dividing by zero when
        # every position of an example is padding.
        valid_steps = (~pad_mask).sum(1).clamp(min=1).unsqueeze(1)
        pooled = attended.sum(1) / valid_steps
        return self.fc(self.dropout(pooled))

def train_model(model, loader, optimizer, criterion):
    """Train `model` for one pass over `loader`.

    Args:
        model: network called as `model(batch_inputs)` to produce logits.
        loader: iterable of `(inputs, targets)` batches.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as `criterion(logits, targets)`.

    Returns:
        `(avg_loss, acc, f1)`: the mean of the per-batch losses, the accuracy
        and the macro-averaged F1 over all predictions made during the pass.
    """
    model.train()
    running_loss = 0
    pred_chunks, target_chunks = [], []
    for inputs, targets in tqdm(loader, desc="Training batches"):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        scores = model(inputs)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        pred_chunks.append(scores.argmax(dim=1).detach().cpu().numpy())
        target_chunks.append(targets.cpu().numpy())

    mean_loss = running_loss / len(loader)
    predictions = np.concatenate(pred_chunks)
    references = np.concatenate(target_chunks)
    return (
        mean_loss,
        accuracy_score(references, predictions),
        f1_score(references, predictions, average='macro', zero_division=0),
    )

def evaluate_model(model, loader, criterion, idx_to_label):
    """Evaluate `model` on `loader` and print a per-class report.

    Args:
        model: network called as `model(batch_inputs)` to produce logits.
        loader: iterable of `(inputs, targets)` batches.
        criterion: loss called as `criterion(logits, targets)`.
        idx_to_label: dict mapping class index 0..n-1 to its display name.

    Returns:
        `(avg_loss, acc, f1)`: the mean of the per-batch losses, the accuracy
        and the macro-averaged F1 over the whole loader.
    """
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for Xb, yb in tqdm(loader, desc="Evaluation batches"):
            Xb, yb = Xb.to(device), yb.to(device)
            logits = model(Xb)
            total_loss += criterion(logits, yb).item()
            all_preds.append(logits.argmax(dim=1).cpu().numpy())
            all_labels.append(yb.cpu().numpy())
    avg_loss = total_loss / len(loader)
    y_pred   = np.concatenate(all_preds)
    y_true   = np.concatenate(all_labels)
    acc      = accuracy_score(y_true, y_pred)
    f1       = f1_score(y_true, y_pred, average='macro', zero_division=0)

    # Pass the full list of class ids explicitly: without `labels`, the report
    # infers the classes from the data and raises when a class is missing
    # from this split, because the number of target_names no longer matches.
    class_ids = list(range(len(idx_to_label)))
    print("\n--- Classification Report ---")
    print(classification_report(
        y_true, y_pred,
        labels=class_ids,
        target_names=[idx_to_label[i] for i in class_ids],
        zero_division=0, digits=4
    ))
    return avg_loss, acc, f1

def save_data_artifacts(vocab, label_map, idx_to_label, cv, tv, emb_mat, X, Y, kv, artifacts_dir=ARTIFACTS_DIR):
    """Write every preprocessing artifact into `artifacts_dir`.

    All objects except `kv` are serialized with joblib, one file each
    (`X` and `Y` share 'dataset_XY.joblib' as a tuple); `kv` is written with
    its own `save` method. The directory is created when missing.
    """
    os.makedirs(artifacts_dir, exist_ok=True)
    print("Saving data artifacts...")
    joblib_payloads = (
        ('vocab.joblib', vocab),
        ('label_map.joblib', label_map),
        ('idx_to_label.joblib', idx_to_label),
        ('count_vectorizer.joblib', cv),
        ('tfidf_vectorizer.joblib', tv),
        ('embedding_matrix.joblib', emb_mat),
        ('dataset_XY.joblib', (X, Y)),
    )
    for filename, payload in joblib_payloads:
        joblib.dump(payload, os.path.join(artifacts_dir, filename))
    kv.save(os.path.join(artifacts_dir, 'word2vec_avg.kv'))
    print(f"✅ Data artifacts saved to {artifacts_dir}/")

def load_data_artifacts(artifacts_dir=ARTIFACTS_DIR):
    """Read back the artifacts written by `save_data_artifacts`.

    Returns:
        `(X, Y, vocab, label_map, idx_to_label, cv, tv, emb_mat, kv)`.

    NOTE: joblib.load unpickles its input, which can execute arbitrary code;
    only point this at artifact directories you created yourself.
    """
    print("Loading data artifacts...")

    def _read(filename):
        # Deserialize one joblib file from the artifacts directory.
        return joblib.load(os.path.join(artifacts_dir, filename))

    vocab = _read('vocab.joblib')
    label_map = _read('label_map.joblib')
    idx_to_label = _read('idx_to_label.joblib')
    cv = _read('count_vectorizer.joblib')
    tv = _read('tfidf_vectorizer.joblib')
    emb_mat = _read('embedding_matrix.joblib')
    X, Y = _read('dataset_XY.joblib')
    # The word vectors are memory-mapped read-only instead of copied into RAM.
    kv = KeyedVectors.load(os.path.join(artifacts_dir, 'word2vec_avg.kv'), mmap='r')
    print(f"✅ Data artifacts loaded from {artifacts_dir}/")
    return X, Y, vocab, label_map, idx_to_label, cv, tv, emb_mat, kv

# ---- Build the dataset artifacts, or reuse the cached ones ----
data_file = os.path.join(ARTIFACTS_DIR, 'dataset_XY.joblib')
if os.path.exists(data_file):
    X, Y, vocab, label_map, idx_to_label, cv, tv, emb_mat, kv = load_data_artifacts()
else:
    # preprocess_data returns six values; unpacking eight names from it
    # raised ValueError on every fresh run.
    X, Y, toks, vocab, label_map, idx_to_label = preprocess_data(
        "/kaggle/input/phishing-email-dataset/phishing_email.csv", MAX_LEN
    )
    # No count / TF-IDF vectorizer is fitted by this pipeline; the
    # placeholders keep the artifact layout that save/load expect.
    cv, tv = None, None
    # NOTE(review): runs=2 is hard-coded although W2V_RUNS is defined above.
    kv      = train_word2vec_avg(toks, EMBEDDING_DIM, W2V_WINDOW, W2V_MIN_COUNT, runs=2)
    emb_mat = create_embedding_matrix(kv, vocab, EMBEDDING_DIM)
    save_data_artifacts(vocab, label_map, idx_to_label, cv, tv, emb_mat, X, Y, kv)

# ---- Train / held-out split ----
# NOTE(review): the held-out split drives early stopping AND the final report,
# so the final numbers are optimistic; a separate test split would fix that.
print("Preparing data loaders...")
dataset   = SentimentDataset(X, Y)
test_size = int(len(dataset) * TEST_SPLIT_RATIO)
train_size= len(dataset) - test_size
train_ds, test_ds = (
    random_split(dataset, [train_size, test_size]) if test_size>0 else (dataset, None)
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE) if test_ds else None

# ---- Model, optimizer, loss ----
print(f"Embedding matrix shape: {tuple(emb_mat.shape)}")
model = CNNBiLSTM_MHA(
    vocab_size=len(vocab), embedding_dim=EMBEDDING_DIM,
    num_filters=NUM_FILTERS, kernel_sizes=KERNEL_SIZES,
    hidden_dim=HIDDEN_DIM, num_heads=NUM_HEADS,
    output_dim=len(label_map), dropout=DROPOUT,
    pad_idx=vocab[PAD_TOKEN], pretrained_matrix=emb_mat,
    freeze_embeddings=FREEZE_EMBEDDINGS
).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

# ---- Training loop with early stopping on validation macro-F1 ----
best_model_path = os.path.join(ARTIFACTS_DIR, BEST_MODEL_FILENAME)
checkpoint_saved = False   # True once this run has written a checkpoint
best_f1 = 0.0
epochs_no_improve = 0
print("Starting training...")
for epoch in tqdm(range(NUM_EPOCHS), desc="Epochs"):
    tr_loss, tr_acc, tr_f1 = train_model(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} — loss: {tr_loss:.4f} — acc: {tr_acc:.4f} — f1: {tr_f1:.4f}")
    if test_loader:
        val_loss, val_acc, val_f1 = evaluate_model(model, test_loader, criterion, idx_to_label)
        print(f"Validation — loss: {val_loss:.4f} — acc: {val_acc:.4f} — f1: {val_f1:.4f}\n")
        if val_f1 > best_f1:
            best_f1 = val_f1
            epochs_no_improve = 0
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
            print(f"✅ Model state saved to {BEST_MODEL_FILENAME}")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"Early stopping at epoch {epoch+1}.")
                break

# Report and predict with the best weights of this run rather than with
# whatever the last (possibly worse) epoch left in memory.
if checkpoint_saved:
    model.load_state_dict(torch.load(best_model_path, map_location=device))

# ---- Final evaluation on the held-out split ----
if test_loader:
    all_preds = []
    all_labels = []
    model.eval()
    with torch.no_grad():
        for texts, labels in test_loader:
            logits = model(texts.to(device))
            preds = logits.argmax(dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    conf_matrix = confusion_matrix(all_labels, all_preds)
    print(classification_report(all_labels, all_preds, digits=4))
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                 xticklabels=list(label_map.keys()), yticklabels=list(label_map.keys()))
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# ---- Single-sample prediction ----
sample = "Congratulations! You've won a free ticket. Click here..."
# Tokenize exactly as the training data was tokenized (word_tokenize, original
# casing); lower-casing and whitespace splitting produced tokens such as
# "congratulations!" that can never be in the vocabulary.
toks   = word_tokenize(sample)
nums   = [vocab.get(t, vocab[UNK_TOKEN]) for t in toks]
nums   = nums[:MAX_LEN] + [vocab[PAD_TOKEN]]*(MAX_LEN-len(nums))
model.eval()
with torch.no_grad():
    logits = model(torch.tensor([nums], dtype=torch.long).to(device))
    pred   = logits.argmax(dim=1).item()
print(f"Sample prediction: {idx_to_label[pred]}")

In [None]:
!pip install "modin[all]" 
!pip install swifter
!pip install swifter[notebook]
!pip install swifter[groupby]

In [None]:
import os
import re
import unicodedata
import gc
import json
import random
import joblib

from joblib import Parallel, delayed
from collections import Counter

import modin.pandas as pd
import swifter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# ---- Model / training hyperparameters ----
EMBEDDING_DIM     = 200      # size of the Word2Vec vectors and of each nn.Embedding row
HIDDEN_DIM        = 128      # LSTM hidden size per direction
NUM_HEADS         = 4        # heads of the multi-head attention layer
NUM_FILTERS       = 200      # Conv1d output channels
KERNEL_SIZES      = [2]      # Conv1d kernel widths (the model's forward() only applies the first)
DROPOUT           = 0.2
LR                = 1e-4
BATCH_SIZE        = 8
NUM_EPOCHS        = 30
MAX_LEN           = 256      # token ids per example after padding / truncation
PAD_TOKEN         = "<pad>"
UNK_TOKEN         = "<unk>"
TEST_SPLIT_RATIO  = 0.2      # fraction of examples held out by random_split
FREEZE_EMBEDDINGS = False
PATIENCE          = 5        # epochs without validation-F1 improvement before early stopping
SEED              = 42

# Seed every RNG the pipeline draws from (random_split, DataLoader shuffling,
# weight initialisation) so the split and the training run can be repeated.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

BEST_MODEL_FILENAME = 'best_email_classifier.pt'
ARTIFACTS_DIR       = 'artifacts_lstm_cnn_bilingual'
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# ---- Word2Vec settings ----
W2V_WINDOW    = 15
W2V_MIN_COUNT = 10
W2V_WORKERS   = os.cpu_count()   # used as the joblib n_jobs for tokenization
W2V_RUNS      = 1                # NOTE(review): not referenced below; the script passes runs=2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# Recent NLTK releases load the tokenizer models from 'punkt_tab'; on older
# releases this request simply fails without raising.
nltk.download('punkt_tab', quiet=True)

def normalize_text(text):
    """Lower-case `text` and reduce it to space-separated letters of any script.

    The text is NFKC-normalized and lower-cased; every run of characters that
    are not letters (punctuation, digits, underscores, whitespace, symbols)
    becomes a single space and the ends are trimmed. Unlike an ASCII-only
    `[a-z]` filter, this keeps accented letters, so Vietnamese words are not
    broken apart. For pure-ASCII input the result is the same as with the
    ASCII-only filter.

    Args:
        text: value to clean; anything that is not a `str` yields "".

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize('NFKC', text).lower()
    # [\W\d_] matches everything except letters: \W covers non-word characters
    # (including whitespace), \d and _ remove the remaining word characters
    # that are not letters.
    text = re.sub(r'[\W\d_]+', ' ', text)
    return text.strip()

# Shared WordNet lemmatizer instance (not referenced by the functions visible in this cell).
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words extended with email header, URL and greeting
# boilerplate tokens.
stop_words = {
    *stopwords.words('english'),
    'subject', 're', 'fw', 'fwd', 'sent', 'from', 'to', 'cc', 'bcc',
    'http', 'https', 'www', 'com', 'org', 'net', 'am', 'pm',
    'dear', 'hello', 'hi', 'thanks', 'best', 'regards',
    'please', 'mail', 'email', 'message', 'list', 'address', 'nbsp',
}

def preprocess_data(filepath, max_len):
    """Read the Excel file at `filepath` and encode it as fixed-length token-id sequences.

    Rows with a missing 'translation' or 'label' value are dropped. Each
    remaining text is tokenized with NLTK's `word_tokenize` (no lower-casing,
    stop-word removal or lemmatization is applied), mapped to integer ids and
    padded or truncated to exactly `max_len` ids.

    Args:
        filepath: path of an Excel file with 'translation' and 'label' columns.
        max_len: number of ids in every encoded sequence.

    Returns:
        A 6-tuple `(X, Y, toks_for_w2v, vocab, label_map, idx_to_label)`:
        X is a list of id lists, Y the matching list of class indices,
        toks_for_w2v the tokenized texts, vocab a token -> id dict
        (PAD_TOKEN is 0, UNK_TOKEN is 1), label_map a label-string -> index
        dict built from the sorted label strings, and idx_to_label its inverse.
    """
    print("Start preprocessing data...")
    df = pd.read_excel(filepath)
    # Also drop rows without a label: astype(str) below would otherwise turn
    # them into a spurious 'nan' class.
    df = df.dropna(subset=['translation', 'label']).reset_index(drop=True)
    gc.collect()

    def tokenize_and_lemmatize(text):
        # Name kept as in the original; only tokenization happens here.
        # str() guards against non-string cells: spreadsheet cells holding
        # only a number are read back as int/float, not str.
        return word_tokenize(str(text))

    print(df['translation'])
    texts = df['translation'].tolist()
    sentences = Parallel(n_jobs=W2V_WORKERS, backend="threading")(
        delayed(tokenize_and_lemmatize)(text)
        for text in tqdm(texts, desc="Tokenizing texts", total=len(texts))
    )
    toks_for_w2v = sentences
    raw_labels = df['label'].astype(str).str.strip().tolist()
    print("Preprocessing completed.")

    # Reserve ids 0 and 1 first, then number the remaining tokens in order of
    # first appearance. Taking len(vocab) as the next id keeps the ids
    # contiguous even if a corpus token happens to equal a special token.
    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    for sent in sentences:
        for tok in sent:
            if tok not in vocab:
                vocab[tok] = len(vocab)
    pad_id, unk_id = vocab[PAD_TOKEN], vocab[UNK_TOKEN]

    unique_labels = sorted(set(raw_labels))
    label_map    = {lbl: i for i, lbl in enumerate(unique_labels)}
    idx_to_label = {i: lbl for lbl, i in label_map.items()}

    X, Y = [], []
    for toks, lbl in zip(sentences, raw_labels):
        ids = [vocab.get(t, unk_id) for t in toks[:max_len]]
        # A non-positive multiplier yields an empty list, so full-length
        # sequences are left untouched.
        ids += [pad_id] * (max_len - len(ids))
        X.append(ids)
        Y.append(label_map[lbl])
    return X, Y, toks_for_w2v, vocab, label_map, idx_to_label

def train_word2vec_avg(tokenized_sentences, embedding_dim, window, min_count, runs=1):
    """Train skip-gram Word2Vec `runs` times and return the averaged vectors.

    Args:
        tokenized_sentences: iterable of token lists (materialized to a list).
        embedding_dim: dimensionality of the word vectors.
        window: Word2Vec context window size.
        min_count: tokens seen fewer times than this are left out of the model.
        runs: number of independent trainings to average; must be >= 1.

    Returns:
        A gensim `KeyedVectors` holding one averaged vector per token kept by
        the first run.

    Raises:
        ValueError: if `runs` is smaller than 1.

    NOTE(review): each run learns its own embedding space, so averaging runs
    is only meaningful if those spaces are comparable — confirm this is the
    intended behaviour.
    """
    if runs < 1:
        raise ValueError(f"runs must be >= 1, got {runs}")
    print("\nStart Word2Vec training...")
    tokenized_sentences = list(tokenized_sentences)
    sum_vec, keys = None, None
    # os.cpu_count() may return None when the count cannot be determined.
    cpu_cores = os.cpu_count() or 1
    for _ in tqdm(range(runs), desc="Word2Vec runs"):
        model = Word2Vec(
            sentences=tokenized_sentences,
            vector_size=embedding_dim,
            window=window,
            min_count=min_count,
            workers=cpu_cores,
            sg=1,
            epochs=5,
            batch_words=10000
        )
        if sum_vec is None:
            keys = list(model.wv.index_to_key)
            sum_vec = model.wv.vectors.astype(np.float32)
        else:
            # Look the rows up by key instead of relying on row position, so
            # the sum stays aligned even if a later run orders its vocabulary
            # differently.
            sum_vec += model.wv[keys]
    avg_vec = sum_vec / runs
    kv = KeyedVectors(vector_size=embedding_dim)
    kv.add_vectors(keys, avg_vec)
    print("Word2Vec training completed.")
    return kv

def create_embedding_matrix(kv, vocab, embedding_dim):
    """Build the initial embedding weights for `vocab` from the vectors in `kv`.

    Args:
        kv: mapping-like object supporting `token in kv` and `kv[token]`.
        vocab: token -> row index dict; indices are used as row numbers of a
            `len(vocab)`-row matrix.
        embedding_dim: number of columns of the matrix.

    Returns:
        A float32 tensor of shape (len(vocab), embedding_dim). Rows of tokens
        that are absent from `kv` stay all-zero.
    """
    print("Creating embedding matrix...")
    weights = np.zeros((len(vocab), embedding_dim))
    covered = [(row, token) for token, row in vocab.items() if token in kv]
    for row, token in covered:
        weights[row] = kv[token]
    print(f"Initialized {len(covered)}/{len(vocab)} tokens.")
    return torch.tensor(weights, dtype=torch.float)

class SentimentDataset(Dataset):
    """Map-style dataset pairing encoded sequences `X` with class indices `Y`."""

    def __init__(self, X, Y):
        # Both containers are stored as given and indexed positionally.
        self.X, self.Y = X, Y

    def __len__(self):
        """Number of examples (the length of `X`)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return `(ids, label)` for example `idx`, both as int64 tensors."""
        ids = torch.tensor(self.X[idx], dtype=torch.long)
        label = torch.tensor(self.Y[idx], dtype=torch.long)
        return ids, label

class CNNBiLSTM_MHA(nn.Module):
    """Embedding -> Conv1d -> BiLSTM -> multi-head self-attention -> masked mean -> linear.

    One Conv1d is created per entry of `kernel_sizes`, but `forward` only
    applies the first of them.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: embedding width (input channels of the convolutions).
        num_filters: output channels of each convolution (LSTM input size).
        kernel_sizes: kernel widths of the convolutions.
        hidden_dim: LSTM hidden size per direction.
        num_heads: attention heads over the 2 * hidden_dim LSTM output.
        output_dim: number of output logits.
        dropout: dropout probability (also used inside the attention layer).
        pad_idx: token id treated as padding.
        pretrained_matrix: optional tensor copied into the embedding weights.
        freeze_embeddings: when a pretrained matrix is given, True disables
            gradient updates of the embedding weights.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, kernel_sizes,
                 hidden_dim, num_heads, output_dim, dropout, pad_idx,
                 pretrained_matrix=None, freeze_embeddings=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        if pretrained_matrix is None:
            print("Training embeddings from scratch.")
        else:
            print("Loading pretrained embeddings...")
            self.embedding.weight.data.copy_(pretrained_matrix)
            self.embedding.weight.requires_grad = not freeze_embeddings

        bidirectional_dim = hidden_dim * 2
        self.convs = nn.ModuleList(
            [nn.Conv1d(embedding_dim, num_filters, width) for width in kernel_sizes]
        )
        self.lstm = nn.LSTM(num_filters, hidden_dim, batch_first=True, bidirectional=True)
        self.attn = nn.MultiheadAttention(
            embed_dim=bidirectional_dim, num_heads=num_heads,
            dropout=dropout, batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(bidirectional_dim, output_dim)
        self.pad_idx = pad_idx

    def forward(self, text):
        """Return logits of shape (batch, output_dim) for a (batch, seq) tensor of token ids."""
        pad_mask = text == self.pad_idx
        embedded = self.dropout(self.embedding(text))
        # Conv1d expects channels before the sequence axis; swap back afterwards.
        conv_in = embedded.permute(0, 2, 1)
        conv_out = torch.relu(self.convs[0](conv_in)).permute(0, 2, 1)
        seq, _ = self.lstm(self.dropout(conv_out))

        # The convolution can change the sequence length, so bring the padding
        # mask to the same length: extend with "padded" entries or cut the tail.
        gap = seq.size(1) - pad_mask.size(1)
        if gap > 0:
            filler = pad_mask.new_ones(pad_mask.size(0), gap)
            pad_mask = torch.cat([pad_mask, filler], dim=1)
        elif gap < 0:
            pad_mask = pad_mask[:, :seq.size(1)]

        attended, _ = self.attn(seq, seq, seq, key_padding_mask=pad_mask)
        attended = attended.masked_fill(pad_mask.unsqueeze(-1), 0.0)
        # Mean over non-padded positions; clamp avoids dividing by zero when
        # every position of an example is padding.
        valid_steps = (~pad_mask).sum(1).clamp(min=1).unsqueeze(1)
        pooled = attended.sum(1) / valid_steps
        return self.fc(self.dropout(pooled))

def train_model(model, loader, optimizer, criterion):
    """Train `model` for one pass over `loader`.

    Args:
        model: network called as `model(batch_inputs)` to produce logits.
        loader: iterable of `(inputs, targets)` batches.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as `criterion(logits, targets)`.

    Returns:
        `(avg_loss, acc, f1)`: the mean of the per-batch losses, the accuracy
        and the macro-averaged F1 over all predictions made during the pass.
    """
    model.train()
    running_loss = 0
    pred_chunks, target_chunks = [], []
    for inputs, targets in tqdm(loader, desc="Training batches"):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        scores = model(inputs)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        pred_chunks.append(scores.argmax(dim=1).detach().cpu().numpy())
        target_chunks.append(targets.cpu().numpy())

    mean_loss = running_loss / len(loader)
    predictions = np.concatenate(pred_chunks)
    references = np.concatenate(target_chunks)
    return (
        mean_loss,
        accuracy_score(references, predictions),
        f1_score(references, predictions, average='macro', zero_division=0),
    )

def evaluate_model(model, loader, criterion, idx_to_label):
    """Evaluate `model` on `loader` and print a per-class report.

    Args:
        model: network called as `model(batch_inputs)` to produce logits.
        loader: iterable of `(inputs, targets)` batches.
        criterion: loss called as `criterion(logits, targets)`.
        idx_to_label: dict mapping class index 0..n-1 to its display name.

    Returns:
        `(avg_loss, acc, f1)`: the mean of the per-batch losses, the accuracy
        and the macro-averaged F1 over the whole loader.
    """
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for Xb, yb in tqdm(loader, desc="Evaluation batches"):
            Xb, yb = Xb.to(device), yb.to(device)
            logits = model(Xb)
            total_loss += criterion(logits, yb).item()
            all_preds.append(logits.argmax(dim=1).cpu().numpy())
            all_labels.append(yb.cpu().numpy())
    avg_loss = total_loss / len(loader)
    y_pred   = np.concatenate(all_preds)
    y_true   = np.concatenate(all_labels)
    acc      = accuracy_score(y_true, y_pred)
    f1       = f1_score(y_true, y_pred, average='macro', zero_division=0)

    # Pass the full list of class ids explicitly: without `labels`, the report
    # infers the classes from the data and raises when a class is missing
    # from this split, because the number of target_names no longer matches.
    class_ids = list(range(len(idx_to_label)))
    print("\n--- Classification Report ---")
    print(classification_report(
        y_true, y_pred,
        labels=class_ids,
        target_names=[idx_to_label[i] for i in class_ids],
        zero_division=0, digits=4
    ))
    return avg_loss, acc, f1

def save_data_artifacts(vocab, label_map, idx_to_label, emb_mat, X, Y, kv, artifacts_dir=ARTIFACTS_DIR):
    """Write every preprocessing artifact into `artifacts_dir`.

    All objects except `kv` are serialized with joblib, one file each
    (`X` and `Y` share 'dataset_XY.joblib' as a tuple); `kv` is written with
    its own `save` method. The directory is created when missing.
    """
    os.makedirs(artifacts_dir, exist_ok=True)
    print("Saving data artifacts...")
    joblib_payloads = (
        ('vocab.joblib', vocab),
        ('label_map.joblib', label_map),
        ('idx_to_label.joblib', idx_to_label),
        ('embedding_matrix.joblib', emb_mat),
        ('dataset_XY.joblib', (X, Y)),
    )
    for filename, payload in joblib_payloads:
        joblib.dump(payload, os.path.join(artifacts_dir, filename))
    kv.save(os.path.join(artifacts_dir, 'word2vec_avg.kv'))
    print(f"✅ Data artifacts saved to {artifacts_dir}/")

def load_data_artifacts(artifacts_dir=ARTIFACTS_DIR):
    """Read back the artifacts written by `save_data_artifacts`.

    Returns:
        `(X, Y, vocab, label_map, idx_to_label, emb_mat, kv)`.

    NOTE: joblib.load unpickles its input, which can execute arbitrary code;
    only point this at artifact directories you created yourself.
    """
    print("Loading data artifacts...")

    def _read(filename):
        # Deserialize one joblib file from the artifacts directory.
        return joblib.load(os.path.join(artifacts_dir, filename))

    vocab = _read('vocab.joblib')
    label_map = _read('label_map.joblib')
    idx_to_label = _read('idx_to_label.joblib')
    emb_mat = _read('embedding_matrix.joblib')
    X, Y = _read('dataset_XY.joblib')
    # The word vectors are memory-mapped read-only instead of copied into RAM.
    kv = KeyedVectors.load(os.path.join(artifacts_dir, 'word2vec_avg.kv'), mmap='r')
    print(f"✅ Data artifacts loaded from {artifacts_dir}/")
    return X, Y, vocab, label_map, idx_to_label, emb_mat, kv

# ---- Build the dataset artifacts, or reuse the cached ones ----
data_file = os.path.join(ARTIFACTS_DIR, 'dataset_XY.joblib')
if os.path.exists(data_file):
    X, Y, vocab, label_map, idx_to_label, emb_mat, kv = load_data_artifacts()
else:
    X, Y, toks, vocab, label_map, idx_to_label = preprocess_data(
        "/kaggle/input/phishing-email-vietnamese-v2-excel/phishing_email_with_translation_clean_small.xlsx", MAX_LEN
    )
    # NOTE(review): runs=2 is hard-coded although W2V_RUNS is defined above.
    kv      = train_word2vec_avg(toks, EMBEDDING_DIM, W2V_WINDOW, W2V_MIN_COUNT, runs=2)
    emb_mat = create_embedding_matrix(kv, vocab, EMBEDDING_DIM)
    save_data_artifacts(vocab, label_map, idx_to_label, emb_mat, X, Y, kv)

# ---- Train / held-out split ----
# NOTE(review): the held-out split drives early stopping AND the final report,
# so the final numbers are optimistic; a separate test split would fix that.
print("Preparing data loaders...")
dataset   = SentimentDataset(X, Y)
test_size = int(len(dataset) * TEST_SPLIT_RATIO)
train_size= len(dataset) - test_size
train_ds, test_ds = (
    random_split(dataset, [train_size, test_size]) if test_size>0 else (dataset, None)
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE) if test_ds else None

# ---- Model, optimizer, loss ----
# The constructor call previously had the final-evaluation code pasted into
# the middle of its argument list, which made the whole cell a SyntaxError;
# that code now lives after the training loop, where its inputs exist.
print(f"Embedding matrix shape: {tuple(emb_mat.shape)}")
model = CNNBiLSTM_MHA(
    vocab_size=len(vocab), embedding_dim=EMBEDDING_DIM,
    num_filters=NUM_FILTERS, kernel_sizes=KERNEL_SIZES,
    hidden_dim=HIDDEN_DIM, num_heads=NUM_HEADS,
    output_dim=len(label_map), dropout=DROPOUT,
    pad_idx=vocab[PAD_TOKEN], pretrained_matrix=emb_mat,
    freeze_embeddings=FREEZE_EMBEDDINGS
).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

# ---- Training loop with early stopping on validation macro-F1 ----
best_model_path = os.path.join(ARTIFACTS_DIR, BEST_MODEL_FILENAME)
checkpoint_saved = False   # True once this run has written a checkpoint
best_f1 = 0.0
epochs_no_improve = 0
print("Starting training...")
for epoch in tqdm(range(NUM_EPOCHS), desc="Epochs"):
    tr_loss, tr_acc, tr_f1 = train_model(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} — loss: {tr_loss:.4f} — acc: {tr_acc:.4f} — f1: {tr_f1:.4f}")
    if test_loader:
        val_loss, val_acc, val_f1 = evaluate_model(model, test_loader, criterion, idx_to_label)
        print(f"Validation — loss: {val_loss:.4f} — acc: {val_acc:.4f} — f1: {val_f1:.4f}\n")
        if val_f1 > best_f1:
            best_f1 = val_f1
            epochs_no_improve = 0
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
            print(f"✅ Model state saved to {BEST_MODEL_FILENAME}")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"Early stopping at epoch {epoch+1}.")
                break

# Report and predict with the best weights of this run rather than with
# whatever the last (possibly worse) epoch left in memory.
if checkpoint_saved:
    model.load_state_dict(torch.load(best_model_path, map_location=device))

# ---- Final classification report and confusion matrix ----
if test_loader:
    all_preds = []
    all_labels = []
    model.eval()
    with torch.no_grad():
        for texts, labels in test_loader:
            logits = model(texts.to(device))
            preds = logits.argmax(dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # label_map was built by enumerating the sorted label strings, so its key
    # order matches the class indices 0..n-1.
    class_names = list(label_map.keys())
    print("\n--- Final Evaluation on Test Set ---")
    print(classification_report(
        all_labels, all_preds,
        labels=list(range(len(class_names))),
        target_names=class_names, digits=4
    ))
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=list(range(len(class_names))))
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                 xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# ---- Single-sample prediction ----
sample = "Congratulations! You've won a free ticket. Click here..."
# Tokenize exactly as the training data was tokenized (word_tokenize, original
# casing); lower-casing and whitespace splitting produced tokens such as
# "congratulations!" that can never be in the vocabulary.
toks   = word_tokenize(sample)
nums   = [vocab.get(t, vocab[UNK_TOKEN]) for t in toks]
nums   = nums[:MAX_LEN] + [vocab[PAD_TOKEN]]*(MAX_LEN-len(nums))
model.eval()
with torch.no_grad():
    logits = model(torch.tensor([nums], dtype=torch.long).to(device))
    pred   = logits.argmax(dim=1).item()
print(f"Sample prediction: {idx_to_label[pred]}")

In [None]:
import os
import gc
import json
import random

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from torch.optim import AdamW

# ==================== Configuration ====================
# Hyper-parameters and paths for the transformer fine-tuning cell below.
MODEL_NAME        = 'vinai/phobert-base-v2'   # Hugging Face checkpoint loaded for both tokenizer and encoder
MAX_LEN           = 256    # tokenizer max_length: every example is padded/truncated to this many tokens
BATCH_SIZE        = 16     # batch size for both the train and the test DataLoader
NUM_EPOCHS        = 5      # upper bound on training epochs (early stopping may end sooner)
LR                = 2e-5   # AdamW learning rate
WARMUP_STEPS      = 0      # warm-up steps of the linear LR schedule
WEIGHT_DECAY      = 0.01   # AdamW weight decay
TEST_SPLIT_RATIO  = 0.2    # fraction of the dataset held out by random_split
PATIENCE          = 3      # epochs without a macro-F1 improvement before training stops
FREEZE_BERT       = False  # True -> encoder parameters get requires_grad=False
DEVICE            = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the directory is named "albert" although MODEL_NAME is PhoBERT, and the
# zip/cleanup cells at the end of the notebook operate on '/kaggle/working/artifacts' instead.
ARTIFACTS_DIR     = 'artifacts_albert'
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Load the tokenizer that matches MODEL_NAME (the model itself is built further down).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class EmailDataset(Dataset):
    """Map-style dataset that tokenizes one email per lookup.

    Args:
        texts: indexable collection of email bodies (each is passed through ``str``).
        labels: indexable collection of integer class ids, parallel to ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, **kwargs)`` that returns a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened ``input_ids`` / ``attention_mask`` tensors plus a long ``labels`` tensor."""
        text, label = str(self.texts[idx]), self.labels[idx]
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(text, **tokenizer_options)
        # The tokenizer output carries a leading batch axis; collapse it to 1-D.
        return {
            'input_ids': encoded['input_ids'].reshape(-1),
            'attention_mask': encoded['attention_mask'].reshape(-1),
            'labels': torch.tensor(label, dtype=torch.long),
        }

class BertClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a single linear classification layer.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: number of output classes (width of the logits).
        freeze_bert: when True, every encoder parameter gets ``requires_grad = False``
            so only the linear head is trained.
    """

    def __init__(self, model_name, num_labels, freeze_bert=False):
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        if freeze_bert:
            for weight in encoder.parameters():
                weight.requires_grad = False
        self.bert = encoder
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of token ids."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The sentence vector is the encoder's `pooler_output` field
        # (not the raw first-token hidden state).
        sentence_vec = encoded.pooler_output
        return self.classifier(self.dropout(sentence_vec))

# Data loading & preprocessing

def load_data(filepath):
    """Read the Excel sheet and return ``(texts, labels, label_map)``.

    Rows with a missing ``translation`` or ``label`` cell are dropped. Label
    values are converted to stripped strings, and each distinct string gets an
    integer id assigned in sorted order.

    Returns:
        texts: list of ``translation`` cell values.
        labels: list of integer class ids, parallel to ``texts``.
        label_map: dict mapping each label string to its integer id.
    """
    frame = pd.read_excel(filepath)
    frame = frame.dropna(subset=['translation', 'label']).reset_index(drop=True)

    label_names = frame['label'].astype(str).str.strip().tolist()
    classes = sorted(set(label_names))
    label_map = dict(zip(classes, range(len(classes))))

    texts = frame['translation'].tolist()
    labels = [label_map[name] for name in label_names]
    return texts, labels, label_map

# Prepare datasets and loaders
texts, labels, label_map = load_data('/kaggle/input/phishing-email-vietnamese-v2-excel/phishing_email_with_translation_clean_small.xlsx')
dataset = EmailDataset(texts, labels, tokenizer, MAX_LEN)
test_size = int(len(dataset) * TEST_SPLIT_RATIO)
train_size= len(dataset) - test_size
# Seed the split so that the held-out set (and therefore every reported
# metric) is identical across kernel restarts and re-runs of this cell.
SPLIT_SEED = 42
train_ds, test_ds = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE)

# Initialize model, optimizer, scheduler, loss
model = BertClassifier(MODEL_NAME, num_labels=len(label_map), freeze_bert=FREEZE_BERT).to(DEVICE)
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss()

# Training & evaluation loops

# Fine-tuning loop. `best_f1` is the highest macro-F1 measured on `test_loader`
# so far; `epochs_no_improve` counts consecutive epochs that failed to beat it.
# NOTE(review): the same held-out split is used for checkpoint selection,
# early stopping and the final report printed after the loop.
best_f1 = 0
epochs_no_improve = 0
for epoch in range(NUM_EPOCHS):
    # --- Training pass: one optimizer step and one scheduler step per batch ---
    model.train()
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        # NOTE(review): this rebinds the module-level name `labels` (the list
        # returned by load_data) to a batch tensor.
        labels = batch['labels'].to(DEVICE)
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()
        # Training metrics are accumulated from the predictions made during the
        # forward pass of each step (model in train mode).
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())
    avg_train_loss = train_loss / len(train_loader)
    train_acc = accuracy_score(all_labels, all_preds)
    train_f1  = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    print(f"Epoch {epoch+1}: train_loss={avg_train_loss:.4f}, acc={train_acc:.4f}, f1={train_f1:.4f}")

    # --- Evaluation pass on the held-out loader (no gradients) ---
    model.eval()
    eval_loss = 0
    # The accumulators are reset here, so once the loop ends `all_preds` and
    # `all_labels` hold the held-out predictions of the last epoch that ran.
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            eval_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    avg_eval_loss = eval_loss / len(test_loader)
    eval_acc = accuracy_score(all_labels, all_preds)
    eval_f1  = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    print(f"Validation: loss={avg_eval_loss:.4f}, acc={eval_acc:.4f}, f1={eval_f1:.4f}")

    # --- Checkpoint on strict macro-F1 improvement, otherwise count towards early stopping ---
    if eval_f1 > best_f1:
        best_f1 = eval_f1
        epochs_no_improve = 0
        torch.save(model.state_dict(), os.path.join(ARTIFACTS_DIR, 'best_model.pt'))
        print("Saved best model.")
    else:
        epochs_no_improve += 1
        # Stop after PATIENCE consecutive epochs without a new best macro-F1.
        if epochs_no_improve >= PATIENCE:
            print("Early stopping...")
            break

# Final classification report and confusion matrix
#
# The training loop leaves `model` with the weights of the LAST epoch, which is
# not necessarily the epoch that produced the saved checkpoint. Restore the best
# checkpoint (when this run saved one) and re-run inference on the held-out
# loader so the report below describes the model that was actually kept.
best_model_path = os.path.join(ARTIFACTS_DIR, 'best_model.pt')
if best_f1 > 0 and os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
model.eval()

all_preds, all_labels = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final evaluation"):
        batch_logits = model(batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE))
        all_preds.extend(torch.argmax(batch_logits, dim=1).cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# label_map was built with enumerate(), so keys and values share one order.
# Passing the ids explicitly keeps the report rows and matrix axes aligned with
# the names even if a class happens to be absent from the held-out split.
class_names = list(label_map.keys())
class_ids = list(label_map.values())

print("\n--- Final Evaluation on Test Set ---")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names, digits=4, zero_division=0))
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
import os
import gc
import json
import random

import modin.pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from torch.optim import AdamW

# Hyper-parameters and paths for the XLM-RoBERTa + CNN + LSTM cell below.
MODEL_NAME = 'FacebookAI/xlm-roberta-base'  # Hugging Face checkpoint for tokenizer and encoder
MAX_LEN = 256  # tokenizer max_length: every example is padded/truncated to this many tokens
BATCH_SIZE = 32  # batch size for both the train and the test DataLoader
NUM_EPOCHS = 2  # upper bound on training epochs
LR = 2e-5  # AdamW learning rate
WARMUP_STEPS = 0  # warm-up steps of the linear LR schedule
WEIGHT_DECAY = 0.01  # AdamW weight decay
TEST_SPLIT_RATIO = 0.2  # fraction of the dataset held out by random_split
# NOTE(review): PATIENCE is larger than NUM_EPOCHS, so the early-stopping
# branch of the training loop cannot trigger with these settings.
PATIENCE = 3
FREEZE_BERT = False  # True -> encoder parameters get requires_grad=False
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ARTIFACTS_DIR = 'artifacts_xlm_roberta_bilingual'  # best_model.pt is written here
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Tokenizer matching MODEL_NAME; shared by every EmailDataset item lookup.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class EmailDataset(Dataset):
    """Dataset that tokenizes a single email lazily, at lookup time.

    Args:
        texts: indexable collection of email bodies (each is passed through ``str``).
        labels: indexable collection of integer class ids, parallel to ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, **kwargs)`` that returns a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def _encode(self, text):
        """Tokenize ``text`` with fixed-length padding/truncation, returning PyTorch tensors."""
        return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return 1-D ``input_ids`` / ``attention_mask`` tensors and a long ``labels`` tensor."""
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoded = self._encode(text)
        # Drop the batch axis the tokenizer adds so DataLoader can stack items.
        sample = {key: torch.flatten(encoded[key]) for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

import torch.nn.functional as F

class BertCNNLSTMClassifier(nn.Module):
    """Pretrained encoder -> 1-D convolution -> (Bi)LSTM -> linear classifier.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: number of output classes (width of the logits).
        freeze_bert: when True, every encoder parameter gets ``requires_grad = False``.
        cnn_filters: number of output channels of the convolution.
        cnn_kernel_size: convolution kernel width (padding is ``kernel // 2``).
        lstm_hidden_dim: hidden size of each LSTM direction.
        lstm_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied before the classifier.
    """

    def __init__(
        self,
        model_name,
        num_labels,
        freeze_bert=False,
        cnn_filters=128,
        cnn_kernel_size=3,
        lstm_hidden_dim=128,
        lstm_layers=1,
        bidirectional=True,
        dropout=0.3
    ):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        hidden_size = self.bert.config.hidden_size

        self.conv = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=cnn_filters,
            kernel_size=cnn_kernel_size,
            padding=cnn_kernel_size // 2
        )

        self.lstm = nn.LSTM(
            input_size=cnn_filters,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=bidirectional
        )

        lstm_output_dim = lstm_hidden_dim * (2 if bidirectional else 1)

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(lstm_output_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, num_labels)``.

        Padding positions are excluded from both the convolution input and the
        LSTM recurrence, so the result for an example does not depend on how
        much padding was appended to it.

        NOTE: the sequence lengths are derived from ``attention_mask.sum(1)``,
        which assumes right-padded inputs (real tokens first, padding last).
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        seq_out = outputs.last_hidden_state

        # Zero the encoder outputs at padding positions so the convolution
        # window at the last real token sees zeros, exactly like the
        # convolution's own zero padding at the sequence edge.
        pad_mask = attention_mask.unsqueeze(-1).to(seq_out.dtype)
        seq_out = seq_out * pad_mask

        # Conv1d expects (batch, channels, seq); go there and back.
        x = seq_out.permute(0, 2, 1)
        x = F.relu(self.conv(x))
        x = x.permute(0, 2, 1)

        # Pack the batch so each direction of the LSTM starts/stops at the last
        # real token instead of running through the padded tail. Lengths must
        # live on the CPU and be >= 1 for pack_padded_sequence.
        lengths = attention_mask.sum(dim=1).long().clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        if self.lstm.bidirectional:
            # Last layer: index -2 is the forward direction, -1 the backward one.
            h_forward = h_n[-2]
            h_backward = h_n[-1]
            h = torch.cat((h_forward, h_backward), dim=1)
        else:
            h = h_n[-1]

        h = self.dropout(h)
        logits = self.classifier(h)
        return logits

def load_data(filepath):
    """Load the bilingual Excel sheet and return ``(texts, labels, label_map)``.

    Rows lacking a ``text_combined`` or ``label`` value are discarded. Labels
    are stringified and stripped; the distinct strings, in sorted order, are
    numbered from zero.

    Returns:
        texts: list of ``text_combined`` cell values.
        labels: list of integer class ids, parallel to ``texts``.
        label_map: dict mapping each label string to its integer id.
    """
    required_columns = ['text_combined', 'label']
    table = pd.read_excel(filepath)
    table = table.dropna(subset=required_columns).reset_index(drop=True)

    texts = table['text_combined'].tolist()
    label_strings = table['label'].astype(str).str.strip().tolist()

    label_map = {}
    for class_id, class_name in enumerate(sorted(set(label_strings))):
        label_map[class_name] = class_id

    labels = list(map(label_map.__getitem__, label_strings))
    return texts, labels, label_map

# Build the dataset and a reproducible train / held-out split.
texts, labels, label_map = load_data('/kaggle/input/phishing-email-multilingual-v3/phishing_email_bilingual_with_label.xlsx')
dataset = EmailDataset(texts, labels, tokenizer, MAX_LEN)
test_size = int(len(dataset) * TEST_SPLIT_RATIO)
train_size = len(dataset) - test_size
# Seed the split so that the held-out set (and therefore every reported
# metric) is identical across kernel restarts and re-runs of this cell.
SPLIT_SEED = 42
train_ds, test_ds = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

# Model, optimizer, LR schedule and loss.
model = BertCNNLSTMClassifier(
    model_name=MODEL_NAME,
    num_labels=len(label_map),
    freeze_bert=FREEZE_BERT,
    cnn_filters=128,
    cnn_kernel_size=3,
    lstm_hidden_dim=128,
    lstm_layers=1,
    bidirectional=True,
    dropout=0.3
).to(DEVICE)
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss()

# Fine-tuning loop. `best_f1` is the highest macro-F1 measured on `test_loader`
# so far; `epochs_no_improve` counts consecutive epochs that failed to beat it.
# NOTE(review): the same held-out split drives checkpoint selection and the
# evaluation that follows the loop.
best_f1 = 0
epochs_no_improve = 0
for epoch in range(NUM_EPOCHS):
    # --- Training pass: one optimizer step and one scheduler step per batch ---
    model.train()
    train_loss = 0
    all_preds, all_labels = [], []
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        # NOTE(review): this rebinds the module-level name `labels` (the list
        # returned by load_data) to a batch tensor.
        labels = batch['labels'].to(DEVICE)
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()
        # Training metrics come from the predictions made during each step's
        # forward pass (model in train mode).
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())
    avg_train_loss = train_loss / len(train_loader)
    train_acc = accuracy_score(all_labels, all_preds)
    train_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    print(f"Epoch {epoch+1}: train_loss={avg_train_loss:.4f}, acc={train_acc:.4f}, f1={train_f1:.4f}")

    # --- Evaluation pass on the held-out loader (no gradients) ---
    model.eval()
    eval_loss = 0
    # The accumulators are reset here, so once the loop ends `all_preds` and
    # `all_labels` hold the held-out predictions of the last epoch that ran.
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            eval_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    avg_eval_loss = eval_loss / len(test_loader)
    eval_acc = accuracy_score(all_labels, all_preds)
    eval_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    print(f"Validation: loss={avg_eval_loss:.4f}, acc={eval_acc:.4f}, f1={eval_f1:.4f}")

    # --- Checkpoint on strict macro-F1 improvement, otherwise count towards early stopping ---
    if eval_f1 > best_f1:
        best_f1 = eval_f1
        epochs_no_improve = 0
        torch.save(model.state_dict(), os.path.join(ARTIFACTS_DIR, 'best_model.pt'))
        print("Saved best model.")
    else:
        epochs_no_improve += 1
        # NOTE(review): with NUM_EPOCHS = 2 and PATIENCE = 3 (set in this cell's
        # configuration) this counter can never reach PATIENCE.
        if epochs_no_improve >= PATIENCE:
            print("Early stopping...")
            break

# Rebuild the architecture and restore the best checkpoint written by the
# training loop. The constructor defaults for the CNN/LSTM hyper-parameters
# match the values used when the trained model was created.
loaded_model = BertCNNLSTMClassifier(
    model_name=MODEL_NAME,
    num_labels=len(label_map),
    freeze_bert=FREEZE_BERT
).to(DEVICE)
# map_location lets a checkpoint saved on GPU be restored on a CPU-only kernel.
loaded_model.load_state_dict(
    torch.load(os.path.join(ARTIFACTS_DIR, 'best_model.pt'), map_location=DEVICE)
)
loaded_model.eval()

print("\n--- Evaluation of Loaded Best Model on Test Set ---")
all_preds_loaded, all_labels_loaded = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating Loaded Model"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels_batch = batch['labels'].to(DEVICE)
        logits = loaded_model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds_loaded.extend(preds)
        all_labels_loaded.extend(labels_batch.cpu().numpy())

# label_map was built with enumerate(), so keys and values share one order.
# Passing the ids explicitly keeps the report rows and matrix axes aligned with
# the names even if a class happens to be absent from the held-out split.
class_names = list(label_map.keys())
class_ids = list(label_map.values())

# Report the predictions of the restored checkpoint collected above, not the
# `all_preds` / `all_labels` left over from the last training epoch.
print("\n--- Final Evaluation on Test Set ---")
print(classification_report(all_labels_loaded, all_preds_loaded, labels=class_ids, target_names=class_names, digits=4, zero_division=0))
cm = confusion_matrix(all_labels_loaded, all_preds_loaded, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

2025-05-07 10:40:25,942	INFO worker.py:1852 -- Started a local Ray instance.
Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.
2025-05-07 10:40:43.637337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746614443.660843     868 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746614443.667842     868 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Training Epoch 1:   0%|          | 0/3488 [00:00<?, ?it/s]

In [None]:
import os
import zipfile

# Pack every file under `folder_path` into a deflate-compressed zip that is
# written to the current working directory.
folder_path = '/kaggle/working/artifacts'
zip_name = 'email_cnn_bilstm_attention.zip'

with zipfile.ZipFile(zip_name, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            # Store entries relative to the folder, not with the absolute path.
            zipf.write(file_path, arcname=os.path.relpath(file_path, start=folder_path))

In [None]:
import shutil

from IPython.display import FileLink

# Zip the artifacts folder next to the notebook, then end the cell with a
# FileLink so the notebook renders a download link for the archive.
shutil.make_archive(
    base_name='email_cnn_bilstm_attention_1',
    format='zip',
    root_dir='/kaggle/working/artifacts',
)
FileLink(r'email_cnn_bilstm_attention_1.zip')

In [None]:
# (scratch cell: stray keystrokes removed — the previous content was not valid Python and broke "Run All")

In [None]:
!rm -rf /kaggle/working/artifacts 123123123