In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import faiss
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2. ƒê·ªçc b·ªô d·ªØ li·ªáu
DATASET_PATH = "2cls_spam_text_cls.csv"
df = pd.read_csv(DATASET_PATH)
# T√°ch tin nh·∫Øn v√† nh√£n v√†o c√°c list
messages = df["Message"].values.tolist()
labels = df["Category"].values.tolist()

In [3]:
# 3.1. Load m√¥ h√¨nh embedding
MODEL_NAME = "intfloat/multilingual-e5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# H√†m ƒë·ªÉ tr√≠ch xu·∫•t embedding t·ª´ output c·ªßa model
def average_pool(last_hidden_states, attention_mask):
    # Fix: Use ~attention_mask.bool() to mask padding tokens, not attention_mask.bool()
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

In [4]:
# 3.2. T·∫°o sentence embeddings
def get_embeddings(texts, model, tokenizer, device, batch_size=32):
    """T·∫°o embeddings cho m·ªôt danh s√°ch c√°c vƒÉn b·∫£n"""
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[i:i+batch_size]
        batch_texts_with_prefix = [f"passage: {text}" for text in batch_texts]

        batch_dict = tokenizer(batch_texts_with_prefix, max_length=512, padding=True, truncation=True, return_tensors="pt")

        batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

        with torch.no_grad():
            outputs = model(**batch_dict)
            batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
            embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)

# Chu·∫©n b·ªã nh√£n
le = LabelEncoder()
y = le.fit_transform(labels)
# T·∫°o embeddings cho t·∫•t c·∫£ tin nh·∫Øn
X_embeddings = get_embeddings(messages, model, tokenizer, device)
# T·∫°o metadata cho m·ªói t√†i li·ªáu
metadata = [{"index": i, "message": message, "label": label, "label_encoded": y[i]}
             for i, (message, label) in enumerate(zip(messages, labels))]

  return forward_call(*args, **kwargs)
Generating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 175/175 [04:31<00:00,  1.55s/it]


In [5]:
# 3.3. T·∫°o FAISS index v√† chia d·ªØ li·ªáu
TEST_SIZE = 0.1
SEED = 42
train_indices, test_indices = train_test_split(range(len(messages)), test_size=TEST_SIZE, stratify=y, random_state=SEED)

# T√°ch embeddings v√† metadata theo ch·ªâ s·ªë ƒë√£ chia
X_train_emb = X_embeddings[train_indices]
X_test_emb = X_embeddings[test_indices]

train_metadata = [metadata[i] for i in train_indices]
test_metadata = [metadata[i] for i in test_indices]

# T·∫°o FAISS index
embedding_dim = X_train_emb.shape[1]

index = faiss.IndexFlatIP(embedding_dim)
index.add(X_train_emb.astype("float32"))

In [6]:
# 4. Tri·ªÉn khai ph√¢n lo·∫°i v·ªõi embedding similarity

def evaluate_knn_accuracy(test_embeddings,test_metadata,index, train_metadata,k_values=[1,3,5]):
    """Evaluate k-NN accuracy for different k values"""
    results={}
    all_errors={}

    for k in k_values:
        correct=0
        total=len(test_embeddings)
        errors=[]

        for i in tqdm(range(total),desc=f"Evaluatingk={k}"):
            query_embedding=test_embeddings[i:i+1].astype("float32")
            true_label=test_metadata[i]["label"]
            true_message=test_metadata[i]["message"]

            #SearchinFAISSindex
            scores,indices=index.search(query_embedding,k)  

            #Getpredictionsfrom top-kneighbors
            predictions=[]
            neighbor_details=[]
            for j in range(k):
                neighbor_idx=indices[0][j]
                neighbor_label= train_metadata[neighbor_idx]["label"]
                neighbor_message = train_metadata[neighbor_idx]["message"]
                neighbor_score=float(scores[0][j])  
                predictions.append(neighbor_label)
                neighbor_details.append({
                    "label":neighbor_label,
                    "message":neighbor_message,
                    "score":neighbor_score
                })
            
            # Majority vote
            unique_labels, counts = np.unique(predictions, return_counts=True)
            predicted_label = unique_labels[np.argmax(counts)]
            if predicted_label == true_label:
                correct += 1
            else:
                # Collect error information
                error_info = {
                    "index": i,
                    "original_index": test_metadata[i]["index"],
                    "message": true_message,
                    "true_label": true_label,
                    "predicted_label": predicted_label,
                    "neighbors": neighbor_details,
                }
                errors.append(error_info)

        accuracy = correct / total
        error_count = total- correct
        
        results[k] = accuracy
        all_errors[k] = errors

        print(f"Accuracy with k={k}: {accuracy:.4f}")
        print(f"Number of errors with k={k}: {error_count}/{total} ({(error_count/total
        )*100:.2f}%)")

    return results, all_errors

In [7]:
# 5. ƒê√°nh gi√° accuracy tr√™n test set

print("Evaluating accuracy on test set...")
accuracy_results, error_results = evaluate_knn_accuracy(X_test_emb, test_metadata, index, train_metadata, k_values=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Hi·ªÉn th·ªã k·∫øt qu·∫£
print("\n" + "="*50)
print("ACCURACY RESULTS")
print("="*50)
for k, accuracy in accuracy_results.items():
    print(f"Top-{k} accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

print("="*50)

# L∆∞u ph√¢n t√≠ch l·ªói ra file

from datetime import datetime
import json

# Prepare error analysis dictionary
error_analysis = {
    "timestamp": datetime.now().isoformat(),
    "model": MODEL_NAME,
    "test_size": len(X_test_emb),
    "accuracy_results": accuracy_results,
    "errors_by_k": {}
}

# Populate errors by value of k
for k, errors in error_results.items():
    error_analysis["errors_by_k"][f"k_{k}"] = {
        "total_errors": len(errors),
        "errors": errors,
        "error_rate": len(errors) / len(X_test_emb)
    }

# Save error analysis to JSON file
output_file = "error_analysis.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(error_analysis, f, ensure_ascii=False, indent=2)

print(f"\n*** Error analysis saved to: {output_file} ***\n")
print("*** Summary ***")
for k, errors in error_results.items():
    print(f"k={k}: {len(errors)} errors out of {len(X_test_emb)} samples")


Evaluating accuracy on test set...


Evaluatingk=1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1646.29it/s]


Accuracy with k=1: 0.9857
Number of errors with k=1: 8/558 (1.43%)


Evaluatingk=2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 2032.69it/s]


Accuracy with k=2: 0.9875
Number of errors with k=2: 7/558 (1.25%)


Evaluatingk=3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1984.14it/s]


Accuracy with k=3: 0.9928
Number of errors with k=3: 4/558 (0.72%)


Evaluatingk=4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1903.02it/s]


Accuracy with k=4: 0.9892
Number of errors with k=4: 6/558 (1.08%)


Evaluatingk=5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1942.30it/s]


Accuracy with k=5: 0.9910
Number of errors with k=5: 5/558 (0.90%)


Evaluatingk=6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1967.08it/s]


Accuracy with k=6: 0.9892
Number of errors with k=6: 6/558 (1.08%)


Evaluatingk=7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1839.37it/s]


Accuracy with k=7: 0.9892
Number of errors with k=7: 6/558 (1.08%)


Evaluatingk=8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1959.78it/s]


Accuracy with k=8: 0.9875
Number of errors with k=8: 7/558 (1.25%)


Evaluatingk=9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1974.76it/s]


Accuracy with k=9: 0.9875
Number of errors with k=9: 7/558 (1.25%)


Evaluatingk=10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 558/558 [00:00<00:00, 1935.61it/s]

Accuracy with k=10: 0.9892
Number of errors with k=10: 6/558 (1.08%)

ACCURACY RESULTS
Top-1 accuracy: 0.9857 (98.57%)
Top-2 accuracy: 0.9875 (98.75%)
Top-3 accuracy: 0.9928 (99.28%)
Top-4 accuracy: 0.9892 (98.92%)
Top-5 accuracy: 0.9910 (99.10%)
Top-6 accuracy: 0.9892 (98.92%)
Top-7 accuracy: 0.9892 (98.92%)
Top-8 accuracy: 0.9875 (98.75%)
Top-9 accuracy: 0.9875 (98.75%)
Top-10 accuracy: 0.9892 (98.92%)

*** Error analysis saved to: error_analysis.json ***

*** Summary ***
k=1: 8 errors out of 558 samples
k=2: 7 errors out of 558 samples
k=3: 4 errors out of 558 samples
k=4: 6 errors out of 558 samples
k=5: 5 errors out of 558 samples
k=6: 6 errors out of 558 samples
k=7: 6 errors out of 558 samples
k=8: 7 errors out of 558 samples
k=9: 7 errors out of 558 samples
k=10: 6 errors out of 558 samples





In [8]:
# Save model components for reuse
import pickle
import os

# Create models directory if it doesn't exist
models_dir = "models"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"‚úì Created directory: {models_dir}")

print("Saving model components...")

# Save the FAISS index
faiss_index_path = os.path.join(models_dir, "faiss_index.bin")
faiss.write_index(index, faiss_index_path)
print(f"‚úì Saved {faiss_index_path}")

# Save train metadata
train_metadata_path = os.path.join(models_dir, "train_metadata.pkl")
with open(train_metadata_path, 'wb') as f:
    pickle.dump(train_metadata, f)
print(f"‚úì Saved {train_metadata_path}")

# Save label encoder with DL-specific name to avoid conflicts
dl_label_encoder_path = os.path.join(models_dir, "dl_label_encoder.pkl")
with open(dl_label_encoder_path, 'wb') as f:
    pickle.dump(le, f)
print(f"‚úì Saved {dl_label_encoder_path}")

# Save model configuration info
model_config = {
    "model_name": MODEL_NAME,
    "embedding_dim": embedding_dim,
    "test_size": TEST_SIZE,
    "seed": SEED,
    "accuracy_results": accuracy_results
}

dl_model_config_path = os.path.join(models_dir, "dl_model_config.pkl")
with open(dl_model_config_path, 'wb') as f:
    pickle.dump(model_config, f)
print(f"‚úì Saved {dl_model_config_path}")

print(f"\nüéØ Summary of saved files in '{models_dir}' folder:")
print("="*60)
print("DL Model Components:")
print(f"- {models_dir}/faiss_index.bin (FAISS similarity search index)")
print(f"- {models_dir}/train_metadata.pkl (Training data metadata)")
print(f"- {models_dir}/dl_label_encoder.pkl (Label encoder for DL approach)")
print(f"- {models_dir}/dl_model_config.pkl (DL model configuration and performance)")

print(f"\nNote: The embedding model will be loaded from HuggingFace using model name: {MODEL_NAME}")
print(f"üèÜ Best k value: k={max(accuracy_results, key=accuracy_results.get)} with accuracy: {max(accuracy_results.values()):.4f}")

Saving model components...
‚úì Saved models\faiss_index.bin
‚úì Saved models\train_metadata.pkl
‚úì Saved models\dl_label_encoder.pkl
‚úì Saved models\dl_model_config.pkl

üéØ Summary of saved files in 'models' folder:
DL Model Components:
- models/faiss_index.bin (FAISS similarity search index)
- models/train_metadata.pkl (Training data metadata)
- models/dl_label_encoder.pkl (Label encoder for DL approach)
- models/dl_model_config.pkl (DL model configuration and performance)

Note: The embedding model will be loaded from HuggingFace using model name: intfloat/multilingual-e5-base
üèÜ Best k value: k=3 with accuracy: 0.9928


In [9]:
### Can run from here if you have the saved files

# Load saved model components (for inference without retraining)
# This cell can be used by others who receive your saved files

import pickle
import torch
import faiss
import os
from transformers import AutoTokenizer, AutoModel

def load_model_components():
    """Load all saved model components for inference"""
    print("Loading saved model components from models folder...")
    
    models_dir = "models"
    
    # Load configuration
    dl_model_config_path = os.path.join(models_dir, 'dl_model_config.pkl')
    with open(dl_model_config_path, 'rb') as f:
        config = pickle.load(f)
    
    # Load the embedding model and tokenizer from HuggingFace
    MODEL_NAME = config["model_name"]
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    
    # Set device and prepare model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    
    # Load FAISS index
    faiss_index_path = os.path.join(models_dir, "faiss_index.bin")
    index = faiss.read_index(faiss_index_path)
    
    # Load metadata
    train_metadata_path = os.path.join(models_dir, 'train_metadata.pkl')
    with open(train_metadata_path, 'rb') as f:
        train_metadata = pickle.load(f)
    
    # Load label encoder (DL-specific filename)
    dl_label_encoder_path = os.path.join(models_dir, 'dl_label_encoder.pkl')
    with open(dl_label_encoder_path, 'rb') as f:
        le = pickle.load(f)
    
    print(f"Loaded model: {MODEL_NAME}")
    print(f"Training samples: {len(train_metadata)}")
    print(f"Device: {device}")
    if 'accuracy_results' in config:
        best_k = max(config['accuracy_results'], key=config['accuracy_results'].get)
        best_accuracy = config['accuracy_results'][best_k]
        print(f"Best performance: k={best_k} with accuracy={best_accuracy:.4f}")
    print("Ready for inference!")
    
    return model, tokenizer, device, index, train_metadata, le

# Uncomment the line below to load components (useful when starting fresh)
model, tokenizer, device, index, train_metadata, le = load_model_components()

Loading saved model components from models folder...
Loaded model: intfloat/multilingual-e5-base
Training samples: 5014
Device: cpu
Best performance: k=3 with accuracy=0.9928
Ready for inference!


In [10]:
def classify_with_knn(query_text, model, tokenizer, device, index, train_metadata, k=1):
    """Classify text using k-nearest neighbors with embeddings"""
    # Get query embedding
    query_with_prefix = f"query: {query_text}"
    batch_dict = tokenizer([query_with_prefix],
    max_length=512,
    padding=True,
    truncation=True,
    return_tensors="pt")
    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    with torch.no_grad():
        outputs = model(**batch_dict)
        query_embedding = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
        query_embedding = F.normalize(query_embedding, p=2, dim=1)
        query_embedding = query_embedding.cpu().numpy().astype("float32")

    scores, indices = index.search(query_embedding, k)
    
    # Let's manually check if we can compute similarity with first training sample
    if len(train_metadata) > 0:
        # Get the first training embedding from FAISS
        first_train_embedding = index.reconstruct(0).reshape(1, -1)
        manual_similarity = np.dot(query_embedding, first_train_embedding.T)[0, 0]
    
    # Get predictions from top-k neighbors
    predictions = []
    neighbor_info = []

    for i in range(k):
        neighbor_idx=indices[0][i]
        neighbor_score=float(scores[0][i])  # Convert to Python float explicitly
        neighbor_label=train_metadata[neighbor_idx]["label"]
        neighbor_message=train_metadata[neighbor_idx]["message"]

        predictions.append(neighbor_label)
        neighbor_info.append({
           "score":neighbor_score,
           "label":neighbor_label,
           "message":neighbor_message[:100]+"..."if len(neighbor_message)>100 else neighbor_message })

    #Majority vote for final prediction
    unique_labels,counts=np.unique(predictions,return_counts=True)
    final_prediction=unique_labels[np.argmax(counts)]
    score = np.mean([info["score"] for info in neighbor_info])  # Average score of neighbors
    return final_prediction, neighbor_info, score

In [11]:
# 6. Pipeline classification for user input
def spam_classifier_pipeline(user_input, k=3):
    """
    Complete pipeline for spam classification.

    Args:
        user_input (str): Text to classify
        k (int): Number of nearest neighbors to consider

    Returns:
        dict: Classification results with details
    """
    print()
    print(f'*** Classifying: "{user_input}"')
    print(f"*** Using top-{k} nearest neighbors")
    print()

    # Get prediction and neighbors
    prediction, neighbors, score = classify_with_knn(
        user_input, model, tokenizer, device, index, train_metadata, k=k
    )

    # Display results
    print(f"*** Prediction: {prediction.upper()} | Score: {score:.6f}")
    print()
    print("*** Top neighbors:")
    for i, neighbor in enumerate(neighbors, 1):
        # Use more decimal places for score display
        print(f"\n{i}. Label: {neighbor['label']} | Score: {neighbor['score']:.6f}")
        print(f"   Message: {neighbor['message']}")

    # Count label distribution
    labels = [n["label"] for n in neighbors]
    label_counts = {label: labels.count(label) for label in set(labels)}

    return {
        "prediction": prediction,
        "score": score,
        "neighbors": neighbors,
        "label_distribution": label_counts
    }

In [12]:
# 7. Test pipeline with various examples
test_examples = [
    "I am actually thinking a way of doing something useful",
    "FREE!! Click here to win \$1000 NOW! Limited time offer!"
]

# Run the classifier on each test example
for i, example in enumerate(test_examples, 1):
    print(f"\n--- Example {i}: \"{example}\" ---")
    result = spam_classifier_pipeline(example, k=3) # k=3 is the best performing value from previous tests

# Interactive testing ‚Äì user can change text and k value
print("\n--- Interactive Testing ---")
user_text = "Win a free iPhone! Click here now!"
k_value = 5
result = spam_classifier_pipeline(user_text, k=k_value)



--- Example 1: "I am actually thinking a way of doing something useful" ---

*** Classifying: "I am actually thinking a way of doing something useful"
*** Using top-3 nearest neighbors

*** Prediction: HAM | Score: 0.839318

*** Top neighbors:

1. Label: ham | Score: 0.842366
   Message: yeah, that's what I was thinking

2. Label: ham | Score: 0.841213
   Message: that would be good ‚Ä¶ I'll phone you tomo lunchtime, shall I, to organise something?

3. Label: ham | Score: 0.834373
   Message: See? I thought it all through

--- Example 2: "FREE!! Click here to win \$1000 NOW! Limited time offer!" ---

*** Classifying: "FREE!! Click here to win \$1000 NOW! Limited time offer!"
*** Using top-3 nearest neighbors

*** Prediction: SPAM | Score: 0.851784

*** Top neighbors:

1. Label: spam | Score: 0.856560
   Message: Win a ¬£1000 cash prize or a prize worth ¬£5000

2. Label: spam | Score: 0.849934
   Message: FREE entry into our ¬£250 weekly competition just text the word WIN to 80086 NOW.

  "FREE!! Click here to win \$1000 NOW! Limited time offer!"
  return forward_call(*args, **kwargs)
