In [2]:
pip install numpy


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install numpy


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

# -------------------------------
# 1. Data Loading Functions
# -------------------------------
def load_dataset(responses_file, labels_file):
    """
    Load dataset by reading responses and labels from JSONL files and merging them.
    """
    # Load responses into a dictionary mapping id -> response text
    responses = {}
    with open(responses_file, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            responses[data["id"]] = data["response"]
    
    # Load labels and merge with responses
    ids, texts, labels = [], [], []
    with open(labels_file, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            instance_id = data["id"]
            if instance_id in responses:
                ids.append(instance_id)
                texts.append(responses[instance_id])
                labels.append(data["label"])
    
    return ids, texts, labels

# -------------------------------
# 2. PyTorch Dataset
# -------------------------------
class AdDataset(Dataset):
    def __init__(self, ids, texts, labels):
        self.ids = ids
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.ids[idx], self.texts[idx], self.labels[idx]

# -------------------------------
# 3. The Classifier Model
# -------------------------------
class AdClassifier(nn.Module):
    def __init__(self, model_name, embedding_dim=384):
        super(AdClassifier, self).__init__()
        # Load pre-trained sentence transformer
        self.sbert = SentenceTransformer(model_name)
        # Linear layer for binary classification (outputs one logit)
        self.linear = nn.Linear(embedding_dim, 1)
    
    def forward(self, texts):
        # Encode texts into embeddings (batch_size x embedding_dim)
        embeddings = self.sbert.encode(texts, convert_to_tensor=True)
        logits = self.linear(embeddings)
        return logits

# -------------------------------
# 4. Training and Prediction Functions
# -------------------------------
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    for _, texts, labels in dataloader:
        optimizer.zero_grad()
        logits = model(texts)
        # Convert labels to tensor and reshape to (batch_size, 1)
        labels_tensor = torch.tensor(labels, dtype=torch.float, device=device).unsqueeze(1)
        loss = criterion(logits, labels_tensor)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def predict_model(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for _, texts, labels in dataloader:
            logits = model(texts)
            probs = torch.sigmoid(logits)
            # Threshold at 0.5
            preds = (probs > 0.5).long().squeeze(1).cpu().numpy().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels)
    return all_preds, all_labels

# -------------------------------
# 5. Main Script: Training, Evaluation, and Submission
# -------------------------------
def main():
    # Update file paths as needed
    train_responses_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Dataset/responses-train.jsonl'
    train_labels_file    = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Dataset/responses-train-labels.jsonl'
    val_responses_file   = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Dataset/responses-validation.jsonl'
    val_labels_file      = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Dataset/responses-validation-labels.jsonl'
    test_responses_file  = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Dataset/responses-test.jsonl'
    test_labels_file     = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Dataset/responses-test-labels.jsonl'
    
    # Load train and validation datasets separately
    train_ids, train_texts, train_labels = load_dataset(train_responses_file, train_labels_file)
    val_ids, val_texts, val_labels = load_dataset(val_responses_file, val_labels_file)
    
    # Combine train and validation sets into one training set
    combined_ids = train_ids + val_ids
    combined_texts = train_texts + val_texts
    combined_labels = train_labels + val_labels
    
    # Load test dataset
    test_ids, test_texts, test_labels = load_dataset(test_responses_file, test_labels_file)
    
    # Create PyTorch datasets and dataloaders
    batch_size = 16
    train_dataset = AdDataset(combined_ids, combined_texts, combined_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    test_dataset = AdDataset(test_ids, test_texts, test_labels)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Initialize the classifier using "all-MiniLM-L6-v2" (embedding dim=384)
    model = AdClassifier('all-MiniLM-L6-v2', embedding_dim=384)
    model.to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    criterion = nn.BCEWithLogitsLoss()
    
    num_epochs = 2
    for epoch in range(num_epochs):
        loss = train_model(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}")
    
    # -------------------------
    # Evaluation on Combined Training Set
    # -------------------------
    train_preds, train_true = predict_model(model, train_loader, device)
    print("\nCombined Training Set Evaluation:")
    print(classification_report(train_true, train_preds))
    train_cm = confusion_matrix(train_true, train_preds)
    print("Confusion Matrix (Combined Training):")
    print(train_cm)
    print("F1 Score for ads (label 1):", f1_score(train_true, train_preds, pos_label=1))
    
    # Save training evaluation to CSV files
    df_train_report = pd.DataFrame(classification_report(train_true, train_preds, output_dict=True)).transpose()
    df_train_report.to_csv('/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/results copy/SentenceTransformer-baseline/train_classification_report.csv', index=True)
    df_train_cm = pd.DataFrame(train_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
    df_train_cm.to_csv('/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/results copy/SentenceTransformer-baseline/train_confusion_matrix.csv', index=True)
    
    # -------------------------
    # Evaluation on Test Set
    # -------------------------
    test_preds, test_true = predict_model(model, test_loader, device)
    print("\nTest Set Evaluation:")
    print(classification_report(test_true, test_preds))
    test_cm = confusion_matrix(test_true, test_preds)
    print("Confusion Matrix (Test):")
    print(test_cm)
    print("F1 Score for ads (label 1):", f1_score(test_true, test_preds, pos_label=1))
    
    # Save test evaluation to CSV files
    df_test_report = pd.DataFrame(classification_report(test_true, test_preds, output_dict=True)).transpose()
    df_test_report.to_csv('/path/to/save/test_classification_report.csv', index=True)
    df_test_cm = pd.DataFrame(test_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
    df_test_cm.to_csv('/path/to/save/test_confusion_matrix.csv', index=True)
    
    # Additional metrics based on confusion matrix
    TN, FP, FN, TP = test_cm.ravel()
    detection_accuracy = TP / (TP + FN) if (TP + FN) > 0 else 0
    false_negative_rate = FN / (TP + FN) if (TP + FN) > 0 else 0
    false_positive_rate = FP / (FP + TN) if (FP + TN) > 0 else 0
    print("Detection Accuracy for ads (label 1):", detection_accuracy)
    print("False Negative Rate for ads (label 1):", false_negative_rate)
    print("False Positive Rate for ads (label 1):", false_positive_rate)
    
    # Metrics for non-ads (label 0)
    detection_accuracy_non_ads = TN / (TN + FP) if (TN + FP) > 0 else 0
    false_negative_rate_non_ads = FP / (TN + FP) if (TN + FP) > 0 else 0
    false_positive_rate_non_ads = FN / (FN + TP) if (FN + TP) > 0 else 0
    print("Detection Accuracy for non-ads (label 0):", detection_accuracy_non_ads)
    print("False Negative Rate for non-ads (label 0):", false_negative_rate_non_ads)
    print("False Positive Rate for non-ads (label 0):", false_positive_rate_non_ads)
    print("F1-score for non-ads (label 0):", f1_score(test_true, test_preds, pos_label=0))
    
    # -------------------------
    # Submission File Generation
    # -------------------------
    submission_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Submission/sentenceTransformer-baseline.jsonl'
    with open(submission_file, 'w', encoding='utf-8') as f_out:
        for instance_id, pred in zip(test_ids, test_preds):
            result = {
                "id": instance_id,
                "label": int(pred),
                "tag": "myGroupMyMethod"
            }
            f_out.write(json.dumps(result) + "\n")
            
    print(f"\nSubmission file saved to: {submission_file}")

if __name__ == "__main__":
    main()


  labels_tensor = torch.tensor(labels, dtype=torch.float, device=device).unsqueeze(1)


KeyboardInterrupt: 