In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

def _iter_jsonl(path):
    """Yield one parsed JSON object per non-blank line of the file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (typically a trailing newline at the end of the
            # file) is not a JSON document; json.loads would raise on it.
            if not line.strip():
                continue
            yield json.loads(line)


def load_dataset(responses_file, labels_file):
    """
    Load a dataset by merging responses and labels read from two JSONL files.

    Every non-blank line of ``responses_file`` must be a JSON object with the
    keys ``"id"`` and ``"response"``; every non-blank line of ``labels_file``
    must be a JSON object with the keys ``"id"`` and ``"label"``. Blank or
    whitespace-only lines are ignored in both files.

    Args:
        responses_file: Path of the JSONL file holding the response texts.
        labels_file: Path of the JSONL file holding the labels.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, ordered as
        the entries appear in ``labels_file``. Label entries whose id has no
        matching response are left out. If an id occurs several times in
        ``responses_file``, the last occurrence is used.

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    # Map id -> response text.
    # Optionally combine 'query' and 'response' here if needed.
    responses = {}
    for record in _iter_jsonl(responses_file):
        responses[record["id"]] = record["response"]

    # Walk the labels and keep only those that have a matching response.
    ids, texts, labels = [], [], []
    for record in _iter_jsonl(labels_file):
        instance_id = record["id"]
        if instance_id in responses:
            ids.append(instance_id)
            texts.append(responses[instance_id])
            labels.append(record["label"])

    return ids, texts, labels

# File paths: every input file lives in DATA_DIR, so only this one constant
# has to be updated when the data is stored somewhere else.
DATA_DIR = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection'

# One responses file and one labels file per split (train / validation / test).
train_responses_file = f'{DATA_DIR}/responses-train.jsonl'
train_labels_file = f'{DATA_DIR}/responses-train-labels.jsonl'
val_responses_file = f'{DATA_DIR}/responses-validation.jsonl'
val_labels_file = f'{DATA_DIR}/responses-validation-labels.jsonl'
test_responses_file = f'{DATA_DIR}/responses-test.jsonl'
test_labels_file = f'{DATA_DIR}/responses-test-labels.jsonl'

# Read the three splits; each call returns parallel lists of ids, response
# texts and labels for one split.
train_ids, train_texts, train_labels = load_dataset(
    responses_file=train_responses_file,
    labels_file=train_labels_file,
)
val_ids, val_texts, val_labels = load_dataset(
    responses_file=val_responses_file,
    labels_file=val_labels_file,
)
test_ids, test_texts, test_labels = load_dataset(
    responses_file=test_responses_file,
    labels_file=test_labels_file,
)

# -----------------------------------------------------
# Experiment: K-Nearest Neighbors (KNN) with TF-IDF
# Classifier: KNeighborsClassifier
# Feature Extraction: TF-IDF
# Goal: Evaluate performance of KNN on the TF-IDF representation of the responses.
# -----------------------------------------------------

# Step 1: TF-IDF features, configured with scikit-learn's English stop-word
# list and max_df=0.95.
tfidf_step = TfidfVectorizer(stop_words='english', max_df=0.95)

# Step 2: KNN classifier; n_neighbors=5 is a starting value that can be tuned.
knn_step = KNeighborsClassifier(n_neighbors=5)

# Chain both steps so raw texts can be passed directly to fit/predict.
pipeline = make_pipeline(tfidf_step, knn_step)

# -------------------------
# Cross-Validation on Training Set
# -------------------------
# Five shuffled, stratified folds; the fixed seed keeps the split reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    pipeline,
    train_texts,
    train_labels,
    scoring='f1_macro',
    cv=cv,
)
print("Cross-validation F1 Macro Scores on Training Set:", cv_scores)
print("Mean Cross-Validation F1 Macro Score:", cv_scores.mean())

# -------------------------
# Train the Model on the Full Training Set
# -------------------------
# Fit the pipeline once on all training data; this fitted pipeline is the one
# used for every prediction below.
pipeline.fit(train_texts, train_labels)

def _print_evaluation(heading, matrix_heading, true_labels, predicted_labels):
    """Print the classification report and confusion matrix of one split."""
    print(heading)
    print(classification_report(true_labels, predicted_labels))
    print(matrix_heading)
    print(confusion_matrix(true_labels, predicted_labels))


# -------------------------
# Evaluation on Training Set
# -------------------------
train_preds = pipeline.predict(train_texts)
_print_evaluation(
    "\nTraining Set Evaluation:",
    "Confusion Matrix (Training):",
    train_labels,
    train_preds,
)

# -------------------------
# Evaluation on Validation Set
# -------------------------
val_preds = pipeline.predict(val_texts)
_print_evaluation(
    "\nValidation Set Evaluation:",
    "Confusion Matrix (Validation):",
    val_labels,
    val_preds,
)

# -------------------------
# Evaluation on Test Set
# -------------------------
test_preds = pipeline.predict(test_texts)
_print_evaluation(
    "\nTest Set Evaluation:",
    "Confusion Matrix (Test):",
    test_labels,
    test_preds,
)

# -------------------------
# Submission File Generation
# -------------------------
# Write one JSON object per line (JSONL): the instance id, the predicted label
# and a fixed tag.
submission_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/knn_baseline.jsonl'
with open(submission_file, 'w', encoding='utf-8') as submission_out:
    for row_id, predicted_label in zip(test_ids, test_preds):
        # int() converts the prediction to a plain Python integer before it is
        # serialized (the labels are presumably 0/1 — confirm against the data).
        record = {"id": row_id, "label": int(predicted_label), "tag": "myGroupMyMethod"}
        serialized = json.dumps(record)
        submission_out.write(serialized + "\n")

print(f"\nSubmission file saved to: {submission_file}")


Cross-validation F1 Macro Scores on Training Set: [0.39977975 0.40022371 0.40820958 0.38911108 0.40877802]
Mean Cross-Validation F1 Macro Score: 0.4012204296342684

Training Set Evaluation:
              precision    recall  f1-score   support

           0       0.67      0.98      0.80      7541
           1       0.70      0.09      0.15      3946

    accuracy                           0.67     11487
   macro avg       0.69      0.53      0.48     11487
weighted avg       0.68      0.67      0.58     11487

Confusion Matrix (Training):
[[7395  146]
 [3604  342]]

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.64      0.98      0.77      2075
           1       0.43      0.03      0.06      1182

    accuracy                           0.63      3257
   macro avg       0.53      0.50      0.42      3257
weighted avg       0.56      0.63      0.51      3257

Confusion Matrix (Validation):
[[2024   51]
 [1144   38]]

Test Set Eval