In [2]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

def _read_jsonl(path):
    """Yield one parsed JSON object per non-blank line of the file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. an empty line at the end of the file) are not
            # records; json.loads would raise on them, so skip them.
            if line.strip():
                yield json.loads(line)


def load_dataset(responses_file, labels_file):
    """
    Load a dataset by merging responses and labels from two JSONL files.

    Args:
        responses_file: Path to a JSONL file; each record is read for its
            "id" and "response" keys.
        labels_file: Path to a JSONL file; each record is read for its
            "id" and "label" keys.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, in the order
        the records appear in ``labels_file``. Label records whose id has no
        matching response are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the keys read from it.
    """
    # Map id -> response text. If an id occurs more than once, the last
    # occurrence wins. ('query' could be combined with 'response' here if needed.)
    responses = {
        record["id"]: record["response"] for record in _read_jsonl(responses_file)
    }

    # Walk the labels and keep only those that have a matching response.
    ids, texts, labels = [], [], []
    for record in _read_jsonl(labels_file):
        instance_id = record["id"]
        if instance_id in responses:
            ids.append(instance_id)
            texts.append(responses[instance_id])
            labels.append(record["label"])

    return ids, texts, labels

# Directory that holds every dataset file and receives the submission file.
# Update this single constant when running on another machine.
DATA_DIR = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection'

train_responses_file = f'{DATA_DIR}/responses-train.jsonl'
train_labels_file = f'{DATA_DIR}/responses-train-labels.jsonl'
val_responses_file = f'{DATA_DIR}/responses-validation.jsonl'
val_labels_file = f'{DATA_DIR}/responses-validation-labels.jsonl'
test_responses_file = f'{DATA_DIR}/responses-test.jsonl'
test_labels_file = f'{DATA_DIR}/responses-test-labels.jsonl'

# Load datasets
train_ids, train_texts, train_labels = load_dataset(train_responses_file, train_labels_file)
val_ids, val_texts, val_labels = load_dataset(val_responses_file, val_labels_file)
test_ids, test_texts, test_labels = load_dataset(test_responses_file, test_labels_file)


def _report_split(split_name, y_true, y_pred):
    """Print the classification report and confusion matrix for one data split."""
    print(f"\n{split_name} Set Evaluation:")
    print(classification_report(y_true, y_pred))
    print(f"Confusion Matrix ({split_name}):")
    print(confusion_matrix(y_true, y_pred))


# -----------------------------------------------------
# Experiment: Naïve Bayes Classifier (NBC) with TF-IDF
# Classifier: Multinomial Naïve Bayes
# Feature Extraction: TF-IDF
# Goal: Evaluate performance of NBC on the TF-IDF representation of the responses.
# -----------------------------------------------------

# Build the pipeline: TF-IDF vectorizer + MultinomialNB
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', max_df=0.95),
    MultinomialNB()
)

# -------------------------
# Cross-Validation on Training Set
# -------------------------
# The whole pipeline is cross-validated, so the TF-IDF vocabulary is re-fit
# inside each fold rather than on the full training set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, train_texts, train_labels, cv=cv, scoring='f1_macro')
print("Cross-validation F1 Macro Scores on Training Set:", cv_scores)
print("Mean Cross-Validation F1 Macro Score:", np.mean(cv_scores))

# -------------------------
# Train the Model on the Full Training Set
# -------------------------
pipeline.fit(train_texts, train_labels)

# -------------------------
# Evaluation on Training, Validation and Test Sets
# -------------------------
train_preds = pipeline.predict(train_texts)
_report_split("Training", train_labels, train_preds)

val_preds = pipeline.predict(val_texts)
_report_split("Validation", val_labels, val_preds)

test_preds = pipeline.predict(test_texts)
_report_split("Test", test_labels, test_preds)

# -------------------------
# Submission File Generation
# -------------------------
# One JSON object per line: the instance id, the predicted label and a run tag.
submission_file = f'{DATA_DIR}/nbc_baseline.jsonl'
with open(submission_file, 'w', encoding='utf-8') as f_out:
    for instance_id, pred in zip(test_ids, test_preds):
        result = {
            "id": instance_id,
            "label": int(pred),  # plain int so json.dumps can serialize the prediction
            "tag": "myGroupMyMethod"
        }
        f_out.write(json.dumps(result) + "\n")

print(f"\nSubmission file saved to: {submission_file}")


Cross-validation F1 Macro Scores on Training Set: [0.42259069 0.41886949 0.42067236 0.41584924 0.41454145]
Mean Cross-Validation F1 Macro Score: 0.41850464756880407

Training Set Evaluation:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      7541
           1       0.98      0.07      0.13      3946

    accuracy                           0.68     11487
   macro avg       0.82      0.54      0.47     11487
weighted avg       0.78      0.68      0.57     11487

Confusion Matrix (Training):
[[7534    7]
 [3663  283]]

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.65      1.00      0.79      2075
           1       0.95      0.05      0.09      1182

    accuracy                           0.65      3257
   macro avg       0.80      0.52      0.44      3257
weighted avg       0.76      0.65      0.53      3257

Confusion Matrix (Validation):
[[2072    3]
 [1124   58]]

Test Set Eva

### Uses GridSearchCV to tune the smoothing parameter (alpha) of the Multinomial Naïve Bayes classifier in a TF-IDF pipeline

In [3]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

def _read_jsonl(path):
    """Yield one parsed JSON object per non-blank line of the file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. an empty line at the end of the file) are not
            # records; json.loads would raise on them, so skip them.
            if line.strip():
                yield json.loads(line)


def load_dataset(responses_file, labels_file):
    """
    Load a dataset by merging responses and labels from two JSONL files.

    Args:
        responses_file: Path to a JSONL file; each record is read for its
            "id" and "response" keys.
        labels_file: Path to a JSONL file; each record is read for its
            "id" and "label" keys.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, in the order
        the records appear in ``labels_file``. Label records whose id has no
        matching response are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the keys read from it.
    """
    # Map id -> response text. If an id occurs more than once, the last
    # occurrence wins. ('query' could be combined with 'response' here if needed.)
    responses = {
        record["id"]: record["response"] for record in _read_jsonl(responses_file)
    }

    # Walk the labels and keep only those that have a matching response.
    ids, texts, labels = [], [], []
    for record in _read_jsonl(labels_file):
        instance_id = record["id"]
        if instance_id in responses:
            ids.append(instance_id)
            texts.append(responses[instance_id])
            labels.append(record["label"])

    return ids, texts, labels

# Directory that holds every dataset file and receives the submission file.
# Update this single constant when running on another machine.
DATA_DIR = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection'

train_responses_file = f'{DATA_DIR}/responses-train.jsonl'
train_labels_file = f'{DATA_DIR}/responses-train-labels.jsonl'
val_responses_file = f'{DATA_DIR}/responses-validation.jsonl'
val_labels_file = f'{DATA_DIR}/responses-validation-labels.jsonl'
test_responses_file = f'{DATA_DIR}/responses-test.jsonl'
test_labels_file = f'{DATA_DIR}/responses-test-labels.jsonl'

# Load datasets
train_ids, train_texts, train_labels = load_dataset(train_responses_file, train_labels_file)
val_ids, val_texts, val_labels = load_dataset(val_responses_file, val_labels_file)
test_ids, test_texts, test_labels = load_dataset(test_responses_file, test_labels_file)


def _report_split(split_name, y_true, y_pred):
    """Print the classification report and confusion matrix for one data split."""
    print(f"\n{split_name} Set Evaluation:")
    print(classification_report(y_true, y_pred))
    print(f"Confusion Matrix ({split_name}):")
    print(confusion_matrix(y_true, y_pred))


# -----------------------------------------------------
# Experiment: GridSearch on Multinomial Naïve Bayes with TF-IDF
# Classifier: Multinomial Naïve Bayes
# Feature Extraction: TF-IDF
# Goal: Use GridSearchCV to experiment with different alpha values
#       (e.g., 0.001, 0.01, 0.1, 1.0, 10) to find the optimal smoothing parameter.
# -----------------------------------------------------

# Build the pipeline: TF-IDF vectorizer + MultinomialNB
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', max_df=0.95),
    MultinomialNB()
)

# Parameter grid for alpha. make_pipeline names each step after its lowercased
# class name, hence the 'multinomialnb__' prefix.
param_grid = {
    'multinomialnb__alpha': [0.001, 0.01, 0.1, 1.0, 10]
}

# Setup grid search with 5-fold stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid_search.fit(train_texts, train_labels)

# Print best alpha and corresponding score
print("Best alpha:", grid_search.best_params_['multinomialnb__alpha'])
print("Best CV F1 Macro Score:", grid_search.best_score_)

# Use the best model from grid search for evaluation
best_model = grid_search.best_estimator_

# -------------------------
# Evaluation on Training, Validation and Test Sets
# -------------------------
train_preds = best_model.predict(train_texts)
_report_split("Training", train_labels, train_preds)

val_preds = best_model.predict(val_texts)
_report_split("Validation", val_labels, val_preds)

test_preds = best_model.predict(test_texts)
_report_split("Test", test_labels, test_preds)

# -------------------------
# Submission File Generation
# -------------------------
# One JSON object per line: the instance id, the predicted label and a run tag.
submission_file = f'{DATA_DIR}/nb_gridsearch.jsonl'
with open(submission_file, 'w', encoding='utf-8') as f_out:
    for instance_id, pred in zip(test_ids, test_preds):
        result = {
            "id": instance_id,
            "label": int(pred),  # plain int so json.dumps can serialize the prediction
            "tag": "myGroupMyMethod"
        }
        f_out.write(json.dumps(result) + "\n")

print(f"\nSubmission file saved to: {submission_file}")


Best alpha: 0.1
Best CV F1 Macro Score: 0.5423436706182134

Training Set Evaluation:
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      7541
           1       0.87      0.61      0.72      3946

    accuracy                           0.84     11487
   macro avg       0.85      0.78      0.80     11487
weighted avg       0.84      0.84      0.83     11487

Confusion Matrix (Training):
[[7194  347]
 [1543 2403]]

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.74      0.97      0.84      2075
           1       0.87      0.40      0.55      1182

    accuracy                           0.76      3257
   macro avg       0.80      0.68      0.69      3257
weighted avg       0.79      0.76      0.73      3257

Confusion Matrix (Validation):
[[2005   70]
 [ 712  470]]

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.73      0.97      0.8

# Complement Naive Bayes

In [2]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

def _read_jsonl(path):
    """Yield one parsed JSON object per non-blank line of the file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. an empty line at the end of the file) are not
            # records; json.loads would raise on them, so skip them.
            if line.strip():
                yield json.loads(line)


def load_dataset(responses_file, labels_file):
    """
    Load a dataset by merging responses and labels from two JSONL files.

    Args:
        responses_file: Path to a JSONL file; each record is read for its
            "id" and "response" keys.
        labels_file: Path to a JSONL file; each record is read for its
            "id" and "label" keys.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, in the order
        the records appear in ``labels_file``. Label records whose id has no
        matching response are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the keys read from it.
    """
    # Map id -> response text. If an id occurs more than once, the last
    # occurrence wins. ('query' could be combined with 'response' here if needed.)
    responses = {
        record["id"]: record["response"] for record in _read_jsonl(responses_file)
    }

    # Walk the labels and keep only those that have a matching response.
    ids, texts, labels = [], [], []
    for record in _read_jsonl(labels_file):
        instance_id = record["id"]
        if instance_id in responses:
            ids.append(instance_id)
            texts.append(responses[instance_id])
            labels.append(record["label"])

    return ids, texts, labels

# Directory that holds every dataset file and receives the submission file.
# Update this single constant when running on another machine.
DATA_DIR = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection'

train_responses_file = f'{DATA_DIR}/responses-train.jsonl'
train_labels_file = f'{DATA_DIR}/responses-train-labels.jsonl'
val_responses_file = f'{DATA_DIR}/responses-validation.jsonl'
val_labels_file = f'{DATA_DIR}/responses-validation-labels.jsonl'
test_responses_file = f'{DATA_DIR}/responses-test.jsonl'
test_labels_file = f'{DATA_DIR}/responses-test-labels.jsonl'

# Load datasets
train_ids, train_texts, train_labels = load_dataset(train_responses_file, train_labels_file)
val_ids, val_texts, val_labels = load_dataset(val_responses_file, val_labels_file)
test_ids, test_texts, test_labels = load_dataset(test_responses_file, test_labels_file)


def _report_split(split_name, y_true, y_pred):
    """Print the classification report and confusion matrix for one data split."""
    print(f"\n{split_name} Set Evaluation:")
    print(classification_report(y_true, y_pred))
    print(f"Confusion Matrix ({split_name}):")
    print(confusion_matrix(y_true, y_pred))


# -----------------------------------------------------
# Experiment: Complement Naïve Bayes (ComplementNB) with TF-IDF
# Classifier: Complement Naïve Bayes
# Feature Extraction: TF-IDF
# Goal: Evaluate performance of ComplementNB, which is often more robust for imbalanced data.
# -----------------------------------------------------

# Build the pipeline: TF-IDF vectorizer + ComplementNB
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', max_df=0.95),
    ComplementNB(alpha=1.0)
)

# -------------------------
# Cross-Validation on Training Set
# -------------------------
# The whole pipeline is cross-validated, so the TF-IDF vocabulary is re-fit
# inside each fold rather than on the full training set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, train_texts, train_labels, cv=cv, scoring='f1_macro')
print("Cross-validation F1 Macro Scores on Training Set:", cv_scores)
print("Mean Cross-Validation F1 Macro Score:", np.mean(cv_scores))

# -------------------------
# Train the Model on the Full Training Set
# -------------------------
pipeline.fit(train_texts, train_labels)

# -------------------------
# Evaluation on Training, Validation and Test Sets
# -------------------------
train_preds = pipeline.predict(train_texts)
_report_split("Training", train_labels, train_preds)

val_preds = pipeline.predict(val_texts)
_report_split("Validation", val_labels, val_preds)

test_preds = pipeline.predict(test_texts)
_report_split("Test", test_labels, test_preds)

# -------------------------
# Submission File Generation
# -------------------------
# One JSON object per line: the instance id, the predicted label and a run tag.
submission_file = f'{DATA_DIR}/comp_nb_baseline.jsonl'
with open(submission_file, 'w', encoding='utf-8') as f_out:
    for instance_id, pred in zip(test_ids, test_preds):
        result = {
            "id": instance_id,
            "label": int(pred),  # plain int so json.dumps can serialize the prediction
            "tag": "myGroupMyMethod"
        }
        f_out.write(json.dumps(result) + "\n")

print(f"\nSubmission file saved to: {submission_file}")


Cross-validation F1 Macro Scores on Training Set: [0.48632078 0.50805709 0.49582183 0.50031821 0.49875823]
Mean Cross-Validation F1 Macro Score: 0.4978552285260798

Training Set Evaluation:
              precision    recall  f1-score   support

           0       0.74      0.97      0.84      7541
           1       0.86      0.34      0.49      3946

    accuracy                           0.75     11487
   macro avg       0.80      0.65      0.66     11487
weighted avg       0.78      0.75      0.72     11487

Confusion Matrix (Training):
[[7314  227]
 [2604 1342]]

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.70      0.97      0.82      2075
           1       0.85      0.28      0.42      1182

    accuracy                           0.72      3257
   macro avg       0.78      0.63      0.62      3257
weighted avg       0.76      0.72      0.67      3257

Confusion Matrix (Validation):
[[2015   60]
 [ 850  332]]

Test Set Eval