# Baseline for Native Advertisement Detection Using TF-IDF and Logistic Regression

1. Data Loading: Merges responses and labels from JSONL files.
2. Feature Extraction: Transforms text into TF-IDF features (with stop-word removal and max_df filtering).
3. Classification: Uses Logistic Regression (max_iter=1000, random_state=42) for binary classification.
4. Evaluation: Applies stratified 5-fold cross-validation (macro F1) on the training set, then evaluates the fitted model on the training, validation, and test sets.
5. Submission: Generates predictions in JSONL format.

In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

def load_dataset(responses_file, labels_file):
    """
    Load a dataset by joining a responses JSONL file with a labels JSONL file.

    Every non-blank line of ``responses_file`` is parsed as a JSON object and
    its "response" value is stored under its "id" (a repeated id keeps the
    last response seen). Every non-blank line of ``labels_file`` is then
    parsed; an entry contributes one instance only when its "id" was present
    in the responses file, otherwise it is dropped.

    Args:
        responses_file: Path to a JSONL file whose objects have "id" and
            "response" keys.
        labels_file: Path to a JSONL file whose objects have "id" and
            "label" keys.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, in the
        order the matching entries appear in ``labels_file``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object lacks one of the required keys.
    """
    # Load responses into a dictionary mapping id -> response text
    responses = {}
    with open(responses_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not valid JSON
            # and would make json.loads raise; skip them.
            if not line.strip():
                continue
            data = json.loads(line)
            # Here you can combine 'query' and 'response' if needed.
            responses[data["id"]] = data["response"]

    # Load labels and merge with responses
    ids, texts, labels = [], [], []
    with open(labels_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            instance_id = data["id"]
            # Labels without a matching response are ignored.
            if instance_id in responses:
                ids.append(instance_id)
                texts.append(responses[instance_id])
                labels.append(data["label"])

    return ids, texts, labels

# Dataset locations (adjust to wherever the JSONL files live locally)
train_responses_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-train.jsonl'
train_labels_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-train-labels.jsonl'
val_responses_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-validation.jsonl'
val_labels_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-validation-labels.jsonl'
test_responses_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-test.jsonl'
test_labels_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-test-labels.jsonl'

# Read the three splits; each call yields parallel lists of ids, texts and labels
train_ids, train_texts, train_labels = load_dataset(train_responses_file, train_labels_file)
val_ids, val_texts, val_labels = load_dataset(val_responses_file, val_labels_file)
test_ids, test_texts, test_labels = load_dataset(test_responses_file, test_labels_file)

# Model: TF-IDF features (English stop words removed, terms in more than 95%
# of documents dropped) feeding a logistic-regression classifier
tfidf = TfidfVectorizer(stop_words='english', max_df=0.95)
log_reg = LogisticRegression(max_iter=1000, random_state=42)
pipeline = make_pipeline(tfidf, log_reg)

# ---- Stratified 5-fold cross-validation on the training split (macro F1) ----
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    pipeline,
    train_texts,
    train_labels,
    cv=cv,
    scoring='f1_macro',
)
print("Cross-validation F1 Macro Scores on Training Set:", cv_scores)
print("Mean Cross-Validation F1 Macro Score:", np.mean(cv_scores))

# ---- Fit on the whole training split ----
pipeline.fit(train_texts, train_labels)


def _print_evaluation(header, matrix_header, y_true, y_pred):
    """Print the classification report and the confusion matrix for one split."""
    print(header)
    print(classification_report(y_true, y_pred))
    print(matrix_header)
    print(confusion_matrix(y_true, y_pred))


# ---- Per-split evaluation: training, then validation, then test ----
train_preds = pipeline.predict(train_texts)
_print_evaluation("\nTraining Set Evaluation:", "Confusion Matrix (Training):",
                  train_labels, train_preds)

val_preds = pipeline.predict(val_texts)
_print_evaluation("\nValidation Set Evaluation:", "Confusion Matrix (Validation):",
                  val_labels, val_preds)

test_preds = pipeline.predict(test_texts)
_print_evaluation("\nTest Set Evaluation:", "Confusion Matrix (Test):",
                  test_labels, test_preds)

# ---- Submission file: one JSON object per test instance ----
submission_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/baseline.jsonl'
submission_lines = (
    # int() turns the predicted label into a plain integer (0 or 1) for JSON
    json.dumps({"id": test_id, "label": int(test_pred), "tag": "myGroupMyMethod"}) + "\n"
    for test_id, test_pred in zip(test_ids, test_preds)
)
with open(submission_file, 'w', encoding='utf-8') as f_out:
    f_out.writelines(submission_lines)

print(f"\nSubmission file saved to: {submission_file}")


Cross-validation F1 Macro Scores on Training Set: [0.67887449 0.6722082  0.69065799 0.66471172 0.67489588]
Mean Cross-Validation F1 Macro Score: 0.6762696572899445

Training Set Evaluation:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      7541
           1       0.94      0.57      0.71      3946

    accuracy                           0.84     11487
   macro avg       0.88      0.78      0.80     11487
weighted avg       0.86      0.84      0.83     11487

Confusion Matrix (Training):
[[7408  133]
 [1688 2258]]

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.73      0.97      0.83      2075
           1       0.87      0.37      0.52      1182

    accuracy                           0.75      3257
   macro avg       0.80      0.67      0.68      3257
weighted avg       0.78      0.75      0.72      3257

Confusion Matrix (Validation):
[[2008   67]
 [ 743  439]]

Test Set Eval

# Fixed Version: trains on the combined training + validation data and reports F1 for the ad class (label 1)

In [2]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

def load_dataset(responses_file, labels_file):
    """
    Load a dataset by joining a responses JSONL file with a labels JSONL file.

    Every non-blank line of ``responses_file`` is parsed as a JSON object and
    its "response" value is stored under its "id" (a repeated id keeps the
    last response seen). Every non-blank line of ``labels_file`` is then
    parsed; an entry contributes one instance only when its "id" was present
    in the responses file, otherwise it is dropped.

    Args:
        responses_file: Path to a JSONL file whose objects have "id" and
            "response" keys.
        labels_file: Path to a JSONL file whose objects have "id" and
            "label" keys.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, in the
        order the matching entries appear in ``labels_file``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object lacks one of the required keys.
    """
    # Load responses into a dictionary mapping id -> response text
    responses = {}
    with open(responses_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not valid JSON
            # and would make json.loads raise; skip them.
            if not line.strip():
                continue
            data = json.loads(line)
            responses[data["id"]] = data["response"]

    # Load labels and merge with responses
    ids, texts, labels = [], [], []
    with open(labels_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            instance_id = data["id"]
            # Labels without a matching response are ignored.
            if instance_id in responses:
                ids.append(instance_id)
                texts.append(responses[instance_id])
                labels.append(data["label"])

    return ids, texts, labels

# Input locations for the three dataset splits (adjust to your machine)
train_responses_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-train.jsonl'
train_labels_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-train-labels.jsonl'
val_responses_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-validation.jsonl'
val_labels_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-validation-labels.jsonl'
test_responses_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-test.jsonl'
test_labels_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-test-labels.jsonl'

# Read the training and validation splits
train_ids, train_texts, train_labels = load_dataset(train_responses_file, train_labels_file)
val_ids, val_texts, val_labels = load_dataset(val_responses_file, val_labels_file)

# Pool both splits into one training set (validation instances follow the training ones)
combined_ids = [*train_ids, *val_ids]
combined_texts = [*train_texts, *val_texts]
combined_labels = [*train_labels, *val_labels]

# Read the test split
test_ids, test_texts, test_labels = load_dataset(test_responses_file, test_labels_file)

# -----------------------------------------------------
# Experiment: TF-IDF features + Logistic Regression for advertisement detection
# Label 1 is treated as the advertisement class, and the scores reported
# below focus on that class.
# -----------------------------------------------------

# Assemble the two-step pipeline from explicitly named components
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95)
classifier = LogisticRegression(max_iter=1000, random_state=42)
pipeline = make_pipeline(vectorizer, classifier)

# ---- Stratified 5-fold cross-validation on the pooled training data ----
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# scoring='f1' reports the F1 score of the positive class (label 1)
cv_scores = cross_val_score(
    pipeline,
    combined_texts,
    combined_labels,
    cv=cv,
    scoring='f1',
)
print("Cross-validation F1 Score (for ad detection, label=1) on Combined Training Set:", cv_scores)
print("Mean Cross-Validation F1 Score:", np.mean(cv_scores))

# ---- Fit on all pooled training data ----
pipeline.fit(combined_texts, combined_labels)

# ---- Evaluation on the pooled training data (the data the model was fit on) ----
train_preds = pipeline.predict(combined_texts)
print("\nCombined Training Set Evaluation:")
combined_report = classification_report(combined_labels, train_preds)
print(combined_report)
print("Confusion Matrix (Combined Training):")
combined_matrix = confusion_matrix(combined_labels, train_preds)
print(combined_matrix)
# F1 restricted to the ad class
combined_f1 = f1_score(combined_labels, train_preds, pos_label=1)
print("F1 Score for ads (label 1):", combined_f1)

# ---- Evaluation on the test split ----
test_preds = pipeline.predict(test_texts)
print("\nTest Set Evaluation:")
test_report = classification_report(test_labels, test_preds)
print(test_report)
print("Confusion Matrix (Test):")
test_matrix = confusion_matrix(test_labels, test_preds)
print(test_matrix)
test_f1 = f1_score(test_labels, test_preds, pos_label=1)
print("F1 Score for ads (label 1):", test_f1)

# ---- Submission file: one JSON line per test instance ----
submission_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/baseline2.jsonl'
with open(submission_file, 'w', encoding='utf-8') as f_out:
    for row_id, row_pred in zip(test_ids, test_preds):
        # int() turns the predicted label into a plain integer (0 or 1) for JSON
        row = {"id": row_id, "label": int(row_pred), "tag": "myGroupMyMethod"}
        f_out.write(json.dumps(row) + "\n")

print(f"\nSubmission file saved to: {submission_file}")


Cross-validation F1 Score (for ad detection, label=1) on Combined Training Set: [0.57534247 0.57050453 0.56031128 0.57591623 0.57217505]
Mean Cross-Validation F1 Score: 0.5708499113935824

Combined Training Set Evaluation:
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      9616
           1       0.94      0.62      0.75      5128

    accuracy                           0.85     14744
   macro avg       0.89      0.80      0.82     14744
weighted avg       0.87      0.85      0.85     14744

Confusion Matrix (Combined Training):
[[9424  192]
 [1948 3180]]
F1 Score for ads (label 1): 0.7482352941176471

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.77      0.97      0.86      1687
           1       0.89      0.45      0.60       913

    accuracy                           0.79      2600
   macro avg       0.83      0.71      0.73      2600
weighted avg       0.81      0.79      0.77 

In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

def load_dataset(responses_file, labels_file):
    """
    Load a dataset by joining a responses JSONL file with a labels JSONL file.

    Every non-blank line of ``responses_file`` is parsed as a JSON object and
    its "response" value is stored under its "id" (a repeated id keeps the
    last response seen). Every non-blank line of ``labels_file`` is then
    parsed; an entry contributes one instance only when its "id" was present
    in the responses file, otherwise it is dropped.

    Args:
        responses_file: Path to a JSONL file whose objects have "id" and
            "response" keys.
        labels_file: Path to a JSONL file whose objects have "id" and
            "label" keys.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, in the
        order the matching entries appear in ``labels_file``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object lacks one of the required keys.
    """
    # Load responses into a dictionary mapping id -> response text
    responses = {}
    with open(responses_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not valid JSON
            # and would make json.loads raise; skip them.
            if not line.strip():
                continue
            data = json.loads(line)
            responses[data["id"]] = data["response"]

    # Load labels and merge with responses
    ids, texts, labels = [], [], []
    with open(labels_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            instance_id = data["id"]
            # Labels without a matching response are ignored.
            if instance_id in responses:
                ids.append(instance_id)
                texts.append(responses[instance_id])
                labels.append(data["label"])

    return ids, texts, labels

# File paths (update these paths as needed)
train_responses_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-train.jsonl'
train_labels_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-train-labels.jsonl'
val_responses_file   = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-validation.jsonl'
val_labels_file      = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-validation-labels.jsonl'
test_responses_file  = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-test.jsonl'
test_labels_file     = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/responses-test-labels.jsonl'

# Load train and validation datasets separately
train_ids, train_texts, train_labels = load_dataset(train_responses_file, train_labels_file)
val_ids, val_texts, val_labels = load_dataset(val_responses_file, val_labels_file)

# Combine train and validation sets into one training set
# (validation instances are appended after the training instances)
combined_ids = train_ids + val_ids
combined_texts = train_texts + val_texts
combined_labels = train_labels + val_labels

# Load test dataset
test_ids, test_texts, test_labels = load_dataset(test_responses_file, test_labels_file)

# -----------------------------------------------------
# Experiment: Logistic Regression with TF-IDF for Advertisement Detection
# Classifier: Logistic Regression (predicts label 1 as advertisement)
# Feature Extraction: TF-IDF
# Goal: Evaluate performance specifically for detecting advertisements (label 1)
# -----------------------------------------------------

# Build the pipeline: TF-IDF vectorizer + Logistic Regression
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', max_df=0.95),
    LogisticRegression(max_iter=1000, random_state=42)
)

# -------------------------
# Cross-Validation on Combined Training Set
# -------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Use scoring 'f1' which, for binary classification, computes the F1 score for the positive class (label 1)
cv_scores = cross_val_score(pipeline, combined_texts, combined_labels, cv=cv, scoring='f1')
print("Cross-validation F1 Score (for ad detection, label=1) on Combined Training Set:", cv_scores)
print("Mean Cross-validation F1 Score:", np.mean(cv_scores))

# -------------------------
# Train the Model on the Full Combined Training Set
# -------------------------
pipeline.fit(combined_texts, combined_labels)

# -------------------------
# Evaluation on Combined Training Set
# -------------------------
# These are predictions on the data the model was just fit on.
train_preds = pipeline.predict(combined_texts)
print("\nCombined Training Set Evaluation:")
print(classification_report(combined_labels, train_preds))
print("Confusion Matrix (Combined Training):")
print(confusion_matrix(combined_labels, train_preds))
# Calculate F1 Score for label 1 explicitly:
print("F1 Score for ads (label 1):", f1_score(combined_labels, train_preds, pos_label=1))

# -------------------------
# Evaluation on Test Set
# -------------------------
test_preds = pipeline.predict(test_texts)
print("\nTest Set Evaluation:")
print(classification_report(test_labels, test_preds))
print("Confusion Matrix (Test):")
print(confusion_matrix(test_labels, test_preds))

# F1 score for detecting ads (label 1) on the test set. It is computed once
# and reused for both report lines, which print the same metric.
f1_ads = f1_score(test_labels, test_preds, pos_label=1)
print("F1 Score for ads (label 1):", f1_ads)
print("F1-score for detecting ads:", f1_ads)

# -------------------------
# Submission File Generation
# -------------------------
import os  # used only to create the submission directory below

submission_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Submission/baseline2.jsonl'
# The submission folder is not the folder the data was read from, so it may
# not exist yet. Create it so the run cannot fail at the very last step.
os.makedirs(os.path.dirname(submission_file), exist_ok=True)
with open(submission_file, 'w', encoding='utf-8') as f_out:
    for instance_id, pred in zip(test_ids, test_preds):
        result = {
            "id": instance_id,
            "label": int(pred),  # ensuring it's an integer (0 or 1)
            "tag": "myGroupMyMethod"
        }
        f_out.write(json.dumps(result) + "\n")

print(f"\nSubmission file saved to: {submission_file}")


Cross-validation F1 Score (for ad detection, label=1) on Combined Training Set: [0.57534247 0.57050453 0.56031128 0.57591623 0.57217505]
Mean Cross-validation F1 Score: 0.5708499113935824

Combined Training Set Evaluation:
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      9616
           1       0.94      0.62      0.75      5128

    accuracy                           0.85     14744
   macro avg       0.89      0.80      0.82     14744
weighted avg       0.87      0.85      0.85     14744

Confusion Matrix (Combined Training):
[[9424  192]
 [1948 3180]]
F1 Score for ads (label 1): 0.7482352941176471

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.77      0.97      0.86      1687
           1       0.89      0.45      0.60       913

    accuracy                           0.79      2600
   macro avg       0.83      0.71      0.73      2600
weighted avg       0.81      0.79      0.77 