# Baseline for Native Advertisement Detection Using TF-IDF and Logistic Regression

1. Data Loading: Merges responses and labels from JSONL files.
2. Feature Extraction: Transforms text into TF-IDF features (with stop-word removal and max_df filtering).
3. Classification: Uses Logistic Regression (class_weight='balanced', max_iter=1000, random_state=42) for binary classification.
4. Evaluation: Applies stratified 5-fold cross-validation on the combined training and validation data, then evaluates the fitted model on that combined training set and on the test set.
5. Submission: Generates predictions in JSONL format.

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

def load_dataset(responses_file, labels_file):
    """
    Load a dataset by joining a responses JSONL file with a labels JSONL file.

    Every non-blank line of ``responses_file`` is parsed as a JSON object and
    its "response" value is stored under its "id" (a repeated id keeps the
    last response read). Every non-blank line of ``labels_file`` is parsed the
    same way; a label whose "id" has no matching response is skipped.

    Args:
        responses_file: Path to a JSONL file whose objects have "id" and
            "response" keys.
        labels_file: Path to a JSONL file whose objects have "id" and
            "label" keys.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, in the
        order the matching entries appear in ``labels_file``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object lacks a key that is read from it.
    """
    # Map id -> response text. Blank lines (for example a stray empty line at
    # the end of the file) are skipped because json.loads rejects them.
    responses = {}
    with open(responses_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            responses[data["id"]] = data["response"]

    # Walk the labels and keep only the ids that also have a response, so the
    # three output lists stay aligned index by index.
    ids, texts, labels = [], [], []
    with open(labels_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            instance_id = data["id"]
            if instance_id in responses:
                ids.append(instance_id)
                texts.append(responses[instance_id])
                labels.append(data["label"])

    return ids, texts, labels

# Input locations for the three splits (adjust to the local checkout).
# Each split has a responses file and a matching labels file.
train_responses_file = '/Users/tarekbouhairi/Desktop/my_submission/Dataset/responses-train.jsonl'
train_labels_file = '/Users/tarekbouhairi/Desktop/my_submission/Dataset/responses-train-labels.jsonl'
val_responses_file = '/Users/tarekbouhairi/Desktop/my_submission/Dataset/responses-validation.jsonl'
val_labels_file = '/Users/tarekbouhairi/Desktop/my_submission/Dataset/responses-validation-labels.jsonl'
test_responses_file = '/Users/tarekbouhairi/Desktop/my_submission/Dataset/responses-test.jsonl'
test_labels_file = '/Users/tarekbouhairi/Desktop/my_submission/Dataset/responses-test-labels.jsonl'

# Read every split on its own; each call yields parallel id/text/label lists.
train_ids, train_texts, train_labels = load_dataset(
    train_responses_file, train_labels_file
)
val_ids, val_texts, val_labels = load_dataset(
    val_responses_file, val_labels_file
)
test_ids, test_texts, test_labels = load_dataset(
    test_responses_file, test_labels_file
)

# Train and validation are merged (train entries first) into the single
# training set used for cross-validation and for the final fit.
combined_ids = [*train_ids, *val_ids]
combined_texts = [*train_texts, *val_texts]
combined_labels = [*train_labels, *val_labels]

# -----------------------------------------------------
# Experiment: TF-IDF features fed into Logistic Regression
# Target: label 1 marks a response that contains an advertisement
# Goal: measure how well advertisements (label 1) are detected
# -----------------------------------------------------

# Text -> TF-IDF vectors, with English stop words removed and max_df=0.95.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, stop_words='english')

# Binary classifier; class_weight='balanced' is set and the seed is fixed.
balanced_log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# Chain both steps so raw strings can be passed to fit/predict directly.
pipeline = make_pipeline(tfidf_vectorizer, balanced_log_reg)

# -------------------------
# 5-fold stratified cross-validation on the combined training set
# -------------------------
# Folds are shuffled with a fixed seed. With scoring='f1' on a binary
# problem the reported score is the F1 of the positive class (label 1).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    pipeline,
    combined_texts,
    combined_labels,
    scoring='f1',
    cv=cv,
)
print("Cross-validation F1 Score (for ad detection, label=1) on Combined Training Set:", cv_scores)
print("Mean Cross-validation F1 Score:", cv_scores.mean())

# -------------------------
# Final fit on the full combined training set
# -------------------------
pipeline.fit(combined_texts, combined_labels)

# -------------------------
# Evaluation on the combined training set (the data the model was fit on)
# -------------------------
train_preds = pipeline.predict(combined_texts)
train_cm = confusion_matrix(combined_labels, train_preds)
train_report_dict = classification_report(combined_labels, train_preds, output_dict=True)

# Persist the per-class report, one row per class/average.
df_train_report = pd.DataFrame(train_report_dict).T
df_train_report.to_csv('/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/csv-results/baseline/train_classification_report.csv', index=True)

# Persist the confusion matrix with readable row/column headers.
df_train_cm = pd.DataFrame(
    train_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
df_train_cm.to_csv('/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/csv-results/baseline/train_confusion_matrix.csv', index=True)

# Echo the same results to stdout.
print("\nCombined Training Set Evaluation:")
print(classification_report(combined_labels, train_preds))
print("Confusion Matrix (Combined Training):")
print(train_cm)
# F1 restricted to the advertisement class (label 1).
print("F1 Score for ads (label 1):", f1_score(combined_labels, train_preds, pos_label=1))

# -------------------------
# Evaluation on Test Set
# -------------------------
test_preds = pipeline.predict(test_texts)
test_report_dict = classification_report(test_labels, test_preds, output_dict=True)
# Pin the label order so the matrix is always 2x2 (rows = actual 0/1,
# columns = predicted 0/1), even if only one class shows up in the test
# labels and predictions. Without this the labelled DataFrame below and any
# [row, col] indexing of the matrix would fail on such a split.
test_cm = confusion_matrix(test_labels, test_preds, labels=[0, 1])

# Save test evaluation to CSV
df_test_report = pd.DataFrame(test_report_dict).transpose()
df_test_report.to_csv('/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/csv-results/baseline/test_classification_report.csv', index=True)

df_test_cm = pd.DataFrame(test_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_test_cm.to_csv('/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/csv-results/baseline/test_confusion_matrix.csv', index=True)


print("\nTest Set Evaluation:")
print(classification_report(test_labels, test_preds))
# Reuse the matrix computed above instead of recomputing it; the name `cm`
# is kept because the per-class metric calculations further down read it.
cm = test_cm
print("Confusion Matrix (Test):")
print(cm)
print("F1 Score for ads (label 1):", f1_score(test_labels, test_preds, pos_label=1))

# Calculate additional metrics for label 1 based on the confusion matrix.
# Row index = actual label, column index = predicted label.
TN, FP, FN, TP = cm[0,0], cm[0,1], cm[1,0], cm[1,1]
# Each rate falls back to 0 when its denominator is empty (no samples of the
# relevant actual class), avoiding a division by zero.
detection_accuracy = TP / (TP + FN) if (TP + FN) > 0 else 0  # Recall for ads
false_negative_rate = FN / (TP + FN) if (TP + FN) > 0 else 0
false_positive_rate = FP / (FP + TN) if (FP + TN) > 0 else 0

print("Detection Accuracy for ads (label 1):", detection_accuracy)
print("False Negative Rate for ads (label 1):", false_negative_rate)
print("False Positive Rate for ads (label 1):", false_positive_rate)

# F1 score for detecting ads (label 1 as the positive class).
# Here, y_true is test_labels and y_pred is test_preds
f1_ads = f1_score(test_labels, test_preds, pos_label=1)
print("F1-score for detecting ads:", f1_ads)

# Calculate additional metrics for label 0 (non-ads) by treating label 0 as the positive class
# For label 0:
#   True Positives (TP_0) = TN (non-ads correctly predicted as non-ads)
#   False Negatives (FN_0) = FP (non-ads predicted as ads)
#   False Positives (FP_0) = FN (ads predicted as non-ads)
#   True Negatives (TN_0) = TP (ads correctly predicted as ads)
detection_accuracy_non_ads = TN / (TN + FP) if (TN + FP) > 0 else 0  # Recall for non-ads
false_negative_rate_non_ads = FP / (TN + FP) if (TN + FP) > 0 else 0
false_positive_rate_non_ads = FN / (FN + TP) if (FN + TP) > 0 else 0

print("Detection Accuracy for non-ads (label 0):", detection_accuracy_non_ads)
print("False Negative Rate for non-ads (label 0):", false_negative_rate_non_ads)
print("False Positive Rate for non-ads (label 0):", false_positive_rate_non_ads)
# F1 score for non-ads (label 0 as the positive class). It is stored under
# its own name so that f1_ads keeps holding the label-1 score instead of
# being overwritten with the label-0 score.
# Here, y_true is test_labels and y_pred is test_preds
f1_non_ads = f1_score(test_labels, test_preds, pos_label=0)
print("F1-score for detecting non-ads:", f1_non_ads)

# -------------------------
# Write the submission file (one JSON object per line)
# -------------------------
submission_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/Submission/Tf-IDF-logReg.jsonl'

# One line per test instance: its id, the predicted label cast to a plain
# int so it can be JSON-serialized, and the fixed run tag.
submission_lines = (
    json.dumps({"id": instance_id, "label": int(pred), "tag": "myGroupMyMethod"}) + "\n"
    for instance_id, pred in zip(test_ids, test_preds)
)
with open(submission_file, 'w', encoding='utf-8') as f_out:
    f_out.writelines(submission_lines)

print(f"\nSubmission file saved to: {submission_file}")


Cross-validation F1 Score (for ad detection, label=1) on Combined Training Set: [0.71892925 0.72276265 0.71688823 0.73277968 0.73611794]
Mean Cross-validation F1 Score: 0.7254955492618423

Combined Training Set Evaluation:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      9616
           1       0.85      0.89      0.87      5128

    accuracy                           0.91     14744
   macro avg       0.89      0.90      0.90     14744
weighted avg       0.91      0.91      0.91     14744

Confusion Matrix (Combined Training):
[[8801  815]
 [ 551 4577]]
F1 Score for ads (label 1): 0.8701520912547529

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1687
           1       0.80      0.71      0.76       913

    accuracy                           0.84      2600
   macro avg       0.83      0.81      0.82      2600
weighted avg       0.84      0.84      0.84 