In [2]:
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold


def _iter_jsonl(path):
    """Yield one parsed JSON value per non-blank line of the UTF-8 file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. a trailing newline at the end
            # of the file) carry no record and would make json.loads raise.
            if line.strip():
                yield json.loads(line)


def load_dataset(responses_file, labels_file):
    """
    Load a dataset by joining responses and labels from two JSONL files on "id".

    Every non-blank line of ``responses_file`` must be a JSON object with the
    keys "id" and "response"; every non-blank line of ``labels_file`` must be a
    JSON object with the keys "id" and "label".

    Args:
        responses_file: Path to the JSONL file holding the response texts.
        labels_file: Path to the JSONL file holding the labels.

    Returns:
        A tuple ``(ids, texts, labels)`` of three parallel lists, in the order
        the records appear in ``labels_file``. Label records whose id has no
        matching response are skipped; responses without a label are ignored.

    Raises:
        KeyError: If a record lacks one of the required keys.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Map id -> response text. If an id occurs more than once, the last
    # occurrence in the file wins.
    # Here you can combine 'query' and 'response' if needed.
    responses = {
        record["id"]: record["response"] for record in _iter_jsonl(responses_file)
    }

    # Walk the labels and keep only those that have a matching response.
    ids, texts, labels = [], [], []
    for record in _iter_jsonl(labels_file):
        instance_id = record["id"]
        if instance_id in responses:
            ids.append(instance_id)
            texts.append(responses[instance_id])
            labels.append(record["label"])

    return ids, texts, labels

# Directory holding every split of the dataset (update this single path as needed).
DATA_DIR = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection'

# Each split consists of a responses file and a labels file that share ids.
train_responses_file = f'{DATA_DIR}/responses-train.jsonl'
train_labels_file = f'{DATA_DIR}/responses-train-labels.jsonl'
val_responses_file = f'{DATA_DIR}/responses-validation.jsonl'
val_labels_file = f'{DATA_DIR}/responses-validation-labels.jsonl'
test_responses_file = f'{DATA_DIR}/responses-test.jsonl'
test_labels_file = f'{DATA_DIR}/responses-test-labels.jsonl'

# Load datasets: each call returns parallel lists of ids, response texts and labels.
train_ids, train_texts, train_labels = load_dataset(train_responses_file, train_labels_file)
val_ids, val_texts, val_labels = load_dataset(val_responses_file, val_labels_file)
test_ids, test_texts, test_labels = load_dataset(test_responses_file, test_labels_file)

# -----------------------------------------------------
# Experiment: Random Forest on TF-IDF features
#   Feature extraction: TF-IDF over the response texts
#   Classifier:         Random Forest
#   Goal:               measure how well a Random Forest separates the
#                       classes given the TF-IDF representation.
# -----------------------------------------------------

# Stage 1: TF-IDF with English stop words removed and terms that occur in
# more than 95% of the documents dropped.
tfidf_step = TfidfVectorizer(stop_words='english', max_df=0.95)
# Stage 2: 100-tree forest with balanced class weights and a fixed seed.
forest_step = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
pipeline = make_pipeline(tfidf_step, forest_step)

# -------------------------
# 5-fold stratified cross-validation on the training set (macro F1)
# -------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    pipeline,
    train_texts,
    train_labels,
    scoring='f1_macro',
    cv=cv,
)
print("Cross-validation F1 Macro Scores on Training Set:", cv_scores)
mean_cv_score = np.mean(cv_scores)
print("Mean Cross-Validation F1 Macro Score:", mean_cv_score)

# -------------------------
# Final fit on the complete training set
# -------------------------
pipeline.fit(train_texts, train_labels)

# -------------------------
# Evaluation on Training, Validation and Test Sets
# -------------------------
# Directory that receives the per-split CSV reports (update as needed).
RESULTS_DIR = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/Advertisment-Detection/csv-results/RandomForest'


def _evaluate_split(model, texts, labels, title, file_prefix, results_dir=RESULTS_DIR):
    """
    Predict one data split, save its metrics as CSV files and print them.

    Args:
        model: Fitted estimator exposing ``predict``.
        texts: Inputs of the split, passed to ``model.predict``.
        labels: True labels of the split.
        title: Human-readable split name used in the printed headings
            (e.g. "Training").
        file_prefix: Prefix of the two CSV file names (e.g. "train").
        results_dir: Directory the CSV files are written to.

    Returns:
        A tuple ``(preds, report_dict, cm, df_report, df_cm)``: the
        predictions, the classification report as a dict, the confusion
        matrix, and the two DataFrames that were written to disk.
    """
    preds = model.predict(texts)
    report_dict = classification_report(labels, preds, output_dict=True)
    cm = confusion_matrix(labels, preds)

    # One row per class / average in the saved classification report.
    df_report = pd.DataFrame(report_dict).transpose()
    df_report.to_csv(f'{results_dir}/{file_prefix}_classification_report.csv', index=True)

    # The row/column names are hard-coded for exactly two classes; pandas
    # raises if the confusion matrix has a different shape.
    df_cm = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
    df_cm.to_csv(f'{results_dir}/{file_prefix}_confusion_matrix.csv', index=True)

    print(f"\n{title} Set Evaluation:")
    print(classification_report(labels, preds))
    print(f"Confusion Matrix ({title}):")
    print(cm)  # reuse the matrix computed above instead of recomputing it

    return preds, report_dict, cm, df_report, df_cm


# The module-level result names are kept so later cells can keep using them
# (e.g. test_preds for the submission file).
train_preds, train_report_dict, train_cm, df_train_report, df_train_cm = _evaluate_split(
    pipeline, train_texts, train_labels, "Training", "train"
)
val_preds, val_report_dict, val_cm, df_val_report, df_val_cm = _evaluate_split(
    pipeline, val_texts, val_labels, "Validation", "val"
)
test_preds, test_report_dict, test_cm, df_test_report, df_test_cm = _evaluate_split(
    pipeline, test_texts, test_labels, "Test", "test"
)

# -------------------------
# Write the Submission File
# -------------------------
submission_file = '/Users/tarekbouhairi/Desktop/Universitat Passau/Advertismenet in RAG/Ad-Detection/rf_baseline.jsonl'
with open(submission_file, 'w', encoding='utf-8') as f_out:
    # One JSON object per line: test id, predicted label cast to a plain int
    # so it is JSON-serialisable, and the run tag.
    f_out.writelines(
        json.dumps({"id": instance_id, "label": int(pred), "tag": "myGroupMyMethod"}) + "\n"
        for instance_id, pred in zip(test_ids, test_preds)
    )

print(f"\nSubmission file saved to: {submission_file}")


Cross-validation F1 Macro Scores on Training Set: [0.77733886 0.76368942 0.76125587 0.77536341 0.76139126]
Mean Cross-Validation F1 Macro Score: 0.767807763929605

Training Set Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7541
           1       1.00      1.00      1.00      3946

    accuracy                           1.00     11487
   macro avg       1.00      1.00      1.00     11487
weighted avg       1.00      1.00      1.00     11487

Confusion Matrix (Training):
[[7541    0]
 [   0 3946]]

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      2075
           1       0.87      0.78      0.82      1182

    accuracy                           0.88      3257
   macro avg       0.87      0.85      0.86      3257
weighted avg       0.87      0.88      0.87      3257

Confusion Matrix (Validation):
[[1933  142]
 [ 265  917]]

Test Set Evalu