# Full RAG pipeline

In [1]:
pip install faiss-cpu sentence-transformers transformers accelerate

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install openai

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install pandas numpy 

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
pip install spacy

[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
pip install hf_xet



[0mNote: you may need to restart the kernel to use updated packages.


## Advertisement Classification Pipeline

1. **Helpers**  
   Define `load_jsonl` and `load_and_label` to read JSONL files and merge examples with their labels.

2. **Load & Merge Train + Validation Data**  
   - Load `responses-train.jsonl` & `responses-train-labels.jsonl`  
   - Load `responses-validation.jsonl` & `responses-validation-labels.jsonl`  
   - Combine both into a single `train_data` pool.

3. **Index by Topic & Label**  
   Organize `train_data` into `docs_by_topic_label[meta_topic][label]` for per-topic, per-label grouping.

4. **Embed All Responses**  
   Use `SentenceTransformer(EMBED_MODEL)` to compute L2-normalized vector embeddings for every response.

5. **Build & Save FAISS Indices**  
   For each topic and label:  
   - Create a flat inner-product index (`IndexFlatIP`)  
   - Add all embeddings  
   - Serialize `(faiss_indices, plain_docs)` to `faiss_indices2.pkl`.

6. **Load FAISS Indices**  
   Load back the pickle to restore `faiss_indices` and document lookups for inference.

7. **Query Embedding Function**  
   Define `embed_query(text)` to produce a normalized embedding for any new response.

8. **Retrieve Top-K per Label**  
   - **If topic exists:** search that topic’s indices for top `K_PER_LABEL` neighbors in each label  
   - **Else fallback:** pool all docs globally and pick top K per label by inner product.

9. **Rerank with Cross-Encoder**  
   Score the retrieved candidates of each label with `CrossEncoder(RERANKER_MODEL)` and keep the top `FINAL_TOP_M // 2` per label, so the final context holds at most `FINAL_TOP_M` examples with the same number of examples for both labels.

10. **Prompt Construction & Classification**  
    Build a few-shot prompt injecting the reranked examples, call the LLM (`MODEL_NAME`) with `temperature=0.0`, and output a binary label (0 or 1).

11. **Evaluation on Test Set**  
    - Load test responses & labels  
    - Classify each example through the full pipeline  
    - Compute and print the confusion matrix and detailed classification report.


In [6]:
import os
import json
import pickle
import numpy as np
import faiss
import tqdm

from pathlib import Path
from collections import defaultdict
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm
from itertools import chain

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# ─── CONFIG ─────────────────────────────────────────────────────────────────────
# The API key is read from the environment only. A literal key must never be
# committed to the notebook; the key that used to be hard-coded here as a
# fallback should be treated as leaked and revoked.
# API_KEY is None when OPENAI_API_KEY is not set; export the variable before
# running the cells that talk to the LLM endpoint.
API_KEY    = os.getenv("OPENAI_API_KEY")
BASE_URL   = "https://llms-inference.innkube.fim.uni-passau.de"
MODEL_NAME = "llama3.1"

# Bi-encoder used for retrieval and cross-encoder used for reranking.
EMBED_MODEL    = "all-MiniLM-L6-v2"
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

K_PER_LABEL     = 5   # retrieve per label
FINAL_TOP_M     = 4   # rerank down to 4 total
DIST_THRESHOLD  = 0.0 # not used for embeddings

In [8]:
# ─── STEP 0: Helpers ─────────────────────────────────────────────────────────────
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The decoded JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines (e.g. an extra empty line at the end
        # of the file) are skipped; json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]


In [9]:
# ─── STEP 1: LOAD & MERGE TRAIN + VALIDATION DATA ─────────────────────────────
def load_and_label(jsonl_path, labels_path):
    """Join a responses file with its labels file on the `id` field.

    Args:
        jsonl_path: JSONL file whose records carry `id`, `meta_topic` and `response`.
        labels_path: JSONL file whose records carry `id` and `label`.

    Returns:
        list[dict]: One dict with keys `id`, `meta_topic`, `response` and
        `label` per response that has a label, in the order of the responses
        file. Responses whose `id` has no label entry are left out.
    """
    records     = load_jsonl(jsonl_path)
    label_by_id = {entry['id']: entry['label'] for entry in load_jsonl(labels_path)}
    return [
        {
            'id':         rec['id'],
            'meta_topic': rec['meta_topic'],
            'response':   rec['response'],
            'label':      label_by_id[rec['id']],
        }
        for rec in records
        if rec['id'] in label_by_id
    ]

# Labelled training split.
train_data = load_and_label(
    "Dataset/responses-train.jsonl",
    "Dataset/responses-train-labels.jsonl",
)

# Labelled validation split.
valid_data = load_and_label(
    "Dataset/responses-validation.jsonl",
    "Dataset/responses-validation-labels.jsonl",
)

# Validation examples are appended (in place) to the training list, so
# `train_data` becomes the single pool the retrieval index is built from.
train_data += valid_data

# ─── STEP 2: INDEX BY TOPIC & LABEL ─────────────────────────────────────────────
# Two-level grouping: docs_by_topic_label[meta_topic][label] -> list of docs.
docs_by_topic_label = defaultdict(lambda: defaultdict(list))
for example in train_data:
    topic_groups = docs_by_topic_label[example['meta_topic']]
    topic_groups[example['label']].append(example)

# ─── STEP 3: EMBED ALL RESPONSES ────────────────────────────────────────────────
# Each (topic, label) group is encoded in one call; the resulting vector is
# stored on the document dict itself under the 'embedding' key.
embedder = SentenceTransformer(EMBED_MODEL)
for label_groups in docs_by_topic_label.values():
    for group_docs in label_groups.values():
        vectors = embedder.encode(
            [entry['response'] for entry in group_docs],
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        for entry, vector in zip(group_docs, vectors):
            entry['embedding'] = vector

# ─── STEP 4: BUILD & SAVE FAISS INDEX (fixed) ───────────────────────────────────
# One flat inner-product index per (topic, label). Row i of an index corresponds
# to docs_by_topic_label[topic][label][i], which is how search hits are mapped
# back to documents at query time.
faiss_indices = {}
for topic, labels in docs_by_topic_label.items():
    faiss_indices[topic] = {}
    for label, docs in labels.items():
        dim   = docs[0]['embedding'].shape[0]
        index = faiss.IndexFlatIP(dim)
        array = np.stack([d['embedding'] for d in docs])
        index.add(array)
        faiss_indices[topic][label] = index

# Convert docs_by_topic_label (a defaultdict) into a nested plain dict: the
# lambda used as default factory cannot be pickled.
plain_docs = {
    topic: { label: docs for label, docs in label_dict.items() }
    for topic, label_dict in docs_by_topic_label.items()
}

# Create the output directory first; open(..., "wb") raises FileNotFoundError
# when "RAG FAISS" does not exist yet (e.g. on a fresh checkout).
os.makedirs("RAG FAISS", exist_ok=True)
with open("RAG FAISS/faiss_indices2.pkl", "wb") as f:
    # Now both objects are picklable!
    pickle.dump((faiss_indices, plain_docs), f)


Batches: 100%|██████████| 16/16 [00:00<00:00, 18.83it/s]
Batches: 100%|██████████| 32/32 [00:00<00:00, 36.16it/s]
Batches: 100%|██████████| 18/18 [00:00<00:00, 32.88it/s]
Batches: 100%|██████████| 38/38 [00:01<00:00, 34.97it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 33.90it/s]
Batches: 100%|██████████| 39/39 [00:01<00:00, 36.82it/s]
Batches: 100%|██████████| 13/13 [00:00<00:00, 34.15it/s]
Batches: 100%|██████████| 27/27 [00:00<00:00, 33.98it/s]
Batches: 100%|██████████| 17/17 [00:00<00:00, 34.02it/s]
Batches: 100%|██████████| 29/29 [00:00<00:00, 35.04it/s]
Batches: 100%|██████████| 15/15 [00:00<00:00, 34.31it/s]
Batches: 100%|██████████| 30/30 [00:00<00:00, 37.72it/s]
Batches: 100%|██████████| 21/21 [00:00<00:00, 36.41it/s]
Batches: 100%|██████████| 39/39 [00:01<00:00, 35.77it/s]
Batches: 100%|██████████| 19/19 [00:00<00:00, 35.22it/s]
Batches: 100%|██████████| 35/35 [00:00<00:00, 36.72it/s]
Batches: 100%|██████████| 13/13 [00:00<00:00, 32.43it/s]
Batches: 100%|██████████| 22/22

In [10]:
# ─── STEP 5: LOAD FAISS INDEX (fixed) ───────────────────────────────────────────
# NOTE: unpickling can execute arbitrary code. Only load a pickle file that was
# written by the save step of this notebook, never one from an untrusted source.
with open("RAG FAISS/faiss_indices2.pkl", "rb") as pkl_file:
    faiss_indices, plain_docs = pickle.load(pkl_file)

# Restore the nested defaultdict layout from the plain dicts stored in the pickle.
docs_by_topic_label = defaultdict(lambda: defaultdict(list))
for topic_name, per_label in plain_docs.items():
    for label_value, label_docs in per_label.items():
        docs_by_topic_label[topic_name][label_value] = label_docs


# ─── STEP 6: QUERY EMBEDDING FUNCTION ───────────────────────────────────────────
def embed_query(text):
    """Encode a single text with the module-level `embedder`.

    The text is wrapped in a one-element list and encoded with
    `normalize_embeddings=True`, so the value returned is the encoder's output
    for a batch of one (presumably an array of shape (1, dim) — TODO confirm).
    """
    return embedder.encode([text], normalize_embeddings=True)

# ─── STEP 7: RETRIEVE TOP-K PER LABEL ───────────────────────────────────────────
def retrieve_by_label(query_emb, topic, k=K_PER_LABEL):
    """Collect up to `k` nearest stored documents per label for a query embedding.

    Args:
        query_emb: Query embedding as returned by `embed_query`. It is passed
            unchanged to `index.search` and multiplied with the stored
            embeddings (assumed to be a (1, dim) array — TODO confirm).
        topic: `meta_topic` of the query. If it is a key of `faiss_indices`,
            only that topic's per-label indices are searched; otherwise all
            stored documents of every topic are ranked.
        k: Maximum number of documents returned per label.

    Returns:
        dict: Maps label (0 or 1) to a list of document dicts. In the per-topic
        case a label that has no index for the topic is absent from the dict;
        in the global fallback both keys are always present (lists may be empty).
    """
    # ─── Case 1: topic exists ────────────────────────
    if topic in faiss_indices:
        pool = {}
        for label in (0, 1):
            idx = faiss_indices[topic].get(label)
            if idx is None:
                continue
            _, neighbor_ids = idx.search(query_emb, k)
            docs = docs_by_topic_label[topic].get(label, [])
            # -1 entries are skipped: they do not refer to a stored document.
            pool[label] = [docs[i] for i in neighbor_ids[0] if i != -1]
        return pool

    # ─── Case 2: topic missing — global fallback ─────
    # Flatten all docs across all topics and labels.
    all_docs = [
        doc
        for label_dict in docs_by_topic_label.values()
        for docs in label_dict.values()
        for doc in docs
    ]
    pool = {0: [], 1: []}
    if not all_docs:
        # Nothing to rank; np.stack would raise on an empty list. The caller
        # treats a pool without any documents as "no context available".
        return pool

    emb_matrix = np.stack([d['embedding'] for d in all_docs])  # shape: [N, dim]
    # Embeddings were normalized when they were computed, so the inner product
    # is used as the similarity. reshape(-1) keeps the result 1-D even when
    # N == 1, where squeeze() would collapse it to a 0-d array.
    sims = (emb_matrix @ query_emb.T).reshape(-1)               # shape: [N]

    # Walk documents from most to least similar, filling each label up to k.
    for idx in np.argsort(sims)[::-1]:
        doc = all_docs[idx]
        bucket = pool.get(doc['label'])
        # Labels other than 0/1 are ignored, matching the per-topic branch.
        if bucket is not None and len(bucket) < k:
            bucket.append(doc)
        # stop once both labels have k each
        if len(pool[0]) >= k and len(pool[1]) >= k:
            break

    return pool

In [11]:
# ─── STEP 8: RERANK WITH CROSSENCODER ──────────────────────────────────────────
# The selection always contains the same number of advertisement and
# non-advertisement examples, to limit label bias in the prompt.
reranker = CrossEncoder(RERANKER_MODEL)


def _top_reranked(response, candidates, n):
    """Return the `n` candidates the cross-encoder scores highest for `response`, best first."""
    if n <= 0 or not candidates:
        # Nothing to select; also avoids calling the model on an empty batch.
        return []
    pairs  = [(response, d['response']) for d in candidates]
    scores = reranker.predict(pairs)
    best   = np.argsort(scores)[::-1][:n]
    return [candidates[i] for i in best]


def rerank_pool(response, pool, top_m=FINAL_TOP_M):
    """Pick a label-balanced set of few-shot examples from a retrieval pool.

    Args:
        response: Text being classified; each candidate is scored against it.
        pool: Dict mapping label (0 = no ad, 1 = ad) to candidate document
            dicts, as returned by `retrieve_by_label`. A missing label is
            treated like an empty candidate list.
        top_m: Upper bound on the number of examples returned; `top_m // 2`
            examples are taken per label.

    Returns:
        list: Document dicts interleaved as ad, no-ad, ad, no-ad, ... ordered
        by descending reranker score within each label. Examples are emitted in
        pairs, so the list is only as long as the smaller side allows and is
        empty when either label has no candidates.
    """
    per_label = top_m // 2
    # .get: the per-topic branch of retrieve_by_label omits labels it has no index for.
    selected_no_ad = _top_reranked(response, pool.get(0, []), per_label)
    selected_ad    = _top_reranked(response, pool.get(1, []), per_label)
    # zip keeps the two labels balanced by stopping at the shorter list.
    return list(chain.from_iterable(zip(selected_ad, selected_no_ad)))

In [None]:
# ─── NEW STEP 9: PROMPT & CLASSIFY ─────────────────────────────────────────────────
# Modified with some prompt engineering and use of a stronger model (the
# `classify` defined in this cell requests "gpt-4o").
# NOTE: the llama3.1 cell further down rebinds `client`, `make_prompt` and
# `classify`; whichever of the two cells was executed last decides which
# definitions are active.
# No arguments are passed, so the client relies on the OpenAI library defaults
# (presumably the OPENAI_API_KEY environment variable — confirm).
client = OpenAI()

def make_prompt(response, context):
    """Build the user message: labelled examples followed by the text to classify.

    Args:
        response: Text whose label is requested.
        context: Iterable of dicts with `response` and `label` keys; a label
            equal to 1 is rendered as AD, anything else as NO_AD.

    Returns:
        str: "Examples:" header, one `{text: "...", label: ...}` line per
        example, then the quoted response and the label question.
    """
    parts = ["Examples:\n"]
    for example in context:
        tag = "NO_AD" if example['label'] != 1 else "AD"
        parts.append(f"{{text: \"{example['response']}\", label: {tag}}}\n")
    parts.append(f"\nResponse:\n\"{response}\"\nLabel (NO_AD or AD):")
    return "".join(parts)

def classify(response, topic):
    """Label `response` as advertisement (1) or not (0) using retrieved examples and gpt-4o.

    The response is embedded, candidates are retrieved per label for `topic`,
    reranked into a balanced example set, and sent with the system prompt to
    the chat-completions endpoint of `client` (model "gpt-4o", temperature 0).

    Args:
        response: Text to classify.
        topic: `meta_topic` of the text, used to choose the retrieval indices.

    Returns:
        tuple: `(label, ctx)` where `label` is 1 if the model's answer starts
        with "AD" (ignoring case) and 0 otherwise, and `ctx` is the list of
        examples placed in the prompt. When retrieval yields no documents at
        all, `(0, [])` is returned without calling the model.
    """
    system_prompt = "You are a helpful assistant. Your goal is to classify the last text you received as an advertisement or not.\nAn advertisement promotes a product, service, or event whereas non-advertisements only state objective information about the product, service, or event. Be sure to correctly capture the nuances between advertisements and informative texts.\nYou will be provided examples. Using your knowledge and these examples if they are relevant, output ONLY AD if the last Response is an advertisement or output NO_AD if it is not an advertisement."
    q_emb = embed_query(response)
    pool  = retrieve_by_label(q_emb, topic)
    if not any(pool.values()):
        return 0, []
    ctx    = rerank_pool(response, pool)
    prompt = make_prompt(response, ctx)
    resp   = client.chat.completions.create(
        model       = "gpt-4o",
        messages    = [{"role":"system", "content": system_prompt},{"role":"user","content":prompt}],
        temperature = 0.0,
        max_tokens  = 2
    )
    # The message content may be None (empty completion); treat that as a
    # non-AD answer instead of crashing a long evaluation run on .strip().
    # Upper-casing makes a lowercase "ad" answer count as AD as well.
    out = (resp.choices[0].message.content or "").strip().upper()
    return (1 if out.startswith("AD") else 0), ctx

In [12]:
# ─── llama3.1 NEW STEP 9: PROMPT & CLASSIFY ─────────────────────────────────────────────────
# Same prompt-engineered pipeline, but the client targets the endpoint and
# credentials from the CONFIG cell (BASE_URL / API_KEY).
client = OpenAI(base_url=BASE_URL, api_key=API_KEY)

def make_prompt(response, context):
    """Render the few-shot user message for the classifier.

    Args:
        response: Text to be labelled.
        context: Iterable of example dicts providing `response` and `label`;
            label 1 is shown as AD, every other label as NO_AD.

    Returns:
        str: An "Examples:" section with one `{text: "...", label: ...}` line
        per example, followed by the quoted response and the label question.
    """
    label_names = {True: "AD", False: "NO_AD"}
    shots = "".join(
        f"{{text: \"{c['response']}\", label: {label_names[c['label'] == 1]}}}\n"
        for c in context
    )
    return f"Examples:\n{shots}\nResponse:\n\"{response}\"\nLabel (NO_AD or AD):"

def classify(response, topic):
    """Label `response` as advertisement (1) or not (0) using retrieved examples and `MODEL_NAME`.

    The response is embedded, candidates are retrieved per label for `topic`,
    reranked into a balanced example set, and sent with the system prompt to
    the chat-completions endpoint of `client` (model `MODEL_NAME`, temperature 0).

    Args:
        response: Text to classify.
        topic: `meta_topic` of the text, used to choose the retrieval indices.

    Returns:
        tuple: `(label, ctx)` where `label` is 1 if the model's answer starts
        with "AD" (ignoring case) and 0 otherwise, and `ctx` is the list of
        examples placed in the prompt. When retrieval yields no documents at
        all, `(0, [])` is returned without calling the model.
    """
    system_prompt = "You are a helpful assistant. Your goal is to classify the last text you received as an advertisement or not.\nAn advertisement promotes a product, service, or event whereas non-advertisements only state objective information about the product, service, or event. Be sure to correctly capture the nuances between advertisements and informative texts.\nYou will be provided examples. Using your knowledge and these examples if they are relevant, output ONLY AD if the last Response is an advertisement or output NO_AD if it is not an advertisement."
    q_emb = embed_query(response)
    pool  = retrieve_by_label(q_emb, topic)
    if not any(pool.values()):
        return 0, []
    ctx    = rerank_pool(response, pool)
    prompt = make_prompt(response, ctx)
    resp   = client.chat.completions.create(
        model       = MODEL_NAME,
        messages    = [{"role":"system", "content": system_prompt},{"role":"user","content":prompt}],
        temperature = 0.0,
        max_tokens  = 2
    )
    # The message content may be None (empty completion); treat that as a
    # non-AD answer instead of crashing a long evaluation run on .strip().
    # Upper-casing makes a lowercase "ad" answer count as AD as well.
    out = (resp.choices[0].message.content or "").strip().upper()
    return (1 if out.startswith("AD") else 0), ctx

In [13]:
# ─── llama3.1 STEP 10: EVALUATE ON TEST SET ─────────────────────────────────────────────
# 10.1 load & merge test — same id-based join as for the train/validation
# splits, so the shared helper is used instead of repeating the merge inline.
# NOTE(review): these are absolute, machine-specific paths, unlike the relative
# "Dataset/..." paths used for train/validation; left unchanged because the
# notebook's working directory is not known here.
test_data = load_and_label(
    "/root/Ad-Detection/Expirements/RAG 3/Dataset/responses-test.jsonl",
    "/root/Ad-Detection/Expirements/RAG 3/Dataset/responses-test-labels.jsonl",
)

# 10.2 classify and collect gold labels and predictions in parallel lists
y_true, y_pred = [], []
for ex in tqdm(test_data, desc="Evaluation of the classifier"):
    pred, _ = classify(ex['response'], ex['meta_topic'])
    y_true.append(ex['label'])
    y_pred.append(pred)

# 10.3 metrics
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=3))

Evaluation of the classifier: 100%|██████████| 2600/2600 [1:44:40<00:00,  2.42s/it]  

Confusion Matrix:
 [[785 902]
 [ 81 832]]

Classification Report:
               precision    recall  f1-score   support

           0      0.906     0.465     0.615      1687
           1      0.480     0.911     0.629       913

    accuracy                          0.622      2600
   macro avg      0.693     0.688     0.622      2600
weighted avg      0.757     0.622     0.620      2600




