In [2]:
from dotenv import load_dotenv
from persuasion_bias.datasets.persuasion import PersuasionDatasetLoader

# Populate the process environment from a local .env file before the dataset is
# loaded. The call is not the last expression of the cell, so its return value is
# never echoed; binding it to `__` would only overwrite IPython's output-history
# variable of the same name.
load_dotenv()

# `documents` is the corpus handed to the vector store in a later cell.
loader = PersuasionDatasetLoader()
documents = loader.load_from_huggingface()

In [3]:
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_chroma.vectorstores import Chroma
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from pydantic import Field


class PersuasivenessRetriever(BaseRetriever):
    """Retriever that post-filters similarity hits by source and persuasiveness.

    The vector store is asked for more candidates than requested (overfetch),
    the candidates are filtered on their metadata, and the first ``k``
    survivors are returned in the order the vector store ranked them.

    Because filtering happens after the similarity search, fewer than ``k``
    documents may be returned when most candidates are filtered out.
    """

    # BaseRetriever is a Pydantic model, so configuration is declared as fields.
    vectorstore: Chroma = Field(...)
    # How many candidates to fetch per requested document before filtering.
    overfetch_factor: int = Field(default=3, ge=1)
    # Upper bound on the overfetched candidate pool (never limits below ``k``).
    max_candidates: int = Field(default=50, ge=1)

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        source_filter: str | None = None,
        min_persuasiveness: float | None = None,
        k: int = 5,
    ) -> list[Document]:
        """Retrieve up to ``k`` documents, optionally filtered on metadata.

        Args:
            query: Text passed to the vector store's similarity search.
            run_manager: Callback manager supplied by the retriever framework;
                not used by this implementation.
            source_filter: When truthy, keep only documents whose ``source``
                metadata equals this value. ``None`` or ``""`` disables it.
            min_persuasiveness: When not ``None``, drop documents whose
                ``persuasiveness_delta`` metadata is below this value. A
                missing (or ``None``) delta is treated as ``0``.
            k: Maximum number of documents to return. Non-positive values
                yield an empty list.

        Returns:
            At most ``k`` documents, in the vector store's ranking order.

        NOTE(review): the filter arguments are presumably supplied through
        ``retriever.invoke(query, source_filter=..., ...)`` — confirm that the
        installed langchain-core forwards extra keyword arguments.
        """
        if k <= 0:
            # Nothing was asked for; avoid sending a non-positive k to the store.
            return []

        # Overfetch - Filter - Return top-k.
        # The cap keeps the candidate pool bounded, but it must never fall
        # below k, otherwise large k values could not be satisfied even when
        # no filter is active.
        search_k = max(k, min(k * self.overfetch_factor, self.max_candidates))
        candidates = self.vectorstore.similarity_search(query, k=search_k)

        filtered_docs: list[Document] = []
        for doc in candidates:
            if source_filter and doc.metadata.get("source") != source_filter:
                continue

            if min_persuasiveness is not None:
                delta = doc.metadata.get("persuasiveness_delta")
                if delta is None:
                    # Same fallback as a missing key: treat as neutral (0).
                    delta = 0
                if delta < min_persuasiveness:
                    continue

            filtered_docs.append(doc)

            # Stop scanning as soon as enough documents survived the filters.
            if len(filtered_docs) >= k:
                break

        return filtered_docs

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers MiniLM model, run on CPU, with normalized output vectors.
embedding = HuggingFaceEmbeddings(
    model="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Stable ids keep this cell idempotent: without them every re-run in the same
# kernel gets fresh random ids, so the corpus would be appended again and
# similarity search would start returning duplicates.
# NOTE(review): assumes the default in-memory collection outlives a cell re-run
# and that entries with an existing id are overwritten — confirm against the
# installed langchain-chroma version.
document_ids = [
    getattr(document, "id", None) or f"persuasion-{position}"
    for position, document in enumerate(documents)
]

vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    ids=document_ids,
)
retriever = PersuasivenessRetriever(vectorstore=vectorstore)

In [8]:
import pandas as pd

# Smoke-test the retriever with a one-word query (default arguments only).
docs = retriever.invoke("technology")

# One metadata dict per hit becomes one row of the frame.
metadata = [hit.metadata for hit in docs]
docs_pd = pd.DataFrame(metadata)

# Show only the columns needed to judge the hits at a glance.
docs_pd.loc[:, ["claim", "persuasiveness_delta", "is_human", "prompt_type"]]

Unnamed: 0,claim,persuasiveness_delta,is_human,prompt_type
0,Smartphones/social media should not be banned ...,1,False,Logical Reasoning
1,Smartphones/social media should not be banned ...,0,False,Logical Reasoning
2,Smartphones/social media should not be banned ...,1,False,Logical Reasoning
3,Smartphones/social media should not be banned ...,2,False,Expert Writer Rhetorics
4,Smartphones/social media should not be banned ...,0,False,Expert Writer Rhetorics
