In [1]:
"""
phq_question_selector.py
Reusable pipeline for:
- Cleaning question text
- Embedding with SentenceTransformer
- Semantic deduplication
- Selecting conversational questions aligned with PHQ-8 categories
"""

import re
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Optional, Dict, Any

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_question(text: str) -> str:
    """Return a normalized copy of ``text`` suitable for embedding and dedup.

    Applied in this order: coerce to ``str``, delete every ``(...)`` group,
    turn underscores into spaces, lowercase, trim both ends, and squeeze each
    run of whitespace into a single space.
    """
    # Parenthesized groups are removed outright (nothing is inserted in their place).
    without_parens = re.sub(r"\([^)]*\)", "", str(text))

    # Underscore-joined tokens become separate words.
    spaced = without_parens.replace("_", " ")

    # Case-fold and trim before squeezing the internal whitespace.
    trimmed = spaced.lower().strip()

    return re.sub(r"\s+", " ", trimmed)

In [3]:
def semantic_deduplicate(
    questions: List[str],
    embeddings: np.ndarray,
    threshold: float = 0.85
) -> List[int]:
    """
    Remove near-duplicate questions using cosine similarity.

    Greedy and order-preserving: question ``i`` is kept only when its cosine
    similarity to every previously kept question is below ``threshold``.

    Args:
        questions: The question texts. Only their count is used here.
        embeddings: 2-D array with one row per question. Rows do not have to
            be unit-length; they are normalized internally.
        threshold: Similarity at or above which two questions count as
            duplicates.

    Returns:
        List of indices to keep, in ascending order.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or its number of rows differs
            from ``len(questions)``.
    """
    n_questions = len(questions)
    if n_questions == 0:
        return []

    emb = np.asarray(embeddings)
    if emb.ndim != 2 or emb.shape[0] != n_questions:
        raise ValueError(
            f"embeddings must be 2-D with one row per question; "
            f"got shape {emb.shape} for {n_questions} questions"
        )
    if not np.issubdtype(emb.dtype, np.floating):
        emb = emb.astype(np.float64)

    # Normalize once so cosine similarity reduces to a dot product.
    # All-zero rows are divided by 1 instead of 0, so they score 0 against
    # everything rather than producing NaN.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = emb / norms

    keep: List[int] = []
    # Kept rows are packed at the front of this buffer so each candidate is
    # compared with all of them in a single matrix-vector product.
    kept_rows = np.empty_like(unit)

    for i in range(n_questions):
        n_kept = len(keep)
        if n_kept and (kept_rows[:n_kept] @ unit[i]).max() >= threshold:
            continue  # near-duplicate of something already kept
        kept_rows[n_kept] = unit[i]
        keep.append(i)

    return keep

In [4]:
class PHQQuestionSelector:
    """
    Pick conversational questions that best match each PHQ-8 item.

    Pipeline (the first two steps return ``self`` so they can be chained):

        1. ``load_questions(csv_path)``  - read, clean and exact-dedup the text
        2. ``embed_and_dedup()``         - embed and drop semantic near-duplicates
        3. ``select_questions()``        - choose the top matches per PHQ-8 item
        4. ``save(df, out_path)``        - write the selection as CSV

    Note: ``embed_and_dedup(threshold=T)`` leaves no pair of questions with
    similarity >= T, so the within-category paraphrase check only has an
    effect when ``sim_threshold_within`` is lower than ``T`` (with the
    defaults, 0.90 vs 0.85, it never triggers).
    """

    # Columns of the frame returned by select_questions(), in order. Declared
    # once so an empty selection still has the same schema as a non-empty one.
    RESULT_COLUMNS = [
        "phq_idx",
        "phq_question",
        "selected_question",
        "similarity_to_phq",
        "rank",
        "train_index",
    ]

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        sim_threshold_within: float = 0.90,
        sim_threshold_to_phq: float = 0.20,
        global_unique: bool = True
    ):
        """
        Load the embedding model and embed the PHQ-8 items.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            sim_threshold_within: A candidate is rejected when its similarity
                to a question already picked for the same PHQ item exceeds
                this value.
            sim_threshold_to_phq: Minimum similarity to the PHQ item for a
                candidate to be eligible.
            global_unique: When True, a question picked for one PHQ item is
                not reused for another.
        """
        self.model = SentenceTransformer(model_name)
        self.sim_threshold_within = sim_threshold_within
        self.sim_threshold_to_phq = sim_threshold_to_phq
        self.global_unique = global_unique

        # Default PHQ-8 questions
        self.phq8_questions = [
            "Little interest or pleasure in doing things",
            "Feeling down, depressed, or hopeless",
            "Trouble falling or staying asleep, or sleeping too much",
            "Feeling tired or having little energy",
            "Poor appetite or overeating",
            "Feeling bad about yourself — or that you are a failure or have let yourself or your family down",
            "Trouble concentrating on things, such as reading the newspaper or watching television",
            "Moving or speaking so slowly that other people could have noticed or the opposite — being fidgety or restless"
        ]

        # Populated by load_questions() and embed_and_dedup() respectively.
        self.train_df = None
        self.train_emb = None
        # Normalized so that a plain dot product in select_questions() is a
        # cosine similarity.
        self.phq_emb = self.model.encode(
            self.phq8_questions, normalize_embeddings=True
        )

    def load_questions(self, csv_path: str, text_col: str = "value"):
        """
        Read candidate questions from a CSV and store the cleaned, unique set.

        Args:
            csv_path: Path of the CSV file to read.
            text_col: Name of the column holding the question text.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: If ``text_col`` is not a column of the CSV.
        """
        df = pd.read_csv(csv_path)

        if text_col not in df.columns:
            raise ValueError(
                f"Column '{text_col}' not found. Available: {df.columns.tolist()}"
            )

        # clean + dedup text-wise (exact)
        df = df[[text_col]].dropna().drop_duplicates().reset_index(drop=True)

        df["cleaned"] = df[text_col].apply(clean_question)
        # Rows that clean down to an empty string (e.g. text that was only a
        # parenthetical) carry nothing to embed, so drop them.
        df = df[df["cleaned"] != ""]
        df = df.drop_duplicates(subset="cleaned")

        self.train_df = df.reset_index(drop=True)
        # Embeddings from a previous load no longer line up with the new rows;
        # force embed_and_dedup() to be called again.
        self.train_emb = None
        return self

    def embed_and_dedup(self, threshold: float = 0.85):
        """
        Embed all questions then apply semantic deduplication.

        Args:
            threshold: Cosine similarity at or above which a question is
                treated as a duplicate of an earlier one.

        Returns:
            ``self``, to allow chaining.

        Raises:
            RuntimeError: If ``load_questions()`` has not been called.
        """
        if self.train_df is None:
            raise RuntimeError("Call load_questions() first.")

        text_list = self.train_df["cleaned"].tolist()

        if not text_list:
            # Nothing to embed: keep a (0, dim) array so select_questions()
            # can still build its similarity matrix.
            self.train_emb = np.empty(
                (0, self.phq_emb.shape[1]), dtype=self.phq_emb.dtype
            )
            return self

        # embed
        emb = self.model.encode(
            text_list, convert_to_numpy=True, normalize_embeddings=True
        )

        # deduplicate using semantic similarity
        keep = semantic_deduplicate(text_list, emb, threshold)
        self.train_df = self.train_df.iloc[keep].reset_index(drop=True)
        self.train_emb = emb[keep]

        return self

    def _pick_unique_for_phq(
        self,
        candidate_indices,
        sim_to_phq,
        n_pick,
        used_global
    ):
        """
        Choose up to ``n_pick`` questions for one PHQ item.

        Args:
            candidate_indices: Row indices into ``train_emb`` in the order
                they should be considered.
            sim_to_phq: Similarity of every question to the PHQ item, indexed
                by row.
            n_pick: Maximum number of questions to return.
            used_global: Set of row indices already picked for other PHQ
                items; updated in place with the new picks.

        Returns:
            List of picked row indices, in the order they were accepted.
        """
        picked = []

        for idx in candidate_indices:
            if len(picked) >= n_pick:
                break

            idx = int(idx)  # plain int, not a numpy scalar, in the outputs

            # ensure similarity to PHQ
            if sim_to_phq[idx] < self.sim_threshold_to_phq:
                continue

            # ensure global uniqueness
            if self.global_unique and idx in used_global:
                continue

            # avoid paraphrases within this PHQ category
            if picked:
                sims = cosine_similarity(
                    self.train_emb[idx:idx+1],
                    self.train_emb[picked]
                ).max()
                if sims > self.sim_threshold_within:
                    continue

            # accept
            picked.append(idx)
            used_global.add(idx)

        return picked

    def select_questions(
        self,
        top_k_candidates: int = 30,
        n_per_phq: int = 3
    ) -> pd.DataFrame:
        """
        Select the best-matching questions for every PHQ-8 item.

        Args:
            top_k_candidates: How many of the most similar questions to
                consider for each PHQ item.
            n_per_phq: Maximum number of questions to select per PHQ item.

        Returns:
            DataFrame with the columns in ``RESULT_COLUMNS``, one row per
            selected question. The columns are present even when no question
            was selected.

        Raises:
            RuntimeError: If ``embed_and_dedup()`` has not been called.
        """
        if self.train_emb is None:
            raise RuntimeError("Call embed_and_dedup() first.")

        # similarity matrix (train × phq)
        sim_matrix = np.dot(self.train_emb, self.phq_emb.T)

        results = []
        used_global = set()

        for phq_idx, phq_text in enumerate(self.phq8_questions):

            sims = sim_matrix[:, phq_idx]
            # most similar first
            candidate_order = np.argsort(-sims)[:top_k_candidates]

            picked = self._pick_unique_for_phq(
                candidate_order,
                sims,
                n_pick=n_per_phq,
                used_global=used_global
            )

            # store results
            for rank, idx in enumerate(picked):
                results.append({
                    "phq_idx": phq_idx,
                    "phq_question": phq_text,
                    "selected_question": self.train_df.loc[idx, "cleaned"],
                    "similarity_to_phq": float(sims[idx]),
                    "rank": rank + 1,
                    "train_index": int(idx)
                })

        # Explicit columns keep the schema stable when `results` is empty.
        return pd.DataFrame(results, columns=self.RESULT_COLUMNS)

    def save(self, df: pd.DataFrame, out_path: str):
        """
        Write ``df`` to ``out_path`` as CSV without the index.

        The parent directory is created if it does not exist.

        Returns:
            ``out_path``, unchanged.
        """
        if isinstance(out_path, (str, Path)):
            Path(out_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(out_path, index=False)
        return out_path

In [5]:
# from phq_question_selector import PHQQuestionSelector

# Build the selector, then run each pipeline stage as its own statement.
# load_questions() and embed_and_dedup() both return the selector itself, so
# this is equivalent to chaining the calls.
# NOTE(review): the CSV path below is absolute and machine-specific — consider
# moving it to a configurable data directory.
selector = PHQQuestionSelector(
    model_name="all-MiniLM-L6-v2",
    sim_threshold_within=0.90,
    sim_threshold_to_phq=0.20,
    global_unique=True,
)
selector.load_questions("/Volumes/MACBACKUP/train_ellie_questions.csv", text_col="value")
selector.embed_and_dedup(threshold=0.85)  # semantic deduplication

# Up to 3 questions per PHQ-8 item, drawn from each item's 30 closest candidates.
df = selector.select_questions(top_k_candidates=30, n_per_phq=3)

In [6]:
# Show the selected questions as this cell's output.
df

Unnamed: 0,phq_idx,phq_question,selected_question,similarity_to_phq,rank,train_index
0,0,Little interest or pleasure in doing things,what do you enjoy about traveling,0.382603,1,77
1,0,Little interest or pleasure in doing things,do fun,0.372254,2,247
2,0,Little interest or pleasure in doing things,shyoutgoing,0.362278,3,176
3,1,"Feeling down, depressed, or hopeless",feel down,0.608196,1,193
4,1,"Feeling down, depressed, or hopeless",do you feel down,0.580347,2,81
5,1,"Feeling down, depressed, or hopeless",depression diagnosed,0.568466,3,191
6,2,"Trouble falling or staying asleep, or sleeping...",sleep affects,0.540299,1,239
7,2,"Trouble falling or staying asleep, or sleeping...",what are you like when you don't sleep well,0.531716,2,45
8,2,"Trouble falling or staying asleep, or sleeping...",easy sleep,0.460325,3,192
9,3,Feeling tired or having little energy,how have you been feeling lately,0.441128,1,138


In [None]:
# Persist the selection as CSV; save() writes without the index and returns the path.
selector.save(df=df, out_path="phq8_selected_questions.csv")