## AI Solutions Engineer Interview – Simple Risk Scoring Script

**Author:** Tinomutendayi Muzondidya  
**Time spent:** 1 hr 45 minutes  

### Dataset  
`health_ai_whatsapp_100_conversations_long.txt`

### Task  
1. Analyse a conversation to produce risk scores for:  
   - **a)** HIV acquisition  
   - **b)** Mental health disorder  

2. Provide a recommendation and treatment plan based on **South African National Department of Health (NDoH) guidelines**, specifically:  
   - *ART Clinical Guidelines 2023*  
   - *National Mental Health Policy Framework 2023–2030*  


# Setup and Inputs

In [None]:


from dataclasses import dataclass
from typing import List, Dict
import re
import pandas as pd

# Path of the raw conversations text file that is passed to load_conversations().
DATA_PATH = "health_ai_whatsapp_100_conversations_long.txt"


# Load & Parse Conversations

In [6]:

@dataclass
class ConversationRiskResult:
    """Risk scores, risk levels and recommendation texts for one conversation."""

    conversation_id: int           # caller-supplied identifier of the conversation
    hiv_risk_score: float          # 0.0–1.0
    hiv_risk_level: str            # "low" | "moderate" | "high"
    mental_health_risk_score: float  # 0.0–1.0
    mental_health_risk_level: str    # "low" | "moderate" | "high"
    hiv_recommendation: str            # free-text HIV recommendation
    mental_health_recommendation: str  # free-text mental health recommendation


def load_conversations(path: str) -> List[str]:
    """
    Read the UTF-8 text file at *path* and return its conversations.

    The file content is cut at every '========== Conversation ==========' marker.
    Each piece is stripped of surrounding whitespace, and pieces that are empty
    after stripping are discarded.
    """
    separator = "========== Conversation =========="

    with open(path, mode="r", encoding="utf-8") as handle:
        pieces = handle.read().split(separator)

    # Strip first, then keep only the pieces that still contain text.
    return [piece for piece in map(str.strip, pieces) if piece]


def extract_user_text(conversation: str) -> str:
    """
    Extract only the user's messages from a conversation block.

    A line is treated as a user message when it contains the speaker label
    "User:". The label is matched case-insensitively, so "user:" and "USER:"
    lines are no longer silently dropped. A label that directly follows a
    "] "-terminated timestamp is preferred; otherwise the first "User:" found
    anywhere in the line is used.

    Args:
        conversation: Raw text of one conversation, one message per line.

    Returns:
        The text after the label of every matching line, stripped and joined
        with single spaces. Returns "" when no line carries the label.

    NOTE(review): because the fallback accepts the label anywhere in the line,
    a non-user line that quotes "User:" is also picked up (this was already the
    case before the case-insensitive match was introduced).
    """
    user_lines: List[str] = []

    for line in conversation.splitlines():
        line = line.strip()
        if not line:
            continue

        # Prefer the label right after the timestamp bracket, then fall back
        # to a bare label anywhere in the line.
        label = re.search(r"\] user:", line, re.IGNORECASE) or re.search(
            r"user:", line, re.IGNORECASE
        )
        if label is None:
            continue

        user_lines.append(line[label.end():].strip())

    return " ".join(user_lines)


# Keyword Dictionaries & Scoring Helpers

In [7]:


# HIV acquisition risk keywords
# HIV acquisition risk keyword table.
# Each category maps to:
#   "weight"   - amount added to the score once if ANY of its patterns matches
#   "patterns" - regular expressions searched in the lower-cased user text,
#                so every pattern is written in lower case
# NOTE: the annotation is looser than the data: "weight" values are floats.
# Word boundaries (\b) are used where a bare substring would hit unrelated
# words (e.g. "rape" inside "therapeutic"/"grapes", "on art" inside
# "information article").
HIV_KEYWORDS: Dict[str, Dict[str, List[str]]] = {
    "unprotected_sex": {
        "weight": 0.45,
        "patterns": [
            r"without a condom",
            r"no condom",
            r"didn.?t use (a )?condom",
            r"condom broke",
            r"condom slipped",
        ],
    },
    "sti_symptoms": {
        "weight": 0.25,
        "patterns": [
            r"genital (sore|sores|ulcer|ulcers|blister|blisters)",
            r"burn(s|ing)? when (i )?pee",
            r"penile discharge",
            r"vaginal discharge",
            r"smelly discharge",
        ],
    },
    "multiple_partners": {
        "weight": 0.15,
        "patterns": [
            r"multiple partners?",
            r"more than one (guy|girl|partner)",
            r"one.?night stand",
            r"hook.?up",
            r"cheated",
            r"affair",
            r"sex worker",
        ],
    },
    "partner_hiv_positive_or_unknown": {
        "weight": 0.25,
        "patterns": [
            r"partner.*(hiv\+|hiv positive)",
            r"he is hiv positive",
            r"she is hiv positive",
            # Anchored so "...tion article" / "on arts" do not count as "on art".
            r"\bon (arvs|art)\b",
            r"don.?t know.*partner.?s? status",
        ],
    },
    "sexual_assault": {
        "weight": 0.4,
        "patterns": [
            # Matches rape / raped / rapes as whole words only.
            r"\brape[ds]?\b",
            r"sexual assault",
            r"forced me",
            r"i didn.?t consent",
        ],
    },
    "needle_or_injection": {
        "weight": 0.2,
        "patterns": [
            r"shared (a )?needle",
            r"inject(ing)? drugs?",
        ],
    },
}

# Mental health risk keywords
# Mental health risk keyword table.
# Each category maps to:
#   "weight"   - amount added to the score once if ANY of its patterns matches
#   "patterns" - regular expressions searched in the lower-cased user text,
#                so every pattern is written in lower case
# NOTE: the annotation is looser than the data: "weight" values are floats.
MH_KEYWORDS: Dict[str, Dict[str, List[str]]] = {
    "depression": {
        "weight": 0.25,
        "patterns": [
            r"\b(i feel|feeling) (sad|down|empty|numb)",
            r"no longer enjoy",
            r"lost interest",
            r"no energy",
            r"tired of life",
            r"worthless",
            r"guilty all the time",
        ],
    },
    "anxiety": {
        "weight": 0.2,
        "patterns": [
            r"\b(anxious|anxiety|panic attack|panic attacks)",
            r"heart (is )?racing",
            r"can.?t (breathe|catch my breath)",
            r"on edge",
            r"constant worry",
        ],
    },
    "suicidality_or_self_harm": {
        "weight": 0.6,
        "patterns": [
            r"kill myself",
            r"end it all",
            r"don.?t want to live",
            r"rather be dead",
            # "suicidal" does not contain the substring "suicide", so both
            # endings are listed explicitly.
            r"suicid(e|al)",
            r"hurt myself",
            r"self.?harm",
            r"cut myself",
        ],
    },
    "psychosis": {
        "weight": 0.3,
        "patterns": [
            r"hearing voices",
            r"voices in my head",
            r"seeing things",
            r"people are watching me",
            r"people.*out to get me",
        ],
    },
    "sleep_appetite_concentration": {
        "weight": 0.15,
        "patterns": [
            r"can.?t sleep",
            r"insomnia",
            r"sleeping too much",
            r"no appetite",
            r"not eating",
            r"lost weight",
            r"difficult(y)? concentrating",
        ],
    },
    "substance_use": {
        "weight": 0.15,
        "patterns": [
            r"drinking a lot",
            r"drink too much",
            r"getting drunk every day",
            r"using (weed|dagga|marijuana|drugs|tik|nyaope)",
        ],
    },
}


def _keyword_score(text: str, keyword_dict: Dict[str, Dict[str, List[str]]]) -> float:
    """
    Sum the weights of every category that has at least one matching pattern.

    The text is lower-cased before searching. A category contributes its
    weight once, no matter how many of its patterns match. The total is
    capped at 1.0.
    """
    haystack = text.lower()

    # Start from 0.0 so the result is a float even when nothing matches.
    total = sum(
        (
            entry["weight"]
            for entry in keyword_dict.values()
            if any(re.search(rx, haystack) for rx in entry["patterns"])
        ),
        0.0,
    )

    return min(total, 1.0)


# Risk Scoring & Recommendations

In [8]:


def compute_hiv_risk(user_text: str) -> float:
    """Score *user_text* against the HIV keyword table; the result lies in 0–1."""
    hiv_score = _keyword_score(user_text, keyword_dict=HIV_KEYWORDS)
    return hiv_score


def compute_mental_health_risk(user_text: str) -> float:
    """Score *user_text* against the mental health keyword table; the result lies in 0–1."""
    mh_score = _keyword_score(user_text, keyword_dict=MH_KEYWORDS)
    return mh_score


def risk_level(score: float) -> str:
    """
    Map a numeric score to its band.

    Scores below 0.3 are "low", scores below 0.6 are "moderate",
    and everything else is "high".
    """
    if score < 0.3:
        return "low"
    return "moderate" if score < 0.6 else "high"


def generate_hiv_recommendation(score: float) -> str:
    """Return the HIV recommendation text for the risk band that *score* falls into."""
    low_text = (
        "HIV risk appears LOW based on this conversation. "
        "Provide basic HIV prevention counselling: consistent condom use, "
        "knowing both partners' HIV status, and access to HIV testing at a "
        "local clinic. Offer HIV testing as per routine testing guidelines."
    )
    moderate_text = (
        "HIV risk appears MODERATE. Recommend facility-based HIV testing as "
        "soon as possible and screening for other sexually transmitted infections. "
        "If there was a recent (≤72 hours) high-risk exposure, a clinician should "
        "assess for HIV post-exposure prophylaxis (PEP). If HIV is diagnosed, "
        "initiate same-day ART where feasible and perform baseline clinical and "
        "laboratory evaluation."
    )
    high_text = (
        "HIV risk appears HIGH. Advise IMMEDIATE presentation to the nearest clinic "
        "or emergency department for urgent HIV testing and clinical assessment. "
        "A clinician should evaluate for HIV post-exposure prophylaxis (PEP) if the "
        "exposure was within the last 72 hours, and screen for STIs, pregnancy, and "
        "sexual assault as relevant. If HIV is confirmed, start ART urgently using "
        "a dolutegravir-based regimen with baseline TB and mental health screening."
    )

    # Any band other than "low" or "moderate" gets the high-risk text.
    by_band = {"low": low_text, "moderate": moderate_text}
    return by_band.get(risk_level(score), high_text)


def generate_mental_health_recommendation(score: float) -> str:
    """Return the mental health recommendation text for the risk band that *score* falls into."""
    band = risk_level(score)

    if band == "low":
        recommendation = (
            "Mental health risk appears LOW. Provide psycho-education on stress, "
            "sleep hygiene and coping skills. Normalise help-seeking and inform the "
            "user that they may visit a primary healthcare clinic or community "
            "counsellor if symptoms persist or worsen."
        )
    elif band == "moderate":
        recommendation = (
            "Mental health risk appears MODERATE. Recommend a routine mental health "
            "assessment at the nearest primary healthcare clinic or by an accredited "
            "counsellor. Management may include brief counselling, support groups, "
            "and, if indicated, medication under clinical supervision. Monitor for "
            "any emergence of suicidal thoughts or self-harm and escalate care if present."
        )
    else:
        # Every remaining band is handled as high risk.
        recommendation = (
            "Mental health risk appears HIGH. This suggests possible significant "
            "depression, anxiety, substance use or psychotic symptoms. Recommend "
            "URGENT same-day assessment at a clinic or emergency unit. A clinician "
            "should screen specifically for suicidal or self-harm thoughts, psychosis, "
            "and substance use, and consider emergency referral to specialist mental "
            "health services if safety is a concern. Provide crisis support information "
            "and ensure the person is not left alone if there is immediate risk."
        )

    return recommendation


# Full Conversation Analysis Function

In [9]:


def analyse_conversation(conversation_id: int, conversation_text: str) -> ConversationRiskResult:
    """
    Run the whole scoring pipeline on one conversation.

    Steps: pull out the user's text, score it for HIV and mental health risk,
    translate each score into a level, and attach the matching recommendation.
    The levels and recommendations are derived from the unrounded scores; the
    scores stored on the result are rounded to two decimals.
    """
    # Only the user's own messages are scored.
    text = extract_user_text(conversation_text)

    hiv = compute_hiv_risk(text)
    mh = compute_mental_health_risk(text)

    return ConversationRiskResult(
        conversation_id=conversation_id,
        hiv_risk_score=round(hiv, 2),
        hiv_risk_level=risk_level(hiv),
        mental_health_risk_score=round(mh, 2),
        mental_health_risk_level=risk_level(mh),
        hiv_recommendation=generate_hiv_recommendation(hiv),
        mental_health_recommendation=generate_mental_health_recommendation(mh),
    )


# Run on All Conversations & Build DataFrame

In [10]:
# Read the dataset and split it into one text block per conversation.
conversations = load_conversations(DATA_PATH)
len(conversations)  # bare expression: its value is what the notebook shows as the cell output


100

In [11]:


# Columns of the results table, in display order; each one is an attribute
# of the object returned by analyse_conversation().
_RESULT_FIELDS = (
    "conversation_id",
    "hiv_risk_score",
    "hiv_risk_level",
    "mental_health_risk_score",
    "mental_health_risk_level",
    "hiv_recommendation",
    "mental_health_recommendation",
)

results = []

# The position of a conversation in the list is used as its id (0-based).
for idx, conv in enumerate(conversations):
    out = analyse_conversation(idx, conv)
    results.append({field: getattr(out, field) for field in _RESULT_FIELDS})

df = pd.DataFrame(results)
df.head()


Unnamed: 0,conversation_id,hiv_risk_score,hiv_risk_level,mental_health_risk_score,mental_health_risk_level,hiv_recommendation,mental_health_recommendation
0,0,0.0,low,0.0,low,HIV risk appears LOW based on this conversatio...,Mental health risk appears LOW. Provide psycho...
1,1,0.0,low,0.0,low,HIV risk appears LOW based on this conversatio...,Mental health risk appears LOW. Provide psycho...
2,2,0.0,low,0.0,low,HIV risk appears LOW based on this conversatio...,Mental health risk appears LOW. Provide psycho...
3,3,0.0,low,0.0,low,HIV risk appears LOW based on this conversatio...,Mental health risk appears LOW. Provide psycho...
4,4,0.0,low,0.0,low,HIV risk appears LOW based on this conversatio...,Mental health risk appears LOW. Provide psycho...


In [12]:
# Summary of HIV and mental health risk levels:
# count how many conversations fall into each level. The rename() sets the
# Series name, which becomes the column header once the two are combined.

summary_hiv = df["hiv_risk_level"].value_counts().rename("HIV Risk Count")
summary_mh = df["mental_health_risk_level"].value_counts().rename("Mental Health Risk Count")

# Combine into one summary table, aligned on the risk level.
# A level that occurs in only one of the two Series is NaN in the other column
# after alignment, so fill with 0 and cast the counts back to int.
summary_table = pd.concat([summary_hiv, summary_mh], axis=1).fillna(0).astype(int)

# NOTE(review): display() is not imported in the visible code; presumably it is
# supplied by the notebook environment. Confirm before running as a script.
print("=== Risk Level Summary ===")
display(summary_table)

print("\n=== HIV Risk Distribution ===")
display(summary_hiv)

print("\n=== Mental Health Risk Distribution ===")
display(summary_mh)


=== Risk Level Summary ===


Unnamed: 0,HIV Risk Count,Mental Health Risk Count
low,100,100



=== HIV Risk Distribution ===


hiv_risk_level
low    100
Name: HIV Risk Count, dtype: int64


=== Mental Health Risk Distribution ===


mental_health_risk_level
low    100
Name: Mental Health Risk Count, dtype: int64