In [None]:
# Cell 0 - Install dependencies
# Run once at the top of Colab

!pip install -q transformers sentence-transformers faiss-cpu streamlit fastapi "uvicorn[standard]" datasets nltk huggingface_hub

# For convenience: show versions
import pkg_resources, sys
print("python", sys.version)
for pkg in ["transformers","sentence_transformers","faiss","streamlit","datasets","nltk","huggingface_hub"]:
    try:
        print(pkg, pkg_resources.get_distribution(pkg).version)
    except Exception as e:
        print(pkg, "not found")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.8/510.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.2/452.2 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25h

  import pkg_resources, sys


python 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
transformers 4.56.1
sentence_transformers 5.1.0
faiss not found
streamlit 1.49.1
datasets 4.0.0
nltk 3.9.1
huggingface_hub 0.34.4


In [None]:
# Cell 1 - Hugging Face login (use env var or paste token)
# Recommended: set an env var HF_HUB_TOKEN in Colab runtime (Runtime -> Change runtime -> set env in UI)
from huggingface_hub import login
import os

hf_token = os.getenv("HF_HUB_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("HF_HUB_TOKEN not found. You can call login(token='hf_xxx') manually (do not commit token).")
    # If you must: login(token="hf_...")   # DO NOT PUSH token to GitHub


HF_HUB_TOKEN not found. You can call login(token='hf_xxx') manually (do not commit token).


In [None]:
# Cell 2 - Imports and helper functions
import re, json, os, random
from pathlib import Path
import numpy as np
import pandas as pd

def clean_text(s):
    # basic cleaning: lowercase, remove URLs, extra spaces; keep emojis removal simple
    s = str(s)
    s = re.sub(r'http\S+','', s)
    s = re.sub(r'\s+',' ', s).strip()
    return s


In [None]:
# Cell 3 - Build or load corpus.json (empathetic templates)
# We try to load empathetic_dialogues from HF; if unavailable we create a small curated set.

from datasets import load_dataset

SAMPLE_LIMIT = 1000   # change if you want more
corpus_path = Path("corpus.json")

try:
    ds = load_dataset("empathetic_dialogues", split="train")
    # create simplified templates: map conversation to a template per emotion (use utterance)
    rows = []
    for i, item in enumerate(ds):
        if i>=SAMPLE_LIMIT: break
        emo = item.get("emotion", "neutral")
        # pick a short response (s1 or s2) as template
        utt = item.get("utterances", [])
        # dataset structure may vary; fallback to 'response' fields if present
        # We'll attempt multiple keys
        text = None
        if "utterances" in item and item["utterances"]:
            # choose last utterance text if available
            try:
                text = item["utterances"][-1].get("text", None)
            except Exception:
                text = None
        if not text:
            # fallback to any string content
            text = item.get("response", None) or item.get("sentence", None)
        if not text:
            continue
        rows.append({"emotion": emo, "template": clean_text(text)})
    if len(rows) < 200:
        raise Exception("small corpus from dataset; fallback to manual")
    df = pd.DataFrame(rows).drop_duplicates().reset_index(drop=True)
    df.to_json(corpus_path, orient="records", force_ascii=False)
    print("Saved corpus.json from empathetic_dialogues:", len(df), "templates")
except Exception as e:
    print("Could not load empathetic_dialogues or insufficient rows, creating manual corpus. Error:", e)
    manual = [
        {"emotion":"happiness", "template":"That's wonderful! What do you think made you feel this way?"},
        {"emotion":"happiness", "template":"I’m so glad to hear that — would you like to share more about it?"},
        {"emotion":"sadness", "template":"I'm really sorry you're feeling down. Do you want to tell me more about what's happening?"},
        {"emotion":"sadness", "template":"That sounds really tough. I'm here to listen if you want to talk."},
        {"emotion":"anger", "template":"I can hear how angry that made you feel. Would you like to vent about it?"},
        {"emotion":"anger", "template":"That sounds frustrating — do you want to walk through what happened?"},
        {"emotion":"neutral", "template":"Thanks for sharing. Do you want to say more about it?"},
        {"emotion":"neutral", "template":"I appreciate you telling me. What's on your mind next?"},
        # Add more to reach ~100 templates if possible...
    ]
    # replicate with small variations for demo
    while len(manual) < 200:
        for m in manual[:8]:
            manual.append({"emotion": m["emotion"], "template": m["template"]})
            if len(manual)>=200: break
    pd.DataFrame(manual).to_json(corpus_path, orient="records", force_ascii=False)
    print("Saved manual corpus.json with", len(manual), "templates.")

# show small sample
with open(corpus_path, 'r', encoding='utf-8') as f:
    sample = json.load(f)[:8]
sample


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

empathetic_dialogues.py: 0.00B [00:00, ?B/s]

Could not load empathetic_dialogues or insufficient rows, creating manual corpus. Error: Dataset scripts are no longer supported, but found empathetic_dialogues.py
Saved manual corpus.json with 200 templates.


[{'emotion': 'happiness',
  'template': "That's wonderful! What do you think made you feel this way?"},
 {'emotion': 'happiness',
  'template': 'I’m so glad to hear that — would you like to share more about it?'},
 {'emotion': 'sadness',
  'template': "I'm really sorry you're feeling down. Do you want to tell me more about what's happening?"},
 {'emotion': 'sadness',
  'template': "That sounds really tough. I'm here to listen if you want to talk."},
 {'emotion': 'anger',
  'template': 'I can hear how angry that made you feel. Would you like to vent about it?'},
 {'emotion': 'anger',
  'template': 'That sounds frustrating — do you want to walk through what happened?'},
 {'emotion': 'neutral',
  'template': 'Thanks for sharing. Do you want to say more about it?'},
 {'emotion': 'neutral',
  'template': "I appreciate you telling me. What's on your mind next?"}]

In [None]:
# Cell 4 - Load emotion detector and test on simple samples
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Recommended model for emotions:
EMO_MODEL = "bhadresh-savani/distilbert-base-uncased-emotion"

tokenizer_em = AutoTokenizer.from_pretrained(EMO_MODEL)
model_em = AutoModelForSequenceClassification.from_pretrained(EMO_MODEL)
emo_pipe = pipeline("text-classification", model=model_em, tokenizer=tokenizer_em, return_all_scores=True)

# Test quick samples
test_samples = ["I'm overjoyed today!", "I feel really low", "I'm so angry right now", "I'm okay, thanks"]
for s in test_samples:
    out = emo_pipe(s)[0]  # list of scores
    # choose top label
    top = max(out, key=lambda x: x['score'])
    print(s, "->", top['label'], f"({top['score']:.2f})")


Device set to use cpu


I'm overjoyed today! -> joy (1.00)
I feel really low -> sadness (1.00)
I'm so angry right now -> anger (1.00)
I'm okay, thanks -> joy (1.00)




In [None]:
# Cell 5 - Build embeddings using Sentence-Transformers and FAISS index
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm

embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(embed_model_name)

# load corpus
with open("corpus.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)

# ensure each item has emotion and template
corpus = [c for c in corpus if "emotion" in c and "template" in c]
templates = [c["template"] for c in corpus]
emotions = [c["emotion"] for c in corpus]

# create embeddings (batch)
batch_size = 128
embs = embedder.encode(templates, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)

# build FAISS index (cosine similarity -> use normalized vectors)
d = embs.shape[1]
faiss.normalize_L2(embs)
index = faiss.IndexFlatIP(d)  # inner product on normalized vectors = cosine similarity
index.add(embs)
print("Built FAISS index:", index.ntotal)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Built FAISS index: 200


In [None]:
# Cell 6 - Retrieval function: get top-k templates filtered by detected emotion
import numpy as np

def retrieve_templates(user_text, detected_emotion, top_k=5):
    # embed
    q = embedder.encode([user_text], convert_to_numpy=True)
    faiss.normalize_L2(q)
    D, I = index.search(q, top_k*3)  # overfetch
    # flatten indices and filter by emotion
    res = []
    for idx in I[0]:
        if idx < 0 or idx >= len(templates): continue
        if emotions[idx].lower() == detected_emotion.lower():
            res.append({"template": templates[idx], "score": float(D[0][list(I[0]).index(idx)]) if len(D[0])>0 else 0.0})
        if len(res) >= top_k:
            break
    # fallback: if no same-emotion matches, just take top_k overall
    if not res:
        res = [{"template": templates[i], "score": float(D[0][j])} for j,i in enumerate(I[0][:top_k])]
    return res[:top_k]

# demo
print(retrieve_templates("I feel sad and don't want to get out of bed", "sadness", top_k=3))


[{'template': "I'm really sorry you're feeling down. Do you want to tell me more about what's happening?", 'score': 0.40030646324157715}, {'template': "I'm really sorry you're feeling down. Do you want to tell me more about what's happening?", 'score': 0.40030646324157715}, {'template': "I'm really sorry you're feeling down. Do you want to tell me more about what's happening?", 'score': 0.40030646324157715}]


In [None]:
# Cell 7 - Safety checks (basic) and prompt builder for few-shot framing
# We'll do simple keyword-based self-harm detection and a replacement response if detected.

SELF_HARM_KEYWORDS = [
    "kill myself", "end my life", "suicide", "i want to die", "hurt myself", "i'm going to end it"
]

def detect_self_harm(text):
    t = text.lower()
    for kw in SELF_HARM_KEYWORDS:
        if kw in t:
            return True
    return False

DISCLAIMER = "I’m not a therapist; please seek professional help for serious issues. If you're in immediate danger, contact local emergency services."

def build_response(user_text, detected_emotion, retrieved_templates):
    # Simple few-shot framing: combine templates into final answer
    # We will pick the top template and wrap with empathetic phrasing and disclaimer
    top_template = retrieved_templates[0]['template'] if retrieved_templates else "Thanks for sharing."
    response = f"{top_template}\n\n{DISCLAIMER}"
    return response

# test builder
print(build_response("I'm depressed", "sadness", retrieve_templates("I'm depressed","sadness",3)))


I'm really sorry you're feeling down. Do you want to tell me more about what's happening?

I’m not a therapist; please seek professional help for serious issues. If you're in immediate danger, contact local emergency services.


In [None]:
# Cell 8 - Full pipeline function
def empathy_pipeline(user_text):
    txt = clean_text(user_text)
    # safety
    if detect_self_harm(txt):
        # safe fallback
        return {
            "emotion":"self_harm_flagged",
            "response":"I'm sorry you're feeling that way. If you're in danger or thinking about harming yourself, please contact emergency services or a crisis hotline immediately. " + DISCLAIMER
        }
    # emotion detection
    scores = emo_pipe(txt)[0]
    # select top emotion label
    top = max(scores, key=lambda x: x['score'])
    emotion = top['label']  # check model labels mapping if needed
    # retrieve templates
    templates_found = retrieve_templates(txt, emotion, top_k=3)
    # build response with few-shot framing
    final = build_response(txt, emotion, templates_found)
    return {"emotion": emotion, "response": final, "templates": templates_found}

# demo interaction
for msg in ["I am really happy today!", "I don't want to get up", "I'm furious about my boss"]:
    out = empathy_pipeline(msg)
    print("USER:", msg)
    print("EMOTION:", out['emotion'])
    print("BOT:", out['response'])
    print("---")


USER: I am really happy today!
EMOTION: joy
BOT: That's wonderful! What do you think made you feel this way?

I’m not a therapist; please seek professional help for serious issues. If you're in immediate danger, contact local emergency services.
---
USER: I don't want to get up
EMOTION: anger
BOT: I'm really sorry you're feeling down. Do you want to tell me more about what's happening?

I’m not a therapist; please seek professional help for serious issues. If you're in immediate danger, contact local emergency services.
---
USER: I'm furious about my boss
EMOTION: anger
BOT: I can hear how angry that made you feel. Would you like to vent about it?

I’m not a therapist; please seek professional help for serious issues. If you're in immediate danger, contact local emergency services.
---


In [None]:
# Cell 9 - Test multiple messages without input()
messages = [
    "I am happy today!",
    "I feel so lonely...",
    "Life is really hard right now",
    "I just got a promotion!",
    "I'm very angry!"
]

for msg in messages:
    out = empathy_pipeline(msg)
    print("You:", msg)
    print("Detected emotion:", out['emotion'])
    print("Bot:", out['response'])
    print("-" * 50)


You: I am happy today!
Detected emotion: joy
Bot: That's wonderful! What do you think made you feel this way?

I’m not a therapist; please seek professional help for serious issues. If you're in immediate danger, contact local emergency services.
--------------------------------------------------
You: I feel so lonely...
Detected emotion: sadness
Bot: That sounds really tough. I'm here to listen if you want to talk.

I’m not a therapist; please seek professional help for serious issues. If you're in immediate danger, contact local emergency services.
--------------------------------------------------
You: Life is really hard right now
Detected emotion: joy
Bot: That sounds really tough. I'm here to listen if you want to talk.

I’m not a therapist; please seek professional help for serious issues. If you're in immediate danger, contact local emergency services.
--------------------------------------------------
You: I just got a promotion!
Detected emotion: joy
Bot: I’m so glad to hear 

In [32]:
# Cell 10 - Save necessary artifacts to disk: corpus.json (already saved), and small metadata
Path("artifacts").mkdir(exist_ok=True)
# save a minimal index (templates + emotions)
with open("artifacts/corpus.json","w",encoding="utf-8") as f:
    json.dump(corpus, f, ensure_ascii=False, indent=2)
print("Saved artifacts/corpus.json")


Saved artifacts/corpus.json


In [33]:
!pip install pyngrok
from pyngrok import ngrok
import threading, os

def run_app():
    os.system("streamlit run app.py --server.port 8501")

thread = threading.Thread(target=run_app)
thread.start()

# ngrok tunnel
public_url = ngrok.connect(8501)
print("Your app is live at:", public_url)


Your app is live at: NgrokTunnel: "https://d171253e6302.ngrok-free.app" -> "http://localhost:8501"
