In [1]:
!pip install -q pandas numpy scikit-learn matplotlib seaborn nltk spacy transformers datasets sentence-transformers faiss-cpu langdetect openpyxl accelerate torch torchaudio sentencepiece langid gradio librosa soundfile pydub ffmpeg-python

import os
import re
import unicodedata
from pathlib import Path

import pandas as pd
import numpy as np

# ML/NLP imports
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss

print("✅ Environment ready and all libraries imported.")



✅ Environment ready and all libraries imported.


In [2]:
# Local folder reserved for data files; it is created on first run if it does not exist yet.
DATA_DIR = Path('data')
DATA_DIR.mkdir(exist_ok=True)

def safe_read_csv(path):
    """Read a CSV file into a DataFrame without ever raising.

    Args:
        path: Anything accepted by ``pd.read_csv`` (file path or buffer).

    Returns:
        The parsed DataFrame, or an empty DataFrame when reading fails.
        A warning naming the path and the error is printed on failure.
    """
    try:
        frame = pd.read_csv(path)
    except Exception as e:
        print(f"⚠️ Failed to load {path}: {e}")
        frame = pd.DataFrame()
    return frame

def safe_read_excel(path):
    """Read an Excel workbook into a DataFrame without ever raising.

    Args:
        path: Anything accepted by ``pd.read_excel``.

    Returns:
        The parsed DataFrame, or an empty DataFrame when reading fails.
        A warning naming the path and the error is printed on failure.
    """
    try:
        frame = pd.read_excel(path)
    except Exception as e:
        print(f"⚠️ Failed to load {path}: {e}")
        frame = pd.DataFrame()
    return frame

# File paths (relative to the notebook's working directory).
# `patients` is read strictly because the cleaning and name-matching cells
# cannot work without it. Every other table goes through the safe_read_*
# helpers, so a missing or unreadable file yields an empty frame that the
# later `.empty` checks skip instead of aborting the whole notebook.
schemes_df = safe_read_excel("health_schemes.xlsx")
patients = pd.read_csv('patients.csv.txt')
appointments = safe_read_csv('appointments.csv.txt')
vitals = safe_read_csv('vitals.csv.txt')
labs = safe_read_csv('lab_results.csv.txt')
diagnoses = safe_read_csv('diagnoses.csv.txt')
lifestyle = safe_read_csv('lifestyle.csv.txt')
med_history = safe_read_csv('medical_history.csv.txt')
ml_preds = safe_read_csv('ml_predictions.csv.txt')

# Name -> frame registry, used here only to report the loaded shapes.
data = {
    'patients': patients,
    'appointments': appointments,
    'vitals': vitals,
    'labs': labs,
    'diagnoses': diagnoses,
    'lifestyle': lifestyle,
    'med_history': med_history,
    'ml_preds': ml_preds
}

print("✅ Loaded datasets:")
for name, df in data.items():
    print(f"{name}: {df.shape}")


✅ Loaded datasets:
patients: (10, 10)
appointments: (3, 6)
vitals: (3, 10)
labs: (5, 7)
diagnoses: (1, 6)
lifestyle: (10, 7)
med_history: (5, 5)
ml_preds: (5, 7)


In [3]:
def normalize_text(text):
    """Return a tidy, NFKC-normalised string for a single cell value.

    Missing values (as judged by ``pd.isna``) become ``''``. Anything else is
    converted with ``str``, Unicode-normalised (NFKC), stripped at both ends,
    and has every run of whitespace collapsed to a single space.
    """
    if pd.isna(text):
        return ''
    normalized = unicodedata.normalize('NFKC', str(text)).strip()
    return re.sub(r'\s+', ' ', normalized)

# Clean patient names
# `DataFrame.get(col, '')` hands back the plain string '' when the column is
# absent, and a string has no `.apply`; a missing name column is therefore
# created empty here instead of crashing the cell.
for name_col in ('first_name', 'last_name'):
    if name_col in patients.columns:
        patients[name_col] = patients[name_col].apply(normalize_text)
    else:
        patients[name_col] = ''
# Filled in place on purpose: the `data` dict built in the loading cell refers to this same frame.
patients.fillna('', inplace=True)
print("✅ Patients dataset cleaned")


✅ Patients dataset cleaned


  patients.fillna('', inplace=True)


In [4]:
# Start the unified profile table from a copy of the patients table
patient_profiles = patients.copy()

# Attach lifestyle columns; the left join keeps patients that have no lifestyle row
if not lifestyle.empty:
    patient_profiles = pd.merge(patient_profiles, lifestyle, on='patient_id', how='left')

# Attach medical history, collapsed to one list of condition names per patient_id
if not med_history.empty:
    conditions_by_patient = med_history.groupby('patient_id')['condition_name']
    mh_summary = conditions_by_patient.apply(list).reset_index()
    patient_profiles = pd.merge(patient_profiles, mh_summary, on='patient_id', how='left')

print("✅ Unified patient profiles sample:")
display(patient_profiles.head(2))


✅ Unified patient profiles sample:


Unnamed: 0,patient_id,first_name,last_name,date_of_birth,gender,blood_type,phone_number,email,address,created_at,lifestyle_id,smoking_status,alcohol_consumption,activity_level,diet_type,sleep_hours_per_night,condition_name
0,1,John,Doe,1980-05-15,Male,O+,555-0101,,,,1,Current,Heavy,Sedentary,,5.5,[Hypertension]
1,2,Jane,Smith,1975-08-22,Female,A-,555-0102,,,,2,Never,,Moderate,,7.0,


In [5]:
# Four (question, intent) training pairs are generated for every named patient
synthetic_queries = []
for _, row in patients.iterrows():
    name = f"{row.get('first_name','')} {row.get('last_name','')}".strip()
    if name:
        synthetic_queries += [
            (f"What is {name}'s latest cholesterol level?", 'get_lab_result'),
            (f"Show {name}'s blood pressure readings", 'get_vitals'),
            (f"List diagnoses for {name}", 'get_diagnosis'),
            (f"What is {name}'s cardio risk prediction?", 'get_risk_prediction'),
        ]

syn_df = pd.DataFrame(synthetic_queries, columns=['text','intent'])
display(syn_df.head())


Unnamed: 0,text,intent
0,What is John Doe's latest cholesterol level?,get_lab_result
1,Show John Doe's blood pressure readings,get_vitals
2,List diagnoses for John Doe,get_diagnosis
3,What is John Doe's cardio risk prediction?,get_risk_prediction
4,What is Jane Smith's latest cholesterol level?,get_lab_result


In [6]:
def extract_patient_name(text, patients_df=None):
    """Find the first known patient whose full name appears in ``text``.

    Matching is a case-insensitive substring test of "first_name last_name".

    Args:
        text: Free-text query to scan. Empty or ``None`` yields ``None``.
        patients_df: Optional frame with ``first_name`` / ``last_name``
            columns. Defaults to the notebook-level ``patients`` frame.

    Returns:
        The matching full name as stored in the frame, or ``None``.
    """
    if patients_df is None:
        patients_df = patients
    if not text:
        return None

    # Lower-case the query once instead of once per patient row.
    haystack = str(text).lower()
    for _, row in patients_df.iterrows():
        full = f"{row.get('first_name','')} {row.get('last_name','')}".strip()
        # A blank name is a substring of every text; skip it so it is never
        # reported as a match.
        if full and full.lower() in haystack:
            return full
    return None

def extract_test_name(text):
    """Return the first known lab/vital keyword mentioned in ``text``.

    Keywords are matched case-insensitively as whole words or phrases, so a
    short code such as ``bp`` is not picked up from inside another word
    (for example "subpar").

    Args:
        text: Free-text query to scan. Empty or ``None`` yields ``None``.

    Returns:
        The matched keyword exactly as listed in ``tests``, or ``None``.
    """
    if not text:
        return None
    haystack = str(text).lower()
    tests = ['cholesterol', 'hba1c', 'glucose', 'blood pressure', 'bp']
    for test in tests:
        # Lookarounds require a non-word character (or string edge) on both sides.
        if re.search(rf'(?<!\w){re.escape(test)}(?!\w)', haystack):
            return test
    return None

# Demo: run both extractors on the same sample question
demo_query = "What is John Doe's cholesterol level?"
print(extract_patient_name(demo_query))
print(extract_test_name(demo_query))


John Doe
cholesterol


In [7]:
# FAISS index for lab values
if labs.empty:
    index_labs = None
    print("⚠️ Labs dataset empty, skipping FAISS embedding")
else:
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    # One short "name: value unit" sentence per lab row
    texts = [f"{r['test_name']}: {r['test_value']} {r['unit']}" for _, r in labs.iterrows()]
    embeddings = embedder.encode(texts, convert_to_numpy=True)
    # Flat L2 index sized to the embedding width
    embedding_dim = embeddings.shape[1]
    index_labs = faiss.IndexFlatL2(embedding_dim)
    index_labs.add(embeddings)
    print("✅ FAISS index for labs ready")




✅ FAISS index for labs ready


In [8]:
import torch

# Seq2seq checkpoint that generates the free-text answers
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# device: 0 when CUDA is available, otherwise -1
chat_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
print("✅ Flan-T5 chat pipeline ready")


✅ Flan-T5 chat pipeline ready


In [9]:
import langid

# Languages the assistant accepts, keyed by two-letter code
SUPPORTED_LANGS = dict(en="English", hi="Hindi")
# Two-letter code -> three-letter suffix used to build the "facebook/mms-tts-*" model id
MMS_TTS_CODES = dict(en="eng", hi="hin")

# Helsinki-NLP OPUS-MT checkpoints, one per translation direction
EN_HI_CHECKPOINT = "Helsinki-NLP/opus-mt-en-hi"
HI_EN_CHECKPOINT = "Helsinki-NLP/opus-mt-hi-en"

en_hi_tokenizer = AutoTokenizer.from_pretrained(EN_HI_CHECKPOINT)
en_hi_model = AutoModelForSeq2SeqLM.from_pretrained(EN_HI_CHECKPOINT)
hi_en_tokenizer = AutoTokenizer.from_pretrained(HI_EN_CHECKPOINT)
hi_en_model = AutoModelForSeq2SeqLM.from_pretrained(HI_EN_CHECKPOINT)

def detect_language_safe(text):
    """Guess the language of ``text``, restricted to the supported set.

    Args:
        text: User input. Empty, whitespace-only or non-string input is
            treated as English without calling the classifier.

    Returns:
        A key of ``SUPPORTED_LANGS``; ``'en'`` is the fallback whenever
        detection fails or reports an unsupported language.
    """
    if not isinstance(text, str) or not text.strip():
        return 'en'
    try:
        lid, _ = langid.classify(text)
    except Exception:
        # Best-effort detection: fall back to English, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare `except:` would.
        return 'en'
    return lid if lid in SUPPORTED_LANGS else 'en'

def translate_to_english(text):
    """Translate ``text`` to English when it is not already English.

    Returns:
        A ``(english_text, detected_lang)`` tuple. English input is returned
        unchanged with ``"en"``; anything else is run through the
        Hindi->English model.
    """
    lang = detect_language_safe(text)
    if lang != "en":
        batch = hi_en_tokenizer([text], return_tensors="pt", padding=True, truncation=True)
        generated = hi_en_model.generate(**batch, max_new_tokens=512)
        english = hi_en_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        return english, lang
    return text, "en"

def translate_from_english(text, target_lang):
    """Translate English ``text`` with the English->Hindi model.

    Args:
        text: English text to translate.
        target_lang: Two-letter target code. ``"en"`` returns ``text``
            unchanged; any other value uses the English->Hindi model, the
            only outbound model loaded in this notebook.

    Returns:
        The translated string, or ``text`` itself when no translation is
        needed or there is nothing to translate.
    """
    if target_lang == "en" or not text or not text.strip():
        return text

    # Only prepend the ">>hi<<" target token when the tokenizer declares it.
    # NOTE(review): on a single-target checkpoint the token is presumably not
    # in the vocabulary and would only add noise to the input — confirm.
    source = text
    lang_token = ">>hi<<"
    if lang_token in getattr(en_hi_tokenizer, "supported_language_codes", ()):
        source = f"{lang_token} {text}"

    tok = en_hi_tokenizer([source], return_tensors="pt", padding=True, truncation=True)
    out = en_hi_model.generate(**tok, max_new_tokens=512)
    return en_hi_tokenizer.batch_decode(out, skip_special_tokens=True)[0]

print("✅ Translation pipeline ready")




✅ Translation pipeline ready


In [10]:
import json

# Prepare KB entries from schemes: one {"id", "text"} record per row that has a Description
kb_entries = (
    []
    if schemes_df.empty
    else [
        {"id": row_label, "text": record["Description"]}
        for row_label, record in schemes_df.iterrows()
        if pd.notna(record.get("Description"))
    ]
)

# Build FAISS index
if not kb_entries:
    index_schemes = None
    embedder_schemes = None
    print("⚠️ No schemes to index")
else:
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    texts = [entry['text'] for entry in kb_entries]
    emb = embed_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    # Vectors are L2-normalised in place before going into the inner-product index
    faiss.normalize_L2(emb)
    idx = faiss.IndexFlatIP(emb.shape[1])
    idx.add(emb)
    index_schemes, embedder_schemes = idx, embed_model
    # Save the indexed entries as JSON in the working directory
    with open('kb_meta.json','w') as f:
        json.dump(kb_entries, f)
    print(f"✅ FAISS index built for {len(kb_entries)} govt schemes")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ FAISS index built for 20 govt schemes


In [11]:
# Trigger words (English and Hindi) that mark a query as being about government schemes
GOVT_KEYWORDS = ["scheme", "government", "govt", "योजना", "सरकारी", "आयुष्मान", "health plan"]

def is_govt_query(user_text):
    """Return True when ``user_text`` contains any government-scheme keyword.

    The comparison is a case-insensitive substring test against
    ``GOVT_KEYWORDS``.
    """
    lowered = user_text.lower()
    for keyword in GOVT_KEYWORDS:
        if keyword.lower() in lowered:
            return True
    return False

def fetch_govt_scheme(user_text, top_k=3):
    """Look up the government schemes most similar to a scheme-related query.

    Args:
        user_text: The user's query.
        top_k: Maximum number of schemes to return.

    Returns:
        A bulleted, human-readable list of scheme descriptions, or ``None``
        when no index is available, the query is not scheme-related, or
        nothing was retrieved.
    """
    # Compare against None explicitly rather than relying on the truthiness
    # of the index object.
    if index_schemes is None or not kb_entries:
        return None
    if not is_govt_query(user_text):
        return None

    # Never ask for more neighbours than there are entries.
    k = min(int(top_k), len(kb_entries))
    if k <= 0:
        return None

    query_emb = embedder_schemes.encode([user_text], convert_to_numpy=True)
    faiss.normalize_L2(query_emb)
    _, neighbours = index_schemes.search(query_emb, k)

    # FAISS pads missing results with -1; as a Python index that would
    # silently select the last entry, so negative labels are rejected too.
    schemes = [kb_entries[i]["text"] for i in neighbours[0] if 0 <= i < len(kb_entries)]
    if schemes:
        return "Here are some government schemes you may be eligible for:\n" + "\n".join(f"- {s}" for s in schemes)
    return None


In [12]:
def fetch_medical_advice(user_text):
    """Ask the chat model for short medical advice about ``user_text``.

    Returns:
        The generated text of the first candidate produced by ``chat_pipeline``.
    """
    generations = chat_pipeline(
        f"Provide safe, informative, concise medical advice:\n{user_text}",
        max_new_tokens=150,
    )
    first_candidate = generations[0]
    return first_candidate['generated_text']

def fetch_personalized_response(user_text):
    """Answer ``user_text`` in the user's own language.

    The query is translated to English, enriched with matching government
    schemes when there are any, answered by the chat model, and the answer
    is translated back to the detected source language.
    """
    text_en, src_lang = translate_to_english(user_text)
    scheme_resp = fetch_govt_scheme(text_en)

    # Assemble the prompt piece by piece; the scheme section is optional.
    prompt_parts = [f"User asked: {text_en}\n"]
    if scheme_resp:
        prompt_parts.append(f"Relevant government schemes:\n{scheme_resp}\n")
    prompt_parts.append("Now give a safe, concise medical advice if applicable.")

    generations = chat_pipeline("".join(prompt_parts), max_new_tokens=200)
    answer_en = generations[0]['generated_text']

    return translate_from_english(answer_en, src_lang)


In [13]:
# ------------------------------
# Open-ended AI inference
# ------------------------------
def infer_final(query, user_lang='auto'):
    """Generate an AI response for a health-related or consultation query.

    Steps: determine the user's language, translate the query to English if
    needed, extract any patient/test mention, look up government schemes for
    scheme-related queries, generate an answer with the chat model, and
    translate the answer back to the user's language.

    Args:
        query: The user's question.
        user_lang: ``'auto'`` to detect the language, or a two-letter code.
            Codes outside ``SUPPORTED_LANGS`` are treated as ``'en'``.

    Returns:
        dict with keys ``query_original``, ``user_lang``, ``query_translated``,
        ``scheme_suggestion``, ``answer_english``, ``answer_final``,
        ``patient_name`` and ``test_name``.
    """
    # 1. Determine the user's language
    lang = detect_language_safe(query) if user_lang == 'auto' else user_lang
    if lang not in SUPPORTED_LANGS:
        lang = 'en'

    # 2. Translate to English if needed, using the notebook's translation
    #    helper (the names referenced here previously were never defined).
    translated_q = query
    if lang != 'en':
        translated_q, _ = translate_to_english(query)

    # 3. Extract patient info if present
    patient = extract_patient_name(translated_q)
    test = extract_test_name(translated_q)

    # 4. Fetch govt scheme if the query is relevant
    scheme_resp = None
    if is_govt_query(translated_q) and 'kb_entries' in globals() and kb_entries:
        scheme_resp = fetch_govt_scheme(translated_q)

    # 5. Prepare the prompt for the chat model
    prompt = f"User asked: {translated_q}\n"
    if scheme_resp:
        prompt += f"Relevant government schemes:\n{scheme_resp}\n"
    prompt += "Provide a safe, polite, and concise medical advice or consultation if applicable."

    # 6. Generate the response; a generation failure is reported as the answer text
    try:
        result_en = chat_pipeline(prompt, max_new_tokens=200)[0]['generated_text']
    except Exception as e:
        result_en = f"Error generating AI response: {e}"

    # 7. Translate back to the user's language if needed
    result_final = result_en
    if lang != 'en':
        result_final = translate_from_english(result_en, lang)

    # 8. Return all useful info
    return {
        "query_original": query,
        "user_lang": lang,
        "query_translated": translated_q,
        "scheme_suggestion": scheme_resp,
        "answer_english": result_en,
        "answer_final": result_final,
        "patient_name": patient,
        "test_name": test
    }


In [14]:
from scipy.io.wavfile import write as wavwrite
import numpy as np
from transformers import pipeline

device_id = 0 if torch.cuda.is_available() else -1

# Loaded TTS pipelines keyed by two-letter language code, so each model is
# loaded at most once per kernel session instead of on every request.
_TTS_PIPELINES = {}

def get_tts_pipeline_for_lang(lang_2):
    """Return the text-to-speech pipeline for a two-letter language code.

    Args:
        lang_2: Key of ``MMS_TTS_CODES`` (for example ``"en"`` or ``"hi"``).

    Returns:
        A ``transformers`` text-to-speech pipeline for ``facebook/mms-tts-<code>``.

    Raises:
        ValueError: If no TTS model is configured for ``lang_2``.
    """
    if lang_2 not in MMS_TTS_CODES:
        raise ValueError(f"TTS not configured for: {lang_2}")
    if lang_2 not in _TTS_PIPELINES:
        code3 = MMS_TTS_CODES[lang_2]
        model_id = f"facebook/mms-tts-{code3}"
        _TTS_PIPELINES[lang_2] = pipeline("text-to-speech", model=model_id, device=device_id)
    return _TTS_PIPELINES[lang_2]

def synthesize_speech(text, lang_2, out_wav="tts_output.wav"):
    """Synthesize ``text`` to a 16-bit PCM WAV file.

    Args:
        text: Text to speak.
        lang_2: Two-letter language code understood by
            ``get_tts_pipeline_for_lang``.
        out_wav: Destination path of the WAV file.

    Returns:
        ``out_wav``, the path that was written.

    Raises:
        ValueError: Propagated when no TTS model is configured for ``lang_2``.
    """
    tts = get_tts_pipeline_for_lang(lang_2)
    out = tts(text)
    sr = out["sampling_rate"]

    # NOTE(review): the pipeline appears to return mono audio with a leading
    # batch axis in some versions; squeezing leaves plain 1-D audio untouched
    # and keeps the WAV writer from reading that axis as channels — confirm.
    audio = np.squeeze(np.asarray(out["audio"], dtype=np.float32))

    # Clip to [-1, 1] first: samples outside that range would wrap around
    # when cast to int16.
    audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    wavwrite(out_wav, sr, audio_int16)
    return out_wav

# Demo (uncomment to try; `Audio` comes from IPython.display)
# path = synthesize_speech("नमस्ते, आप कैसे हैं?", "hi", "hello_hi.wav")
# from IPython.display import Audio; display(Audio(path))


In [15]:
def multilingual_infer(user_text, target_lang=None):
    """Answer a query in any supported language, optionally forcing the output language.

    Args:
        user_text: The user's question.
        target_lang: Two-letter code of the desired output language, or
            ``None`` to answer in the detected input language.

    Returns:
        dict with keys ``src_lang``, ``out_lang``, ``input_english``,
        ``answer_english`` and ``answer_final``.
    """
    # Detect language & translate
    text_en, src_lang = translate_to_english(user_text)

    # AI medical advice
    medical_resp = fetch_medical_advice(text_en)

    # fetch_govt_scheme already returns None for queries that are not about
    # schemes, so no separate keyword check is needed here.
    scheme_resp = fetch_govt_scheme(text_en)

    ans_en = medical_resp
    if scheme_resp:
        # The scheme text starts with its own heading; a second one is not added.
        ans_en += "\n\n" + scheme_resp

    out_lang = target_lang or src_lang
    ans_out = translate_from_english(ans_en, out_lang)

    return {
        "src_lang": src_lang,
        "out_lang": out_lang,
        "input_english": text_en,
        "answer_english": ans_en,
        "answer_final": ans_out,
    }


In [16]:
import gradio as gr
from typing import Optional

def gradio_pipeline(input_text: str, input_audio: Optional[str], force_output_lang: Optional[str]):
    """Gradio callback: answer a typed question and synthesize the reply.

    Args:
        input_text: Text typed by the user (may be empty or ``None``).
        input_audio: File path of recorded/uploaded audio, if any.
            Transcription is not implemented, so audio-only input is
            answered with an explanatory message.
        force_output_lang: Two-letter code to force the output language;
            ignored unless it is a key of ``SUPPORTED_LANGS``.

    Returns:
        A 6-tuple matching the interface outputs: detected input language,
        output language, input in English, answer in English, final answer,
        and the path of the synthesized WAV (``None`` when TTS fails).
    """
    text = input_text.strip() if input_text else ""

    if not text:
        # Without text there is nothing to send to the model. The message goes
        # into the "Final Answer" slot rather than the language box.
        if input_audio:
            message = "Audio input is not supported yet. Please type your question."
        else:
            message = "No input received."
        return None, None, None, None, message, None

    target_lang = force_output_lang if force_output_lang in SUPPORTED_LANGS else None
    result = multilingual_infer(text, target_lang=target_lang)

    # TTS is best-effort: a failure is logged and the text answer is still returned.
    try:
        wav_path = synthesize_speech(result["answer_final"], result["out_lang"], out_wav="response_tts.wav")
    except Exception as e:
        print(f"TTS error: {e}")
        wav_path = None

    return (
        result["src_lang"],
        result["out_lang"],
        result["input_english"],
        result["answer_english"],
        result["answer_final"],
        wav_path
    )

# Launch Gradio
# Input widgets, in the positional order expected by gradio_pipeline
input_components = [
    gr.Textbox(label="Text input"),
    gr.Audio(sources=["microphone","upload"], type="filepath", label="Or speak/upload audio"),
    gr.Dropdown(choices=list(SUPPORTED_LANGS.keys()), value=None, label="Force output language"),
]

# Output widgets, in the order of the tuple returned by gradio_pipeline
output_components = [
    gr.Textbox(label="Detected input language"),
    gr.Textbox(label="Output language"),
    gr.Textbox(label="Input in English"),
    gr.Textbox(label="Answer in English"),
    gr.Textbox(label="Final Answer"),
    gr.Audio(label="TTS Audio"),
]

demo = gr.Interface(
    fn=gradio_pipeline,
    inputs=input_components,
    outputs=output_components,
    title="Multilingual AI Health + Govt Scheme Chatbot",
    description="Ask medical questions or govt scheme queries in English/Hindi. Outputs text + speech.",
)

demo.launch(share=False)


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


