In [None]:
# Step 1: Load & Clean Data
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is read later in this notebook.
nltk.download('stopwords')

In [None]:
# Read the raw chatbot dataset from a CSV file in the working directory
file_path = 'ai-medical-chatbot.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Discard (in place) every record that lacks one of the three text fields
# the following cells read.
required_columns = ["Description", "Patient", "Doctor"]
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Model input = description + patient message (space-separated);
# model output = the doctor's reply, kept verbatim.
description_text = df["Description"].astype(str)
patient_text = df["Patient"].astype(str)
df["RawInput"] = description_text + " " + patient_text
df["Output"] = df["Doctor"].astype(str)

In [None]:
# Clean input only
def clean_text(text):
    """Normalize free text into lowercase, single-space-separated word tokens.

    Steps, in order: lowercase, remove URLs, remove @mentions and '#' marks,
    turn punctuation and digit runs into spaces, collapse whitespace.

    Punctuation and digits are replaced by a space (not deleted) so that words
    separated only by them, e.g. "fever,cough" or "pain/swelling", remain two
    tokens instead of fusing into one.

    Args:
        text: The string to normalize.

    Returns:
        str: The normalized text; empty if nothing but noise was present.
    """
    text = text.lower()
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'@\w+|#', '', text)
    # Substitute a space to keep a token boundary where the symbol/digits were.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Build the English stop-word set once, then clean each raw input and
# drop its stop words (the Output column is left untouched).
stop_words = set(stopwords.words('english'))


def _strip_stop_words(raw_text):
    """Clean ``raw_text`` with ``clean_text`` and drop tokens found in ``stop_words``."""
    kept_tokens = [token for token in clean_text(raw_text).split() if token not in stop_words]
    return ' '.join(kept_tokens)


df["CleanInput"] = df["RawInput"].apply(_strip_stop_words)

In [None]:
# Keep only the two columns the retrieval step needs and write them to CSV
df_cleaned = df.loc[:, ["CleanInput", "Output"]]
df_cleaned.to_csv(path_or_buf="cleaned_docbot_dataset.csv", index=False)
print("Cleaned data saved to 'cleaned_docbot_dataset.csv'")

In [None]:
# Re-read the cleaned dataset from disk; `df` now refers to this reloaded frame
cleaned_csv_path = "cleaned_docbot_dataset.csv"
df = pd.read_csv(cleaned_csv_path)

In [None]:
# Fill missing (if any).
# Order matters: fill NaN *before* casting to str. Casting first turns NaN into
# the literal text 'nan', after which fillna('') has nothing left to fill.
df['CleanInput'] = df['CleanInput'].fillna('').astype(str)
df['Output'] = df['Output'].fillna('').astype(str)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [None]:
# Fit TF-IDF on the cleaned inputs; X is the resulting document-term matrix
corpus = df['CleanInput']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Persist the fitted vectorizer (the matrix X itself is not saved here)
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.joblib")
print("TF-IDF vectorizer model saved as 'tfidf_vectorizer.joblib'")

In [None]:
def get_response(user_input):
    """Return the stored reply whose cleaned input best matches ``user_input``.

    The query is cleaned with ``clean_text``, stripped of ``stop_words``,
    vectorized with the fitted TF-IDF ``vectorizer`` and compared against
    every row of ``X`` by cosine similarity. The ``Output`` value of the
    highest-scoring row of ``df`` is used as the base reply. If the raw query
    contains one of a few known keywords, a canned medicine suggestion for the
    first matching keyword is appended.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        str: The matched reply (or a fallback message when nothing in the
        dataset overlaps with the query), plus at most one suggestion suffix.
    """
    query_tokens = [
        word for word in clean_text(user_input).split()
        if word not in stop_words
    ]
    user_vec = vectorizer.transform([' '.join(query_tokens)])
    sim_scores = cosine_similarity(user_vec, X)  # one row, one column per dataset entry
    best_match = sim_scores.argmax()

    if sim_scores[0, best_match] > 0:
        # Read from `df`, the frame X was built from, so the row position is
        # guaranteed to line up with the similarity matrix.
        base_response = df.iloc[best_match]['Output']
    else:
        # Every score is zero: argmax would silently pick row 0, which is not
        # a match at all. Say so instead of returning an unrelated answer.
        base_response = (
            "Sorry, I could not find a relevant answer. "
            "Please describe your symptoms in more detail."
        )

    # Add medicine suggestion based on keyword (first matching keyword wins)
    keyword_suggestions = (
        ("fever", " Recommended: Paracetamol 500mg every 6-8 hours if needed."),
        ("headache", " Try: Crocin or mild ibuprofen (only after food)."),
        ("cold", " Suggested: Cetrizine or Levocetirizine at night."),
        ("dandruff", " Use: Nizoral 2% shampoo twice weekly, Ketoconazole lotion."),
        ("acne", " Suggested meds: Clindamycin gel (topical), Doxycycline (if prescribed)."),
        ("cough", " Recommended: Benadryl or Ascoril (only under supervision)."),
    )
    lower_input = user_input.lower()
    for keyword, suggestion in keyword_suggestions:
        if keyword in lower_input:
            base_response += suggestion
            break

    return base_response

In [None]:
# Step 4: Chat Loop
# Reads queries until the user types 'exit' (case-insensitive, surrounding
# whitespace ignored) or the input stream ends / the cell is interrupted.
print("Doc-Bot Ready! Type 'exit' to quit.")
while True:
    try:
        query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Leave the loop quietly instead of surfacing a traceback.
        break
    if not query:
        # Nothing to match against; prompt again.
        continue
    if query.lower() == "exit":
        break
    response = get_response(query)
    print("Doc-Bot:", response)

In [None]:
# Step 5: Download
# Show a link for each artifact written by the earlier cells.
from IPython.display import FileLink, display
for artifact_name in ("cleaned_docbot_dataset.csv", "tfidf_vectorizer.joblib"):
    display(FileLink(artifact_name))