In [None]:
# Install required libraries
!pip install transformers pandas scikit-learn -q

import pandas as pd
import numpy as np
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
"""## Phase 1: Data Loading and Preprocessing"""

# Upload dataset (Colab file picker). `files.upload()` returns a mapping of
# file name -> content; the first uploaded file is read as the dataset.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; please upload the dataset CSV.")
df = pd.read_csv(next(iter(uploaded)))

# Select relevant columns (all treated as text).
# `.copy()` makes this an independent frame, so the column assignments below
# do not operate on a slice of the frame returned by `read_csv`.
df = df[['Symptoms', 'Diagnosis', 'Age', 'Severity']].copy()

# Rows without a target label cannot be used for supervised training, and
# would otherwise show up as a spurious empty class in the label encoder.
df = df.dropna(subset=['Diagnosis'])

# Normalise every column to trimmed, lower-cased, single-spaced text.
# The missing-value mask is taken BEFORE the string cast: `astype(str)` turns
# NaN into the literal token "nan", after which a later `fillna` can no longer
# detect it and "nan" would leak into the model input.
for col in df.columns:
    missing = df[col].isna()
    cleaned = (
        df[col]
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
    )
    df[col] = cleaned.mask(missing, '')

# Create combined text feature (excluding Diagnosis).
# Empty fields would leave leading/trailing or doubled spaces, so the result
# is collapsed and trimmed once more.
df['Combined_Text'] = (
    (df['Symptoms'] + " " + df['Age'] + " " + df['Severity'])
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Encode Diagnosis labels as integers; `le` is reused later to map
# predictions back to diagnosis names.
le = LabelEncoder()
df['Diagnosis_label'] = le.fit_transform(df['Diagnosis'])

# Split dataset: 80% train / 20% test, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    df['Combined_Text'], df['Diagnosis_label'], test_size=0.2, random_state=42
)

Saving symptoms_vs_diagnosis.csv to symptoms_vs_diagnosis.csv


In [None]:
"""## Phase 2: Model Training - TF-IDF + Logistic Regression"""

# TF-IDF Vectorization.
# The vocabulary is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into the features.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train Logistic Regression
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = model_lr.predict(X_test_tfidf)
print(f"TF-IDF + Logistic Regression Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
# `labels` is passed explicitly so the report always has one row per encoded
# class. Without it, a class that happens to be absent from the test split
# makes the number of rows disagree with `target_names` and raises.
# `zero_division=0` reports 0 (without a warning) for such empty classes.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
    zero_division=0,
))

TF-IDF + Logistic Regression Accuracy: 0.40

Classification Report:
                 precision    recall  f1-score   support

         asthma       0.12      0.13      0.12       198
      back pain       0.14      0.11      0.12       182
           cold       1.00      1.00      1.00       213
       covid-19       0.16      0.10      0.12       200
     depression       1.00      1.00      1.00       188
            flu       0.13      0.17      0.15       205
gastroenteritis       1.00      1.00      1.00       216
   heart attack       0.10      0.09      0.10       189
       migraine       0.13      0.16      0.14       214
      pneumonia       0.16      0.16      0.16       195

       accuracy                           0.40      2000
      macro avg       0.39      0.39      0.39      2000
   weighted avg       0.40      0.40      0.40      2000



In [None]:

"""## Phase 3: Model Training - Fine-tuned BERT"""

# Seed PyTorch before the model is created. The classification head added on
# top of the pretrained encoder is freshly initialised, and the training
# DataLoader shuffles, so without a seed every run gives different numbers.
# 42 matches the `random_state` used for the train/test split.
# NOTE(review): GPU kernels can still introduce small run-to-run differences.
torch.manual_seed(42)

# Initialize BERT: pretrained tokenizer plus a sequence-classification model
# with one output logit per diagnosis class known to the label encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(le.classes_)
)


In [None]:
# Prepare Dataset
def _as_tensor_dataset(encodings, labels):
    """Bundle tokenizer output and integer labels into a TensorDataset.

    Each item of the resulting dataset is the triple
    (input_ids, attention_mask, label), in that order.
    """
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(labels),
    )


# Both splits are tokenized with identical settings: truncate to at most
# 128 tokens and pad each split to its own longest sequence.
_tokenizer_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(X_train.tolist(), **_tokenizer_options)
test_encodings = tokenizer(X_test.tolist(), **_tokenizer_options)

train_dataset = _as_tensor_dataset(train_encodings, y_train.values)
test_dataset = _as_tensor_dataset(test_encodings, y_test.values)


In [None]:
# Create DataLoaders: training batches are reshuffled every epoch, evaluation
# batches keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Optimizer and Training Setup
# Move the model to the GPU when one is available. The trailing `;` keeps the
# notebook from printing the full model architecture as the cell output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);

# Use PyTorch's own AdamW: the `AdamW` class shipped with `transformers` is
# deprecated in favour of `torch.optim.AdamW`.
# NOTE(review): eps / weight_decay are set to the deprecated class's defaults
# (1e-6 / 0.0) to keep training behaviour unchanged — confirm if exact parity
# matters.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)


In [None]:
# Training Loop
# Fine-tune for a fixed number of passes over the training data and report
# the mean batch loss once per epoch (printing every batch floods the output
# and makes the trend hard to read).
NUM_EPOCHS = 3

model.train()  # enable dropout etc. for training
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels).
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # max(..., 1) guards against an empty loader (no division by zero).
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")

In [None]:
# Evaluation
# Count how many test examples the fine-tuned model classifies correctly.
model.eval()
correct, total = 0, 0

with torch.no_grad():  # inference only, no gradients needed
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits
        # Index of the largest logit along the class axis = predicted class.
        predicted = torch.max(logits, dim=1).indices
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"\nBERT Accuracy: {correct/total:.2f}")

In [None]:
"""## Phase 4: Interactive Prediction"""

# Create a text input interface
import ipywidgets as widgets

# Free-text box pre-filled with an example query.
# NOTE(review): the model is trained on "<symptoms> <age> <severity>" joined
# by single spaces; confirm this example matches the dataset's wording.
text_input = widgets.Textarea(
    rows=4,
    value="cough, fever, age 35, severity moderate",
    description='Enter Symptoms/Age/Severity:',
    # The default description width truncates long labels; 'initial' lets the
    # label take the width it needs.
    style={'description_width': 'initial'},
)

# Label that the prediction callback writes its result into.
diagnosis_label = widgets.Label()

def predict_bert(*args):
    """Classify the text in ``text_input`` and show the result in ``diagnosis_label``.

    Intended as a button ``on_click`` callback; any positional arguments
    (the clicked button) are ignored.

    The text is lower-cased, trimmed and whitespace-collapsed, the same
    normalisation applied to the training text. Blank input is not sent to
    the model; a hint is shown instead.
    """
    # Same cleaning as the training data: lower-case, trim, single spaces.
    text = ' '.join(text_input.value.lower().split())
    if not text:
        diagnosis_label.value = "Please enter symptoms, age and severity first."
        return

    # Make sure dropout is disabled even if the evaluation cell was skipped
    # and the model is still in training mode.
    model.eval()

    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Single example in the batch: pick the highest-scoring class index.
    prediction = outputs.logits.argmax(dim=-1).item()
    diagnosis_label.value = f"Predicted Diagnosis: {le.inverse_transform([prediction])[0]}"

# Button that runs the BERT prediction callback when clicked.
button = widgets.Button(description='Predict Diagnosis')
button.on_click(predict_bert)

# Render the input box, the button and the result label, in that order.
display(text_input, button, diagnosis_label)

In [None]:
"""## Phase 5: Ethical Considerations"""

# Usage disclaimer shown at the end of the notebook: the tool's output is a
# preliminary suggestion only and is no substitute for a medical professional.
DISCLAIMER = """
Disclaimer:
This Healthcare Diagnostic Assistant provides preliminary suggestions based on text patterns and should not replace professional medical diagnosis.
Always consult with a licensed healthcare professional for accurate diagnosis and treatment.
The system's suggestions are non-binding and intended for informational purposes only.
"""
print(DISCLAIMER)


Disclaimer:
This Healthcare Diagnostic Assistant provides preliminary suggestions based on text patterns and should not replace professional medical diagnosis.
Always consult with a licensed healthcare professional for accurate diagnosis and treatment.
The system's suggestions are non-binding and intended for informational purposes only.

