<a href="https://www.kaggle.com/code/kasinadhsarma1/medbot?scriptVersionId=240098808" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install transformers datasets accelerate



In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

In [3]:
# data loading
import pandas as pd
df = pd.read_csv('/kaggle/input/ai-medical-chatbot/ai-medical-chatbot.csv')
df.head()

Unnamed: 0,Description,Patient,Doctor
0,Q. What does abutment of the nerve root mean?,"Hi doctor,I am just wondering what is abutting...",Hi. I have gone through your query with dilige...
1,Q. What should I do to reduce my weight gained...,"Hi doctor, I am a 22-year-old female who was d...",Hi. You have really done well with the hypothy...
2,Q. I have started to get lots of acne on my fa...,Hi doctor! I used to have clear skin but since...,Hi there Acne has multifactorial etiology. Onl...
3,Q. Why do I have uncomfortable feeling between...,"Hello doctor,I am having an uncomfortable feel...",Hello. The popping and discomfort what you fel...
4,Q. My symptoms after intercourse threatns me e...,"Hello doctor,Before two years had sex with a c...",Hello. The HIV test uses a finger prick blood ...


In [4]:
# Create conversation pairs using Patient and Doctor columns.
def create_conversation_pair(row):
    # We use only the patient input and doctor's response.
    input_text = "Patient: " + row["Description"].strip()
    target_text = "Doctor: " + row["Doctor"].strip()
    return {"input_text": input_text, "target_text": target_text}

# Apply the function to each row to create a list of training examples.
conversation_pairs = df.apply(create_conversation_pair, axis=1).tolist()
dataset = Dataset.from_list(conversation_pairs)
print("Number of training examples:", len(dataset))

Number of training examples: 256916


In [5]:
# Tokenization: Concatenate the input and target, separated by a newline.
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

def tokenize_function(examples):
    # Each example becomes "Patient: ...\nDoctor: ..."
    texts = [inp + "\n" + tgt for inp, tgt in zip(examples["input_text"], examples["target_text"])]
    tokenized = tokenizer(texts, truncation=True, padding="max_length", max_length=512)
    # For causal language modeling, set the labels equal to input_ids.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


# For example, select 10% of the dataset
small_dataset = dataset.shuffle(seed=42).select(range(int(0.6 * len(dataset))))
tokenized_dataset = small_dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch")
print("Tokenization complete.")

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/154149 [00:00<?, ? examples/s]

Tokenization complete.


In [None]:
model = AutoModelForCausalLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./fine_tuned_medical_chatbot",
    num_train_epochs=1,                # One epoch for a quick run        
    save_total_limit=2,
    per_device_train_batch_size=4,
    evaluation_strategy="no",          # Disable evaluation to save time
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print("Starting training...")
trainer.train()
print("Training complete.")

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Starting training...


Step,Training Loss
50,1.3321
100,1.0243
150,0.9579
200,0.9571
250,0.9883
300,0.8991
350,0.8941
400,0.8281
450,0.8632
500,0.8602


In [None]:
trainer.save_model("./fine_tuned_medical_chatbot")
tokenizer.save_pretrained("./fine_tuned_medical_chatbot")
print("Model and tokenizer saved.")

In [None]:
!zip -r fine_tuned_medical_chatbot.zip fine_tuned_medical_chatbot

In [None]:
!pip install gdow

In [None]:
curl --upload-file "/kaggle/working/model_output.zip" "https://transfer.sh/model_output.zip"

In [None]:
# INFERENCE
from transformers import pipeline
# Inference: Load the fine-tuned model using a text-generation pipeline.
chatbot = pipeline(
    "text-generation",
    model="./fine_tuned_medical_chatbot",
    tokenizer="./fine_tuned_medical_chatbot"
)

# Interactive loop to test the chatbot.
print("Chatbot is ready. Type 'exit' or 'quit' to stop.")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        break
    # Format the input to indicate patient dialogue, and prompt the model for a doctor's reply.
    formatted_input = "Patient: " + user_input + "\nDoctor:"
    response = chatbot(
        formatted_input,
        max_length=150,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    # Extract doctor's reply by taking the text after "Doctor:" if present.
    generated_text = response[0]['generated_text']
    if "Doctor:" in generated_text:
        doctor_reply = generated_text.split("Doctor:")[-1].strip()
    else:
        doctor_reply = generated_text.strip()
    print("Doctor:", doctor_reply)