<a href="https://colab.research.google.com/github/suyash-rgb/Sunstone-Hackathon_1.0-Rural-Healthcare-AI-Bot/blob/main/FirstAidBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive so the project files under /content/drive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install -q transformers datasets tokenizers accelerate

In [3]:
import json
from transformers import AutoTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
import torch

In [4]:
# Input SQuAD-style files and the destination of the merged file
file1 = '/content/drive/MyDrive/project/firstaid_squad.json'
file2 = '/content/drive/MyDrive/project/output_squad.json'
file3 = '/content/drive/MyDrive/project/output_squad1.json'
output_file = '/content/drive/MyDrive/project/combined_squad.json'


def _read_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path, 'r') as f:
        return json.load(f)


# Read the three inputs in order
data1, data2, data3 = (_read_json(path) for path in (file1, file2, file3))

# Concatenate the 'data' lists, keeping file order
combined_data = [*data1['data'], *data2['data'], *data3['data']]

# Wrap the merged entries in a single document with its own version tag
combined_json = dict(version='combined_v2.0', data=combined_data)

# Write the merged document back to Drive
with open(output_file, 'w') as f:
    json.dump(combined_json, f, indent=4)

print(f"Combined JSON saved to {output_file}. Total entries: {len(combined_data)}")

Combined JSON saved to /content/drive/MyDrive/project/combined_squad.json. Total entries: 46


In [5]:
corpus_path = '/content/drive/MyDrive/project/final_combined_corpus.txt'
vocab_size = 70000
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]  # BERT tokens

# WordPiece model with whitespace pre-tokenization and the matching decoder
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.decoder = decoders.WordPiece()

# Trainer: learn up to `vocab_size` pieces, ignoring pieces seen fewer than twice
trainer = trainers.WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
    min_frequency=2
)

# Train on the raw text corpus
tokenizer.train([corpus_path], trainer=trainer)

# Wrap every encoding in BERT's template. Without a post-processor the
# tokenizer emits no [CLS]/[SEP] at all, so single sentences and
# question/context pairs would not be laid out the way BERT models expect.
# The ids are looked up after training, once the vocabulary exists.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

# Save the serialized tokenizer (a single JSON document) to Drive
tokenizer_path = '/content/drive/MyDrive/project/my_tokenizer'
tokenizer.save(tokenizer_path)

print("Tokenizer trained and saved!")

Tokenizer trained and saved!


In [6]:
from transformers import PreTrainedTokenizerFast

# Role -> literal token mapping for the BERT special tokens
special_token_kwargs = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}

# Load the trained tokenizer file into the transformers fast-tokenizer wrapper
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path, **special_token_kwargs)

# Persist in the directory layout that AutoTokenizer.from_pretrained reads
tokenizer.save_pretrained('/content/drive/MyDrive/project/my_tokenizer_hf')

('/content/drive/MyDrive/project/my_tokenizer_hf/tokenizer_config.json',
 '/content/drive/MyDrive/project/my_tokenizer_hf/special_tokens_map.json',
 '/content/drive/MyDrive/project/my_tokenizer_hf/tokenizer.json')

In [7]:
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/project/my_tokenizer_hf')

# Load the corpus file with the 'text' builder and take its train split
corpus_splits = load_dataset('text', data_files={'train': corpus_path})
dataset = corpus_splits['train']


def tokenize_function(examples):
    """Tokenize a batch of corpus rows.

    Each entry of ``examples['text']`` is truncated or padded to exactly
    128 tokens; the tokenizer's batch encoding is returned unchanged.
    """
    encode_kwargs = dict(truncation=True, max_length=128, padding='max_length')
    return tokenizer(examples['text'], **encode_kwargs)


# Replace the raw 'text' column with the tokenizer outputs
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/153499 [00:00<?, ? examples/s]

In [8]:
from transformers import BertConfig, BertForMaskedLM

# Small BERT configuration for masked-language-model pretraining.
config = BertConfig(
    # len(tokenizer) counts every token id, including any added tokens, so the
    # embedding matrix always covers all ids the tokenizer can emit.
    vocab_size=len(tokenizer),
    # Pass the pad id explicitly instead of relying on [PAD] happening to be id 0.
    pad_token_id=tokenizer.pad_token_id,
    max_position_embeddings=512,
    hidden_size=256,  # Smaller for Colab
    num_hidden_layers=6,
    num_attention_heads=8,
    type_vocab_size=2
)
# Randomly initialised model; its weights are trained in the pretraining cell
model = BertForMaskedLM(config)

Import `DataCollatorForLanguageModeling`, which was left out of the imports cell above.

In [9]:
from transformers import DataCollatorForLanguageModeling

Pretraining (masked language modeling)


In [None]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Collator that applies random masking to each batch for the MLM objective
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Standard for BERT
)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/project/pretrained_bert',  # Same Drive 'project' folder as the other artifacts
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Adjust if OOM
    save_steps=1000,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; fall back to full precision on a
    # runtime without one instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    report_to="none"  # Disable W&B logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset
)

# Train, then store model and tokenizer together so the directory can be
# loaded later with from_pretrained
trainer.train()
trainer.save_model('/content/drive/MyDrive/project/pretrained_bert')
tokenizer.save_pretrained('/content/drive/MyDrive/project/pretrained_bert')



Step,Training Loss


Prepare the QA dataset

In [None]:
combined_squad_path = '/content/drive/MyDrive/project/combined_squad.json'


def _flatten_squad(articles):
    """Flatten nested SQuAD articles into column lists.

    Assumes the standard SQuAD layout, article -> 'paragraphs' -> 'qas'
    (TODO confirm against the source files). Returns a dict with the columns
    'question', 'context' and 'answers', where each 'answers' value is
    ``{'text': [...], 'answer_start': [...]}``; both lists are empty for an
    unanswerable question.
    """
    questions, contexts, answers = [], [], []
    for article in articles:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                qa_answers = qa.get('answers', [])
                questions.append(qa['question'])
                contexts.append(paragraph['context'])
                answers.append({
                    'text': [a['text'] for a in qa_answers],
                    'answer_start': [a['answer_start'] for a in qa_answers],
                })
    return {'question': questions, 'context': contexts, 'answers': answers}


# Read the combined file directly. Passing a local file to the hub
# 'squad'/'squad_v2' loaders does not parse it, and the nested layout has no
# top-level question/context/answers columns until it is flattened.
with open(combined_squad_path, 'r') as f:
    squad_articles = json.load(f)['data']

squad_dataset = Dataset.from_dict(_flatten_squad(squad_articles))

def find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map an answer's character span to token indices within one feature.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for the feature.
        sequence_ids: per-token sequence id; ``1`` marks context tokens.
        start_char: answer start offset in the context string.
        end_char: answer end offset (exclusive) in the context string.

    Returns:
        ``(start_token, end_token)``. ``(0, 0)`` is returned when the feature
        has no context tokens or the answer is not fully inside this
        feature's context window.
    """
    context_token_ids = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_token_ids:
        return 0, 0
    context_start, context_end = context_token_ids[0], context_token_ids[-1]

    # Answer lies (partly) outside this overflow window: label as "no answer".
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Only context tokens are scanned. Question-token offsets are relative to
    # the question string and must never be compared with context positions.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_squad(examples):
    """Tokenize question/context pairs and attach answer token positions.

    Long contexts are split into overlapping features (stride 128, 384 tokens
    each), so one example may yield several features. Each feature gets
    ``start_positions`` / ``end_positions``; both are 0 when the question has
    no answer or the answer falls outside that feature.
    NOTE(review): index 0 is meant to be the [CLS] position — confirm the
    tokenizer actually emits [CLS] first.
    """
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=384,  # SQuAD standard
        truncation="only_second",
        stride=128,  # For long contexts
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Several features can originate from the same example
        answer = answers[sample_map[i]]
        if len(answer['text']) == 0:  # Handle unanswerable (SQuAD 2.0)
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Only the first listed answer is used for training labels
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        start_token, end_token = find_answer_token_span(
            offsets, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Convert every QA example into model-ready features; all original columns
# are dropped so only the outputs of preprocess_squad remain.
tokenized_squad = squad_dataset.map(
    preprocess_squad,
    batched=True,
    remove_columns=squad_dataset.column_names,
)

Fine-tuning on the QA dataset


In [None]:
# Start from the pretrained encoder saved by the pretraining cell and load it
# into the question-answering model class
model = BertForQuestionAnswering.from_pretrained('/content/drive/MyDrive/project/pretrained_bert')

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/project/finetuned_qa_model',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; use full precision otherwise
    fp16=torch.cuda.is_available(),
    report_to="none"  # Disable W&B logging, matching the pretraining run
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad
)

# Train, then store model and tokenizer side by side so the serving cell can
# load both from one directory
trainer.train()
trainer.save_model('/content/drive/MyDrive/project/finetuned_qa_model')
tokenizer.save_pretrained('/content/drive/MyDrive/project/finetuned_qa_model')

In [None]:
!pip install -q fastapi uvicorn pyngrok

In [None]:
import threading

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline
import uvicorn
import nest_asyncio

# Apply nest_asyncio to allow running FastAPI in Colab
nest_asyncio.apply()

# Load the fine-tuned model and its tokenizer from the same directory
QA_MODEL_DIR = '/content/drive/MyDrive/project/finetuned_qa_model'
qa_pipeline = pipeline('question-answering', model=QA_MODEL_DIR, tokenizer=QA_MODEL_DIR)

# Initialize FastAPI app
app = FastAPI(title="Extractive QA API")


class QARequest(BaseModel):
    """Request body for /qa: a question and the passage to answer it from."""
    question: str
    context: str


@app.post("/qa")
async def answer_question(request: QARequest):
    """Return the extracted answer span for a question/context pair.

    Responds with the answer text, its score, and its start/end character
    offsets in the context. Blank inputs are rejected with HTTP 422 rather
    than being passed to the pipeline, where they would surface as an
    unhandled server error.
    """
    if not request.question.strip() or not request.context.strip():
        raise HTTPException(
            status_code=422,
            detail="Both 'question' and 'context' must be non-empty."
        )

    # Kept as a blocking call inside the async handler so inference requests
    # are processed one at a time.
    result = qa_pipeline(question=request.question, context=request.context)
    return {
        "answer": result['answer'],
        # Cast defensively so the values are plain JSON-serializable numbers
        "score": float(result['score']),
        "start": int(result['start']),
        "end": int(result['end'])
    }


def run_server():
    """Run the uvicorn server for `app` on port 8000 (blocks until it stops)."""
    uvicorn.run(app, host="0.0.0.0", port=8000)


# Start the server in a background thread. daemon=True keeps this thread from
# blocking interpreter shutdown.
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Read the ngrok authtoken from the environment, or prompt for it, so the
# secret is never written into the notebook source or its saved output.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)

# Start ngrok tunnel to the local API port
public_url = ngrok.connect(8000)
# connect() may return a tunnel object rather than a plain string; print the
# URL itself so it can be copied directly.
print(f"Public URL: {getattr(public_url, 'public_url', public_url)}")

Testing the API

In [None]:
import requests

url = "https://YOUR_NGROK_URL.ngrok.io/qa"  # Replace with your ngrok URL
payload = {
    "question": "What to do for heavy bleeding?",
    "context": "If a person is bleeding heavily, apply pressure with a clean cloth."
}
# A timeout keeps the cell from hanging forever if the tunnel or server is down
response = requests.post(url, json=payload, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
print(response.json())