In [2]:
!pip install -r requirements.txt -q

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import kagglehub
import shutil
import os
# Working directory of the notebook; the dataset is materialised below it.
download_path = os.getcwd()

# kagglehub downloads into its own cache and returns that cache directory.
kaggle_cache_path = kagglehub.dataset_download("finalepoch/medical-ner")

# Copy (rather than move) the files into ./dataset:
#   * moving would take the directory out of kagglehub's cache, and
#   * on a re-run the destination already exists, so a move would nest the
#     download inside ./dataset or fail outright.
# dirs_exist_ok=True makes this cell safe under "Restart & Run All".
destination_path = os.path.join(download_path, "dataset")
shutil.copytree(kaggle_cache_path, destination_path, dirs_exist_ok=True)

print("Dataset copied to:", destination_path)

# Annotated corpus shipped with the dataset.
dataset_path = os.path.join(destination_path, "Corona2.json")
data = pd.read_json(dataset_path)
data.head()

# Peek at the structure of the first annotated example.
first_example = data['examples'][0]
list(first_example.keys())

first_example['content']

first_example['annotations'][0]

# Convert every example into {'text': ..., 'entities': [(start, end, LABEL), ...]};
# labels are upper-cased from the annotation's 'tag_name'.
training_data = []
for example in data['examples']:
    entity_spans = []
    for annotation in example['annotations']:
        entity_spans.append(
            (annotation['start'], annotation['end'], annotation['tag_name'].upper())
        )
    training_data.append({'text': example['content'], 'entities': entity_spans})

# Inspect the converted first record and a character slice of its text.
first_record = training_data[0]
first_record['entities']

first_record['text'][563:571]

nlp = spacy.blank("en") 
doc_bin = DocBin()

from spacy.util import filter_spans

for training_example in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.set_ents(filtered_ents)
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")

! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

! python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

# Sample clinical sentences used to eyeball the trained model's predictions.
sample_text = '''
The patient was prescribed Aspirin for their heart condition.
The doctor recommended Ibuprofen to alleviate the patient's headache.
The patient is suffering from diabetes, and they need to take Metformin regularly.
After the surgery, the patient experienced some post-operative complications, including infection.
The patient is currently on a regimen of Lisinopril to manage their high blood pressure.
The antibiotic course for treating the bacterial infection should be completed as prescribed.
The patient's insulin dosage needs to be adjusted to better control their blood sugar levels.
The physician suspects that the patient may have pneumonia and has ordered a chest X-ray.
The patient's cholesterol levels are high, and they have been advised to take Atorvastatin.
The allergy to penicillin was noted in the patient's medical history.
'''

# Load the pipeline saved under "model-best" and run it over the sample text.
nlp_trained_model = spacy.load("model-best")
doc = nlp_trained_model(sample_text)

# Highlight the predicted entities inline in the notebook output.
spacy.displacy.render(doc, style="ent", jupyter=True)



In [None]:
import spacy
import fitz  # PyMuPDF for PDF extraction
import docx  # python-docx for DOCX extraction
import pytesseract  # OCR for images
import gradio as gr
from PIL import Image
import os

# Load the trained spaCy model once at module level; extract_entities() below
# reuses this pipeline for every request.
# NOTE(review): "model-best" is presumably the directory written by the
# `spacy train --output ./` step and is resolved relative to the working
# directory — confirm before deploying from another location.
nlp_trained_model = spacy.load("model-best")  # Update path if needed

# ============================================
# Extract Text from PDFs, DOCX, Images, and Text
# ============================================

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The text of all pages, separated by newlines, with leading and
        trailing whitespace removed. Empty string for a PDF with no text.
    """
    # Open as a context manager so the document (and its underlying file
    # handle) is released even if extracting a page raises.
    with fitz.open(pdf_path) as doc:
        # Collect per-page text and join once instead of growing a string.
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(page_texts).strip()

def extract_text_from_docx(docx_path):
    """Read a DOCX file and return its paragraph text.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        The text of each paragraph joined with newlines, with surrounding
        whitespace stripped.
    """
    document = docx.Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts).strip()

def extract_text_from_image(image_path):
    """Extract text from an image using Tesseract OCR.

    Args:
        image_path: Path to the image file.

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    # Use the image as a context manager so its file handle is closed as
    # soon as OCR has finished, instead of lingering until garbage collection.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)
    return text.strip()

def extract_text_from_file(file_path):
    """Detect the file type from its extension and extract text accordingly.

    Args:
        file_path: Path to a PDF, DOCX, PNG/JPG/JPEG image, or UTF-8 TXT file.

    Returns:
        The extracted text, or the literal string "Unsupported file format."
        when the extension is missing or not one of the supported types.
    """
    # os.path.splitext inspects only the final path component, so a file with
    # no extension (e.g. one literally named "pdf") or a dotted directory name
    # is not mistaken for a supported type, which `split(".")[-1]` would do.
    ext = os.path.splitext(file_path)[1].lower().lstrip(".")

    if ext == "pdf":
        return extract_text_from_pdf(file_path)
    elif ext == "docx":
        return extract_text_from_docx(file_path)
    elif ext in ["png", "jpg", "jpeg"]:
        return extract_text_from_image(file_path)
    elif ext == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    else:
        return "Unsupported file format."

# ============================================
# Named Entity Recognition (NER) Extraction
# ============================================

def extract_entities(text):
    """Run the trained spaCy pipeline on `text` and collect its entities.

    Args:
        text: The raw text to analyse.

    Returns:
        A list with one dict per detected entity, each holding the entity's
        surface string under "text" and its label under "label".
    """
    parsed = nlp_trained_model(text)
    entities = []
    for ent in parsed.ents:
        entities.append({"text": ent.text, "label": ent.label_})
    return entities

# ============================================
# Gradio Interface to Upload Files or Text
# ============================================

def process_input(file=None, text=None):
    """Process an uploaded file or pasted text and extract named entities.

    A file takes precedence over pasted text when both are supplied.

    Args:
        file: Upload from the Gradio File component. Either a path (str or
            os.PathLike) or an object exposing the path as `.name`.
        text: Text pasted directly into the textbox.

    Returns:
        A tuple `(extracted_text, entities)`. When nothing was provided the
        tuple is `("No input provided", [])`; for an unsupported file type it
        is `("Unsupported file format.", [])`.
    """
    if file:
        # Depending on the Gradio version / `type` setting, the upload arrives
        # as a plain path or as a temp-file wrapper with `.name`; accept both.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name
        extracted_text = extract_text_from_file(file_path)
        # Do not run NER over the error sentinel: it is not document text and
        # could yield spurious entities.
        if extracted_text == "Unsupported file format.":
            return extracted_text, []
    elif text:
        extracted_text = text
    else:
        return "No input provided", []

    entities = extract_entities(extracted_text)
    return extracted_text, entities

# Define Gradio Interface
# Two inputs (file upload or pasted text), two outputs (the text that was
# analysed and the entities found in it). Both inputs may be left empty;
# process_input() handles the "nothing provided" case itself.
# The former `optional=True` keyword was dropped: it was deprecated with no
# effect and is rejected by newer Gradio component constructors.
interface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.File(label="Upload a File (PDF, DOCX, Image, or TXT)"),
        gr.Textbox(label="Or Paste Text Directly")
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.JSON(label="Named Entities")
    ],
    title="📑 NER Model: Extract Entities from Text, PDF, DOCX, and Images",
    description="Upload a document (TXT, PDF, DOCX, or Image) or enter text to extract named entities using the trained NER model.",
)

# Run Gradio App
interface.launch()

In [None]:
# NOTE(review): bare string expression — it is not assigned to anything, so it
# only echoes this machine-specific absolute path as the cell's output.
# Looks like a leftover; consider deleting this cell.
'/Users/yuvaraj/Desktop/projects/spacy model'

In [None]:
!pip install huggingface_hub

In [None]:
import os

from huggingface_hub import HfApi, login

# Never paste the Hugging Face token into the notebook. Read it from the
# HF_TOKEN environment variable; when that is unset or empty, pass None so
# login() falls back to its interactive prompt instead of failing on "".
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from huggingface_hub import HfApi

api = HfApi()

repo_id = "yuvarajareddy001/medical_ner_model"  # Change this to your HF repo name

# Directory holding the trained spaCy pipeline, relative to the notebook's
# working directory (the same "model-best" that spacy.load() reads above).
# The previous value wrapped a machine-specific absolute path in an extra pair
# of single quotes inside the string, so the folder could never be found.
model_dir = "model-best"

api.upload_folder(
    folder_path=model_dir,  # Path to your trained spaCy model
    repo_id=repo_id,
    repo_type="model"
)