In [2]:
import pandas as pd
import numpy as np
import re
import json

In [3]:
with open('ner_tags.json') as f:
  ner_tags = json.load(f)

In [4]:
ner_tags_lowercase = {}
for key, value in ner_tags.items():
    word_list = []
    for word in value:
        word_list.append(word.lower())
    ner_tags_lowercase[key] = word_list

In [5]:
ner_tags_lowercase

{'MAINDIS': ['asthma'],
 'HELPER': ['masks',
  'spray',
  'rescue',
  'antibody',
  'peakflow',
  'breezhaler',
  'bracelet',
  'gp',
  'hepa',
  'medicine',
  'pulseoximeter',
  'oxymeter',
  'inhalers',
  'mct',
  'pulsar',
  'pulmonologist',
  'monotherapy',
  'seretide',
  'pulmonology',
  'therapy',
  'yoga',
  'merv',
  'drags',
  'hospitalised',
  'ambulanced',
  'infusion',
  'nebulizer'],
 'SYMPTOMS': ['puke',
  'suffocate',
  'gag',
  'flare',
  'choke',
  'inflammation',
  'lyrngitis',
  'concussions',
  'headaches',
  'diabetes',
  'lungs',
  'hips',
  'hairs',
  'teeths',
  'ptsd',
  'rashes',
  'cuts',
  'sniffles',
  'sinuses',
  'eyes',
  'throats',
  'thighs',
  'allergies',
  'seizures',
  'coughs',
  'tears',
  'congestion',
  'costochondritis'],
 'TRIGGER': ['allergins',
  'pets.',
  'dusts',
  'flu',
  'hay-fever',
  'mold',
  'sprints',
  'snow',
  'walks',
  'gastroesophageal-reflux-disease',
  'propanol',
  'deodorant',
  'glutamate',
  'exercise-induced',
  'sh

In [6]:
data = pd.read_csv('Asthma_full.csv')

In [7]:
def get_data(data):
    questions = data['Question'].values
    context = data['Context'].values
    all_texts_combined = np.concatenate([questions, context])
    all_text_df = pd.DataFrame(all_texts_combined)
    all_text_df.dropna(inplace=True)
    all_text_df.drop_duplicates(inplace=True)
    all_text_df.columns = ['texts']
    all_texts_combined = all_text_df['texts'].tolist()
    return all_texts_combined


In [8]:
subset_data = data.head()

In [9]:
text_list = get_data(subset_data)

In [10]:
def word_index(word, sentence):
    for word in re.finditer(word, sentence):
        return [word.start(), word.end()]

def get_entity(sentence):
    all_ents = []
    for key, value in ner_tags_lowercase.items():
        for word in value:
            if word in sentence:
                start, end = word_index(word, sentence)
                all_ents.append([start, end, key])
    return all_ents

def create_ner_formatted_file(sentence):
    # sentence = sentence
    entities_list = get_entity(sentence.lower())
    ner_format_json["annotations"].append([sentence,{"entities":entities_list} ])
    return ner_format_json

In [12]:
ner_format_json = {}
ner_format_json["classes"] = ner_tags_lowercase.keys()
ner_format_json["annotations"] = []

for i in text_list:
    ner_format_json = create_ner_formatted_file(i)

In [13]:
ner_format_json

{'classes': dict_keys(['MAINDIS', 'HELPER', 'SYMPTOMS', 'TRIGGER', 'VULNERABLE', 'MEDICATION', 'ADVICE', 'DIAGNOSE', 'CAUSE']),
 'annotations': [['What are common asthma treatments?',
   {'entities': [[16, 22, 'MAINDIS'], [23, 33, 'ADVICE']]}],
  ['Are inhalers useful to asthma patients?',
   {'entities': [[23, 29, 'MAINDIS'], [4, 12, 'HELPER']]}],
  ['When should I go to see a doctor?', {'entities': []}],
  ['When may I need preventer inhaler?', {'entities': []}],
  ['Can you tell me something about additional treatments?',
   {'entities': [[43, 53, 'ADVICE']]}],
  ['The most common asthma treatments are inhalers. These are considered the best asthma treatments for most people. This is because inhalers help get the medicine to the airways where it’s needed. Almost everyone with asthma has a preventer inhaler and a reliever inhaler.',
   {'entities': [[16, 22, 'MAINDIS'],
     [151, 159, 'HELPER'],
     [38, 46, 'HELPER'],
     [23, 33, 'ADVICE']]}],
  ['Almost everyone with asthma has

In [16]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [17]:
list(ner_tags_lowercase.keys())

['MAINDIS',
 'HELPER',
 'SYMPTOMS',
 'TRIGGER',
 'VULNERABLE',
 'MEDICATION',
 'ADVICE',
 'DIAGNOSE',
 'CAUSE']

In [21]:
spacy.require_gpu()
nlp = spacy.blank("en") # load a new spacy model
db = DocBin()

In [25]:
for text, annot in tqdm(ner_format_json['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

100%|██████████| 10/10 [00:00<00:00, 669.98it/s]

Skipping entity





In [None]:
# skeleton code for the ner part
# to be able to route our question for in-domain or out-domain models, be able to guide it to the correct


# huggingface pipeline stuff to integrate their models for QA, BIOBERT model integration or any model
# skeleton code for the ner part
# GPT3 implementation part,

In [24]:
doc.ents

(asthma, bronchodilators, inflammation, GP, inhalers, therapy)

In [None]:
# https://github.com/amrrs/custom-ner-with-spacy3/blob/main/Custom_NER_with_Spacy3.ipynb