In [3]:
class Corp:
    """A piece of text together with the entity annotations attached to it.

    Attributes:
        anotations: list of annotation objects. ``description()`` calls
            ``description()`` on each of them, so every element must
            provide that method.
        text: the document text (a string).

    The accessors are named ``get_*`` on purpose: a method that has the
    same name as an attribute assigned in ``__init__`` is shadowed by that
    attribute on every instance, so ``corp.text()`` would try to call the
    string itself and raise ``TypeError``. Plain attribute access
    (``corp.text``, ``corp.anotations``) keeps working as before.
    """

    def __init__(self, anotations, text):
        self.anotations = anotations
        self.text = text

    def get_anotations(self):
        """Return the list of annotations."""
        return self.anotations

    def set_anotations(self, anotations):
        """Replace the list of annotations."""
        self.anotations = anotations

    def get_text(self):
        """Return the document text."""
        return self.text

    def set_text(self, text):
        """Replace the document text."""
        self.text = text

    def description(self):
        """Return the text followed by one line per annotation description."""
        parts = [self.text]
        parts.extend(a.description() for a in self.anotations)
        return "\n".join(parts)
    
class Anotation:
    """A single labelled entity: an entity label plus the annotated keyword.

    Attributes:
        label: the entity label.
        keyword: the annotated surface text.

    The accessors are named ``get_*`` on purpose: a method that has the
    same name as an attribute assigned in ``__init__`` is shadowed by that
    attribute on every instance, so ``a.label()`` would try to call the
    stored value and raise ``TypeError``. Plain attribute access
    (``a.label``, ``a.keyword``) keeps working as before.
    """

    def __init__(self, label, keyword):
        self.label = label
        self.keyword = keyword

    def get_label(self):
        """Return the entity label."""
        return self.label

    def set_label(self, label):
        """Replace the entity label."""
        self.label = label

    def get_keyword(self):
        """Return the annotated keyword."""
        return self.keyword

    def set_keyword(self, keyword):
        """Replace the annotated keyword."""
        self.keyword = keyword

    def description(self):
        """Return ``"<label>: <keyword>"``."""
        return f"{self.label}: {self.keyword}"
    
from os import listdir
from os.path import isfile, join
        
def processDir(corps_dir):
    """Build one corpus object per regular file in ``corps_dir``.

    Every directory entry that is a regular file is handed to
    ``processFile``; results that are ``None`` are dropped. Progress is
    printed every 100 directory entries.

    Args:
        corps_dir: path of the directory to scan (not recursive).

    Returns:
        List of the non-``None`` values returned by ``processFile``, in
        sorted file-name order.
    """
    corps = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting list (and any seeded split done on it later) reproducible.
    files = sorted(listdir(corps_dir))
    count = len(files)
    for i, f in enumerate(files):
        if i % 100 == 0:
            print(f"{i+1}/{count}")
        p = join(corps_dir, f)
        if isfile(p):
            c = processFile(p)
            if c is not None:
                corps.append(c)
    return corps

def processFile(textfile):
    """Load one ``.txt`` document and its sibling ``.ann`` annotation file.

    Args:
        textfile: path to a file. Anything not ending in ``.txt`` is ignored.

    Returns:
        A ``Corp`` whose ``anotations`` holds every non-``None`` result of
        ``processEntity`` over the lines of the ``.ann`` file and whose
        ``text`` is the content of ``textfile`` with newlines removed and
        ``__number__`` placeholders filled in; ``None`` when ``textfile``
        does not end in ``.txt``.

    Raises:
        OSError: if the ``.ann`` file next to ``textfile`` cannot be opened.
    """
    if not textfile.endswith(".txt"):
        return None

    # Strip only the trailing extension. Splitting on ".txt" would cut the
    # path at the first occurrence, e.g. inside a directory name.
    annotationfile = textfile[:-len(".txt")] + ".ann"
    corp = Corp([], '')
    with open(annotationfile, "r") as lines:
        for line in lines:
            a = processEntity(line)
            if a is not None:
                corp.anotations.append(a)
    if isfile(textfile):
        with open(textfile, "r") as text_handle:
            txt = text_handle.read()
        # NOTE(review): the count argument limits the removal to the first
        # 100 newlines - kept as in the original; confirm whether all
        # newlines were meant to be removed.
        # A single random number (1..101) is drawn per file and substituted
        # for every "__number__" placeholder in that file.
        corp.text = txt.replace("\n", "", 100).replace("__number__", str(random.randint(1, 101)))
    return corp
            

import random
      
def processEntity(line):
    """Parse one annotation line into an ``Anotation``.

    Only lines that start with ``"T"`` and contain at least three
    tab-separated fields are accepted. The label is the first
    space-separated word of the second field; the keyword is the third
    field with newline characters removed.

    Returns:
        The populated ``Anotation``, or ``None`` for any other line.
    """
    if not line.startswith("T"):
        return None

    fields = line.split("\t")
    if len(fields) <= 2:
        return None

    entity = Anotation('', '')
    entity.label = fields[1].split(" ")[0]
    entity.keyword = fields[2].replace("\n", "", 100)
    return entity


# Convert the TwiMed tweets file into one text file per tweet

In [4]:
def twitter(tweet_file, output_dir):
    """Split a tab-separated tweets file into one ``<id>.txt`` file per tweet.

    Each input line is split on tabs. The first field is used as the tweet
    id and the remaining fields, concatenated without a separator, are
    written to ``<output_dir>/<id>.txt``. Lines without a tab are skipped.

    Args:
        tweet_file: path of the tab-separated input file.
        output_dir: existing directory that receives the per-tweet files.
    """
    # Context managers make sure the input and every output file are
    # flushed and closed, even if a write fails part-way through.
    with open(tweet_file, "r") as lines:
        for line in lines:
            values = line.split("\t")
            if len(values) > 1:
                tweet_id = values[0]
                tweet_text = ''.join(values[1:])
                with open(join(output_dir, f"{tweet_id}.txt"), "w") as out:
                    out.write(tweet_text)
# Input: the tab-separated tweets file; output: the folder that receives
# one <id>.txt file per tweet.
tweet_file = "./data/TwiMed/tweets_ID.txt"
output_dir = "./data/TwiMed/gold_conflated/twitter"
twitter(tweet_file=tweet_file, output_dir=output_dir)



In [5]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import os
from jsonCorps2conll03 import mkdir

def buildCorp(rowIndex, count, corp, pipeName, nlp, out):
    """Tag one document with its own annotations and append it to ``out``.

    A fresh ``EntityRuler`` is built from the document's annotations (one
    case-insensitive token pattern per keyword) and swapped into the
    pipeline under ``pipeName``. The document text is then run through
    ``nlp`` and written as a ``-DOCSTART-`` header followed by one
    ``token tag - ner`` line per token, with a blank line after every
    sentence.

    Args:
        rowIndex: zero-based index of this document, used for progress output.
        count: total number of documents, used for progress output.
        corp: object with ``anotations`` (items having ``label`` and
            ``keyword``) and ``text``.
        pipeName: name of the pipeline component to replace with the new ruler.
        nlp: the spaCy pipeline; it must already contain ``pipeName``.
        out: writable text file object.
    """
    # use spacy entity ruler to generate the bootstrap corps
    ruler = EntityRuler(nlp)
    patterns = []
    for a in corp.anotations:
        # split() without arguments ignores repeated/leading/trailing
        # whitespace. Splitting on a single space produced empty-string
        # tokens, which made the whole pattern unmatchable.
        word_list = [{"lower": w.lower()} for w in a.keyword.split()]
        # Skip blank keywords: a pattern needs at least one token.
        if word_list:
            patterns.append({"label": a.label, "pattern": word_list})
    ruler.add_patterns(patterns)

    nlp.replace_pipe(pipeName, ruler)
    if rowIndex % 100 == 0:
        print(f"{rowIndex+1}/{count}")
    doc = nlp(corp.text)
    out.write("-DOCSTART- -X- - O\n\n")
    for sent in doc.sents:
        for token in sent:
            # The chunk column is written as "-" since a lot of libraries
            # seem to ignore the chunk tag.
            # token.tag_ is used for every token so the POS column draws
            # from a single tag set (it used to mix tag_ and pos_).
            if token.ent_type_:
                out.write(f"{token.orth_} {token.tag_} - {token.ent_iob_}-{token.ent_type_}\n")
            else:
                out.write(f"{token.orth_} {token.tag_} - O\n")
        out.write("\n")
            
def build_conll_03(corps, outfile):
    """Write every document in ``corps`` to ``outfile`` via ``buildCorp``.

    Loads the spaCy ``'en'`` model, registers a placeholder ``EntityRuler``
    under the name ``'custom'`` (``buildCorp`` replaces it per document)
    and calls ``buildCorp`` once per document.

    Args:
        corps: sequence of documents to convert.
        outfile: path of the file to (over)write.

    Returns:
        The number of documents in ``corps``.
    """
    print(f"writing {outfile} ...")
    ruler_name = 'custom'
    nlp = spacy.load('en')
    ruler = EntityRuler(nlp)
    nlp.add_pipe(ruler, name=ruler_name)
    count = len(corps)
    # The context manager closes (and flushes) the output even if
    # converting one of the documents raises.
    with open(outfile, "w") as out:
        for i, corp in enumerate(corps):
            buildCorp(i, count, corp, ruler_name, nlp, out)
    return count

def convertTwiMedConll_03(corps, basedir, corpsname):
    """Write ``corps`` to ``<basedir>/conll_03/<corpsname>``.

    Calls ``mkdir`` on ``basedir`` and on its ``conll_03`` sub-folder,
    prints the target path, delegates the writing to ``build_conll_03``
    and returns the target path.
    """
    conll_dir = basedir + "/conll_03"
    mkdir(basedir)
    mkdir(conll_dir)

    target = conll_dir + '/' + corpsname
    print(target)
    build_conll_03(corps, target)
    return target


In [7]:
from sklearn.model_selection import train_test_split

# Load both halves of the corpus; tweets come first in the combined list.
pmc_corps = processDir("./data/TwiMed/gold_conflated/pubmed")
twitter_corps = processDir("./data/TwiMed/gold_conflated/twitter")
corps = twitter_corps + pmc_corps

# 60 % train / 40 % held out, then the held-out part is halved into
# testa and testb. The same list is passed twice, so the extra outputs
# (train_y, test_y, x, y) are just second copies of the splits.
train, test, train_y, test_y = train_test_split(
    corps, corps, test_size=0.4, random_state=42
)
testa, testb, x, y = train_test_split(
    test, test, test_size=0.5, random_state=42
)

1/2001
101/2001
201/2001
301/2001
401/2001
501/2001
601/2001
701/2001
801/2001
901/2001
1001/2001
1101/2001
1201/2001
1301/2001
1401/2001
1501/2001
1601/2001
1701/2001
1801/2001
1901/2001
2001/2001
1/1608
101/1608
201/1608
301/1608
401/1608
501/1608
601/1608
701/1608
801/1608
901/1608
1001/1608
1101/1608
1201/1608
1301/1608
1401/1608
1501/1608
1601/1608


In [9]:
# Corpus sizes per source.
print(f"PMC {len(pmc_corps)}", f"Twitter {len(twitter_corps)}", sep="\n")

# Sizes of the three splits.
print(
    f"Train : {len(train)}",
    f"Testa : {len(testa)}",
    f"Testb : {len(testb)}",
    sep="\n",
)

PMC 1000
Twitter 607
Train : 964
Testa : 321
Testb : 322


In [10]:
# Write the three splits under ./twimed_conll_03/conll_03/. The train
# split is written last, so its output path is the value of this cell.
convertTwiMedConll_03(corps=testa, basedir="./twimed_conll_03", corpsname="eng.testa")
convertTwiMedConll_03(corps=testb, basedir="./twimed_conll_03", corpsname="eng.testb")
convertTwiMedConll_03(corps=train, basedir="./twimed_conll_03", corpsname="eng.train")


./twimed_conll_03/conll_03/eng.testa
writing ./twimed_conll_03/conll_03/eng.testa ...
1/321
101/321
201/321
301/321
./twimed_conll_03/conll_03/eng.testb
writing ./twimed_conll_03/conll_03/eng.testb ...
1/322
101/322
201/322
301/322
./twimed_conll_03/conll_03/eng.train
writing ./twimed_conll_03/conll_03/eng.train ...
1/964
101/964
201/964
301/964
401/964
501/964
601/964
701/964
801/964
901/964


'./twimed_conll_03/conll_03/eng.train'

# Train Model

In [None]:
from jsonCorps2conll03 import rmdir
# NOTE(review): this import rebinds the global name `train`, which the
# split cell earlier assigned to the training-split list. Re-running the
# conversion cell after this one would pass the function, not the list.
from flairNER import train

# Folder passed to train() as the corpus location (the CoNLL-03 files
# themselves were written into its conll_03 sub-folder).
conll_03_corps_folder = 'twimed_conll_03'
# Folder passed to train() as the model output location.
model_output_folder = 'twimed-ner'
# NOTE(review): rmdir() is called on the corpus folder right before
# train() reads from it. The behaviour of rmdir is not visible here -
# confirm this was not meant to clear model_output_folder instead.
rmdir(conll_03_corps_folder)
train(conll_03_corps_folder, model_output_folder)

2019-03-18 17:56:26,460 Reading data from twimed_conll_03/conll_03
2019-03-18 17:56:26,460 Train: twimed_conll_03/conll_03/eng.train
2019-03-18 17:56:26,460 Dev: twimed_conll_03/conll_03/eng.testa
2019-03-18 17:56:26,461 Test: twimed_conll_03/conll_03/eng.testb
[b'<unk>', b'O', b'S-PERSON', b'S-Drug', b'B-CARDINAL', b'I-CARDINAL', b'E-CARDINAL', b'B-Disease_Symptom', b'E-Disease_Symptom', b'I-Disease_Symptom', b'S-ORG', b'S-Disease_Symptom', b'B-DATE', b'I-DATE', b'E-DATE', b'S-CARDINAL', b'B-ORG', b'I-ORG', b'E-ORG', b'S-NORP', b'B-NORP', b'E-NORP', b'S-MONEY', b'S-GPE', b'S-DATE', b'B-TIME', b'E-TIME', b'B-PERSON', b'E-PERSON', b'B-PERCENT', b'E-PERCENT', b'B-MONEY', b'E-MONEY', b'S-ORDINAL', b'B-GPE', b'E-GPE', b'S-PRODUCT', b'I-PERCENT', b'B-Drug', b'E-Drug', b'I-PERSON', b'S-TIME', b'B-QUANTITY', b'E-QUANTITY', b'I-TIME', b'B-LAW', b'E-LAW', b'B-FAC', b'I-FAC', b'E-FAC', b'B-LOC', b'E-LOC', b'S-LOC', b'I-QUANTITY', b'B-PRODUCT', b'E-PRODUCT', b'I-GPE', b'B-WORK_OF_ART', b'I-WORK_O

## Test the NER model

In [1]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Folder that contains the trained model file loaded below.
model_output_folder = 'twimed-ner'

# Build a sample Sentence from a PubMed-style abstract.
# NOTE(review): `sentence` is not used by detect() below, which builds its
# own Sentence from the text it is given - confirm whether it is still needed.
sentence = Sentence("""
Previous studies have demonstrated that glucocorticoid hormones, including dexamethasone, induced alterations in intracellular calcium homeostasis in acute lymphoblastic leukemia (ALL) cells. However, the mechanism by which intracellular calcium homeostasis participates in dexamethasone sensitivity and resistance on ALL cells remains elusive. Here, we found that treatment of cells with dexamethasone resulted in increased intracellular calcium concentrations through store-operated calcium entry stimulation, which was curtailed by store-operated calcium channel blockers. We show that BAPTA-AM, an intracellular Ca2+ chelator, synergistically enhances dexamethasone lethality in two human ALL cell lines and in three primary specimens. This effect correlated with the inhibition of the prosurvival kinase ERK1/2 signaling pathway. Chelating intracellular calcium with Bapta-AM or inhibiting ERK1/2 with PD98059 significantly potentiated dexamethasone-induced mitochondrial membrane potential collapse, reactive oxygen species production, cytochrome c release, caspase-3 activity, and cell death. Moreover, we show that thapsigargin elevates intracellular free calcium ion level, and activates ERK1/2 signaling, resulting in the inhibition of dexamethasone-induced ALL cells apoptosis. Together, these results indicate that calcium-related ERK1/2 signaling pathway contributes to protect cells from dexamethasone sensitivity by limiting mitochondrial apoptotic pathway. This report provides a novel resistance pathway underlying the regulatory effect of dexamethasone on ALL cells.
""")
# Load the trained NER tagger from <model_output_folder>/final-model.pt.
tagger = SequenceTagger.load_from_file(model_output_folder + '/final-model.pt')

def detect(tagger, text):
    """Tag ``text`` with ``tagger`` and print the result.

    Prints a separator line, the tagged sentence, a second separator and
    then every span returned by ``get_spans('ner')``, one per line.

    Args:
        tagger: object exposing ``predict(sentence)``.
        text: raw text to wrap in a ``Sentence`` and tag.
    """
    print('===============================================')
    tagged = Sentence(text)
    tagger.predict(tagged)
    print(tagged)
    print('--------------------------------')

    # one line per recognised entity span
    for span in tagged.get_spans('ner'):
        print(span)

    

2019-03-19 13:32:25,154 loading file twimed-ner/final-model.pt


In [2]:
# Example 1: a tweet that mentions the drug fluoxetine and two time spans.
detect(tagger, """Starting back on fluoxetine tonight, I did pick up the prescription a few days ago but in the past when I’ve started/upped meds I’ve had to call in sick to work due to side effects. So I waited. Now I have 3 days off to adjust""")


Sentence: "Starting back on fluoxetine tonight, I did pick up the prescription a few days ago but in the past when I’ve started/upped meds I’ve had to call in sick to work due to side effects. So I waited. Now I have 3 days off to adjust" - 46 Tokens
--------------------------------
Drug-span [4]: "fluoxetine"
DATE-span [13,14]: "few days"
DATE-span [42,43]: "3 days"


In [4]:
# Example 2: a tweet with hashtags, a user handle and author names.
detect(tagger, """Publication alert: combining CBT with #fluoxetine might be superior to either therapy for adolescents with #depression. Model-based random forest method applied in a study by @HeidiBaya Seibold, T.Hothorn, S.Foster, M.Mohler-Kuo, """)

Sentence: "Publication alert: combining CBT with fluoxetine might be superior to either therapy for adolescents with depression. Model-based random forest method applied in a study by @HeidiBaya Seibold, T.Hothorn, S.Foster, M.Mohler-Kuo," - 30 Tokens
--------------------------------
ORG-span [4]: "CBT"
Drug-span [6]: "fluoxetine"
ORG-span [26,27]: "@HeidiBaya Seibold,"


In [5]:
# Example 3: informal, unpunctuated text that mentions fluoxetine.
detect(tagger, """sleepless nights, feeling worthless, lifes trash, this shit aint worth it man, fluoxetine isnt doing nothing. ive come to conclusion life is fucking worthless and i wish everyone the best, fuck this, i cant handle this shit mentally anymore, fuck life,  im done with life.  bye""")

Sentence: "sleepless nights, feeling worthless, lifes trash, this shit aint worth it man, fluoxetine isnt doing nothing. ive come to conclusion life is fucking worthless and i wish everyone the best, fuck this, i cant handle this shit mentally anymore, fuck life, im done with life. bye" - 46 Tokens
--------------------------------
Drug-span [13]: "fluoxetine"


In [6]:
# Example 4: text that mentions tylenol and antibiotics.
detect(tagger, """i’m i’m terrified of putting anything in my body (that’s why i don’t smoke and rarely drink). i’m even scared of taking tylenol sometimes. antibiotics sound like the end of the world to me. idk. help.""")

Sentence: "i’m i’m terrified of putting anything in my body (that’s why i don’t smoke and rarely drink). i’m even scared of taking tylenol sometimes. antibiotics sound like the end of the world to me. idk. help." - 36 Tokens
--------------------------------
Disease_Symptom-span [14]: "smoke"
Drug-span [23]: "tylenol"
