In [None]:
import flair.datasets
from pathlib import Path
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, BertEmbeddings,  ELMoEmbeddings, FlairEmbeddings, WordEmbeddings, PooledFlairEmbeddings, StackedEmbeddings, CharacterEmbeddings, CharLMEmbeddings
from flair.models import LanguageModel
from gensim.models.keyedvectors import KeyedVectors
from typing import List
import os

In [None]:
# Root folder of the EBI "standard-IOB" NER dataset.
# NOTE(review): this cell was originally labelled "Disease", but the recorded
# predictions at the end of the notebook also contain GP spans - confirm scope.
EBI_data_folder = '/nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard-IOB'

# Column index -> layer name for the column-formatted files:
# column 0 is the token text, column 1 is its NER tag.
columns = dict(enumerate(('text', 'ner')))

In [None]:
# Build the corpus from the three split files found in EBI_data_folder,
# interpreting each file with the `columns` layout.
# NOTE(review): in_memory=False presumably avoids holding every sentence in
# RAM at once - confirm against the installed flair version.
EBI = ColumnCorpus(
    EBI_data_folder,
    columns,
    train_file='train.csv',
    test_file='test.csv',
    dev_file='dev.csv',
    in_memory=False,
)



In [None]:
# Name of the label layer to predict; same name as column 1 in `columns`.
tag_type = 'ner'

# Step 3: derive the tag dictionary for that layer from the corpus, then
# print its index -> item list as a quick sanity check of the label set.
tag_dictionary = EBI.make_tag_dictionary(
    tag_type=tag_type,
)
print(tag_dictionary.idx2item)

In [None]:
# Step 4: the individual token embeddings that will be stacked.
# List order is kept exactly as before: GloVe, the two 'news' Flair models,
# character embeddings, then the two max-pooled 'pubmed' Flair models
# (backward before forward).
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    *[FlairEmbeddings(model_name)
      for model_name in ('news-forward', 'news-backward')],
    CharacterEmbeddings(),
    *[PooledFlairEmbeddings(model_name, pooling='max')
      for model_name in ('pubmed-backward', 'pubmed-forward')],
]

In [None]:
# Wrap every entry of `embedding_types` in a single stacked embedding object,
# which is what the sequence tagger receives.
embeddings: StackedEmbeddings = StackedEmbeddings(
    embeddings=embedding_types,
)

In [None]:
# Step 5: build the sequence tagger.
from flair.models import SequenceTagger

# The tagger uses the stacked embeddings and the corpus-derived tag
# dictionary, with a hidden size of 256 and the CRF option switched on.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
    use_crf=True,
)

In [None]:
# Step 6: build the trainer.
from flair.trainers import ModelTrainer

# Pair the tagger with the EBI corpus it will be trained on.
trainer: ModelTrainer = ModelTrainer(
    tagger,
    EBI,
)

In [None]:
# Step 7: run training. The first argument is the output directory; the
# inference cell later loads best-model.pt from this same directory.
trainer.train(
    '/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=150,
)

In [1]:
from flair.data import Sentence, Token
from flair.models import SequenceTagger, TextClassifier

from nltk.tokenize import wordpunct_tokenize

# Load the trained tagger: best-model.pt from the directory that was passed
# to trainer.train().
flair_model = SequenceTagger.load(
    '/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI/best-model.pt'
)

# Example sentence to tag. An alternative example that can be swapped in:
# text = 'Conditions for production, and some characteristics, of mycobacterial growth inhibitory factor produced by spleen cells from mice immunized with viable cells of the attenuated H37Ra strain of Mycobacterium tuberculosis.'
text = 'For example, the chemical inhibition of IL4I1 activity may represent a new adjuvant strategy for the treatment of cancer by restoring specific anti-tumor immune responses.'

# Pre-tokenize with NLTK and re-join on single spaces, so punctuation reaches
# the Sentence as separate space-delimited tokens (e.g. "anti - tumor").
tokenized_text = ' '.join(wordpunct_tokenize(text))
sentence = Sentence(tokenized_text)

# predict() annotates `sentence` in place; its return value is not used.
flair_model.predict(sentence)

# Show the tagged result as a dict. Spans expected for the example above:
# [<GP-span (8): "IL4I1">, <DS-span (20): "cancer">, <DS-span (26): "tumor">]
sentence.to_dict(tag_type='ner')

2019-10-10 16:54:47,803 loading file /nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI/best-model.pt


{'text': 'For example , the chemical inhibition of IL4I1 activity may represent a new adjuvant strategy for the treatment of cancer by restoring specific anti - tumor immune responses .',
 'labels': [],
 'entities': [{'text': 'IL4I1',
   'start_pos': 41,
   'end_pos': 46,
   'type': 'GP',
   'confidence': 0.9873186945915222},
  {'text': 'cancer',
   'start_pos': 115,
   'end_pos': 121,
   'type': 'DS',
   'confidence': 0.9942744374275208},
  {'text': 'tumor',
   'start_pos': 151,
   'end_pos': 156,
   'type': 'DS',
   'confidence': 0.9886742234230042}]}