In [1]:
import flair.datasets
from pathlib import Path
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, BertEmbeddings,  ELMoEmbeddings, FlairEmbeddings, WordEmbeddings, PooledFlairEmbeddings, StackedEmbeddings, CharacterEmbeddings, CharLMEmbeddings
from flair.models import LanguageModel
from gensim.models.keyedvectors import KeyedVectors
from typing import List
import os

In [2]:
# Folder holding the EBI standard NER dataset files.
EBI_data_folder = '/nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard'

# Column layout of the data files: column index -> annotation layer name.
columns = {
    0: 'text',
    1: 'ner',
}

In [3]:
# Build the corpus from the train/dev/test split files in the data folder.
# in_memory=False is passed through to flair unchanged.
EBI = ColumnCorpus(
    EBI_data_folder,
    columns,
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
    in_memory=False,
)



2019-09-25 15:16:03,704 Reading data from /nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard
2019-09-25 15:16:03,706 Train: /nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard/train.csv
2019-09-25 15:16:03,707 Dev: /nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard/dev.csv
2019-09-25 15:16:03,709 Test: /nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard/test.csv


In [4]:
# Annotation layer to predict (the layer name used in the column format).
tag_type = 'ner'

# 3. build the tag dictionary from the corpus and print its items
tag_dictionary = EBI.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

[b'<unk>', b'O', b'B-DS', b'I-DS', b'B-GP', b'I-GP', b'B-OG', b'I-OG', b'<START>', b'<STOP>']


In [5]:
# Embeddings to be stacked, in this order: forward/backward 'news' Flair
# embeddings, character embeddings, and max-pooled backward/forward 'pubmed'
# Flair embeddings.
embedding_types: List[TokenEmbeddings] = [
    # WordEmbeddings('glove'),  # currently disabled
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
    CharacterEmbeddings(),
    PooledFlairEmbeddings('pubmed-backward', pooling='max'),
    PooledFlairEmbeddings('pubmed-forward', pooling='max'),
]

In [6]:
# Combine the individual embeddings into a single stacked embedding.
embeddings: StackedEmbeddings = StackedEmbeddings(
    embeddings=embedding_types,
)

In [7]:
# 5. initialize sequence tagger
from flair.models import SequenceTagger

# Tagger over the stacked embeddings, hidden size 256, CRF enabled.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

In [8]:
# 6. initialize trainer
from flair.trainers import ModelTrainer

# Pair the tagger with the EBI corpus for training.
trainer: ModelTrainer = ModelTrainer(
    model=tagger,
    corpus=EBI,
)

In [9]:
# Run training; the first argument is the output directory for this run.
# Left as the cell's last expression so the returned result dict is displayed.
trainer.train(
    '/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI',
    max_epochs=150,
    mini_batch_size=32,
    learning_rate=0.1,
)

2019-09-25 15:16:53,548 ----------------------------------------------------------------------------------------------------
2019-09-25 15:16:53,551 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): CharacterEmbeddings(
      (char_embedding): Embedding(275, 25)
      (char_rnn): LSTM(25, 25, bidirectional=True)
    )
    (list_embedding_3): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (d

2019-09-25 16:08:58,307 epoch 4 - iter 301/438 - loss 2.25408986 - samples/sec: 17.29
2019-09-25 16:10:14,777 epoch 4 - iter 344/438 - loss 2.22949375 - samples/sec: 18.92
2019-09-25 16:11:22,286 epoch 4 - iter 387/438 - loss 2.22811442 - samples/sec: 21.38
2019-09-25 16:12:25,841 epoch 4 - iter 430/438 - loss 2.21112105 - samples/sec: 22.78
2019-09-25 16:12:37,884 ----------------------------------------------------------------------------------------------------
2019-09-25 16:12:37,887 EPOCH 4 done: loss 2.2148 - lr 0.1000
2019-09-25 16:14:37,556 DEV : loss 3.0276317596435547 - score 0.7665
2019-09-25 16:14:46,259 BAD EPOCHS (no improvement): 1
2019-09-25 16:14:46,266 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-09-25 16:14:49,242 epoch 5 - iter 0/438 - loss 3.61932611 - samples/sec: 1048.31
2019-09-25 16:16:25,550 epoch 5 - iter 43/438 - loss 2.47283072 - samp

2019-09-25 17:28:55,619 epoch 10 - iter 43/438 - loss 1.47930847 - samples/sec: 18.22
2019-09-25 17:30:11,425 epoch 10 - iter 86/438 - loss 1.49135714 - samples/sec: 19.50
2019-09-25 17:31:19,982 epoch 10 - iter 129/438 - loss 1.50276813 - samples/sec: 21.26
2019-09-25 17:32:28,515 epoch 10 - iter 172/438 - loss 1.47838016 - samples/sec: 21.61
2019-09-25 17:33:38,087 epoch 10 - iter 215/438 - loss 1.47564626 - samples/sec: 20.93
2019-09-25 17:34:49,512 epoch 10 - iter 258/438 - loss 1.43922985 - samples/sec: 20.37
2019-09-25 17:36:05,447 epoch 10 - iter 301/438 - loss 1.42130660 - samples/sec: 19.06
2019-09-25 17:37:12,918 epoch 10 - iter 344/438 - loss 1.41981807 - samples/sec: 22.04
2019-09-25 17:38:31,120 epoch 10 - iter 387/438 - loss 1.41103360 - samples/sec: 18.65
2019-09-25 17:39:38,993 epoch 10 - iter 430/438 - loss 1.38934595 - samples/sec: 21.48
2019-09-25 17:39:53,843 ----------------------------------------------------------------------------------------------------
2019-09

2019-09-25 18:52:40,067 EPOCH 15 done: loss 1.2107 - lr 0.0500
2019-09-25 18:54:39,061 DEV : loss 2.2987987995147705 - score 0.8292
2019-09-25 18:54:52,050 BAD EPOCHS (no improvement): 2
2019-09-25 18:54:52,062 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-09-25 18:54:56,236 epoch 16 - iter 0/438 - loss 1.94499135 - samples/sec: 675.81
2019-09-25 18:56:18,064 epoch 16 - iter 43/438 - loss 1.21979290 - samples/sec: 17.78
2019-09-25 18:57:26,668 epoch 16 - iter 86/438 - loss 1.18466965 - samples/sec: 21.05
2019-09-25 18:58:34,256 epoch 16 - iter 129/438 - loss 1.22421847 - samples/sec: 21.24
2019-09-25 18:59:44,307 epoch 16 - iter 172/438 - loss 1.23658377 - samples/sec: 20.43
2019-09-25 19:01:14,915 epoch 16 - iter 215/438 - loss 1.23572632 - samples/sec: 15.68
2019-09-25 19:02:27,451 epoch 16 - iter 258/438 - loss 1.20303097 - samples/sec: 19.83
2019-09-25 19:03:3

2019-09-25 20:15:00,320 epoch 21 - iter 258/438 - loss 1.02274702 - samples/sec: 21.03
2019-09-25 20:16:08,663 epoch 21 - iter 301/438 - loss 1.00546216 - samples/sec: 21.81
2019-09-25 20:17:30,100 epoch 21 - iter 344/438 - loss 0.98371508 - samples/sec: 18.38
2019-09-25 20:18:47,497 epoch 21 - iter 387/438 - loss 0.98195362 - samples/sec: 18.84
2019-09-25 20:20:01,406 epoch 21 - iter 430/438 - loss 0.98315675 - samples/sec: 19.95
2019-09-25 20:20:17,312 ----------------------------------------------------------------------------------------------------
2019-09-25 20:20:17,315 EPOCH 21 done: loss 0.9838 - lr 0.0250
2019-09-25 20:22:16,871 DEV : loss 2.451296806335449 - score 0.8339
2019-09-25 20:22:35,330 BAD EPOCHS (no improvement): 3
2019-09-25 20:22:35,359 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-09-25 20:22:38,197 epoch 22 - iter 0/438 - loss 0.94200081 -

train mode resetting embeddings
train mode resetting embeddings
2019-09-25 21:37:58,095 epoch 27 - iter 0/438 - loss 0.79089594 - samples/sec: 1009.54
2019-09-25 21:39:05,870 epoch 27 - iter 43/438 - loss 1.04685853 - samples/sec: 22.09
2019-09-25 21:40:19,254 epoch 27 - iter 86/438 - loss 1.01887827 - samples/sec: 19.81
2019-09-25 21:41:42,800 epoch 27 - iter 129/438 - loss 0.96019575 - samples/sec: 17.41
2019-09-25 21:42:51,738 epoch 27 - iter 172/438 - loss 0.92116842 - samples/sec: 21.00
2019-09-25 21:44:08,572 epoch 27 - iter 215/438 - loss 0.90596723 - samples/sec: 19.42
2019-09-25 21:45:21,645 epoch 27 - iter 258/438 - loss 0.91188275 - samples/sec: 19.61
2019-09-25 21:46:34,837 epoch 27 - iter 301/438 - loss 0.90375720 - samples/sec: 19.81
2019-09-25 21:47:50,861 epoch 27 - iter 344/438 - loss 0.89792924 - samples/sec: 19.05
2019-09-25 21:48:59,457 epoch 27 - iter 387/438 - loss 0.89026027 - samples/sec: 21.72
2019-09-25 21:50:08,429 epoch 27 - iter 430/438 - loss 0.88761075 - 

2019-09-25 23:03:03,793 epoch 32 - iter 430/438 - loss 0.85413696 - samples/sec: 20.49
2019-09-25 23:03:16,802 ----------------------------------------------------------------------------------------------------
2019-09-25 23:03:16,804 EPOCH 32 done: loss 0.8506 - lr 0.0031
2019-09-25 23:05:14,507 DEV : loss 2.464613437652588 - score 0.8347
2019-09-25 23:05:25,711 BAD EPOCHS (no improvement): 2
2019-09-25 23:05:25,723 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-09-25 23:05:28,374 epoch 33 - iter 0/438 - loss 2.02566242 - samples/sec: 1051.62
2019-09-25 23:06:43,545 epoch 33 - iter 43/438 - loss 0.94816678 - samples/sec: 19.10
2019-09-25 23:07:51,696 epoch 33 - iter 86/438 - loss 0.90165780 - samples/sec: 21.00
2019-09-25 23:08:54,528 epoch 33 - iter 129/438 - loss 0.89307372 - samples/sec: 22.87
2019-09-25 23:10:17,693 epoch 33 - iter 172/438 - loss 0.88710252 -

2019-09-26 00:21:57,922 epoch 38 - iter 172/438 - loss 0.84369486 - samples/sec: 17.71
2019-09-26 00:23:09,373 epoch 38 - iter 215/438 - loss 0.84628299 - samples/sec: 20.19
2019-09-26 00:24:21,709 epoch 38 - iter 258/438 - loss 0.84042099 - samples/sec: 20.52
2019-09-26 00:25:43,057 epoch 38 - iter 301/438 - loss 0.82458043 - samples/sec: 17.82
2019-09-26 00:26:55,704 epoch 38 - iter 344/438 - loss 0.82322571 - samples/sec: 20.34
2019-09-26 00:28:02,215 epoch 38 - iter 387/438 - loss 0.81515263 - samples/sec: 22.68
2019-09-26 00:29:07,165 epoch 38 - iter 430/438 - loss 0.80836436 - samples/sec: 22.87
2019-09-26 00:29:22,465 ----------------------------------------------------------------------------------------------------
2019-09-26 00:29:22,467 EPOCH 38 done: loss 0.8061 - lr 0.0016
2019-09-26 00:31:20,823 DEV : loss 2.4937572479248047 - score 0.8344
Epoch    37: reducing learning rate of group 0 to 7.8125e-04.
2019-09-26 00:31:33,654 BAD EPOCHS (no improvement): 4
2019-09-26 00:31:

2019-09-26 01:44:53,881 DEV : loss 2.5032994747161865 - score 0.8336
2019-09-26 01:45:11,913 BAD EPOCHS (no improvement): 1
2019-09-26 01:45:11,929 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-09-26 01:45:14,825 epoch 44 - iter 0/438 - loss 0.56771880 - samples/sec: 866.10
2019-09-26 01:46:24,589 epoch 44 - iter 43/438 - loss 1.00129300 - samples/sec: 21.02
2019-09-26 01:47:37,620 epoch 44 - iter 86/438 - loss 0.91370464 - samples/sec: 19.82
2019-09-26 01:48:44,679 epoch 44 - iter 129/438 - loss 0.90665658 - samples/sec: 21.82
2019-09-26 01:50:00,354 epoch 44 - iter 172/438 - loss 0.89335512 - samples/sec: 19.42
2019-09-26 01:51:07,214 epoch 44 - iter 215/438 - loss 0.85833415 - samples/sec: 21.71
2019-09-26 01:52:20,479 epoch 44 - iter 258/438 - loss 0.84986608 - samples/sec: 19.61
2019-09-26 01:53:34,345 epoch 44 - iter 301/438 - loss 0.84373600 - samples/sec: 

2019-09-26 03:05:58,282 epoch 49 - iter 301/438 - loss 0.88297278 - samples/sec: 21.09
2019-09-26 03:07:11,121 epoch 49 - iter 344/438 - loss 0.85887914 - samples/sec: 19.96
2019-09-26 03:08:31,858 epoch 49 - iter 387/438 - loss 0.86424530 - samples/sec: 17.92
2019-09-26 03:09:46,516 epoch 49 - iter 430/438 - loss 0.85395986 - samples/sec: 19.89
2019-09-26 03:10:03,104 ----------------------------------------------------------------------------------------------------
2019-09-26 03:10:03,106 EPOCH 49 done: loss 0.8542 - lr 0.0002
2019-09-26 03:12:01,533 DEV : loss 2.498610734939575 - score 0.8347
2019-09-26 03:12:13,543 BAD EPOCHS (no improvement): 3
2019-09-26 03:12:13,563 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-09-26 03:12:16,528 epoch 50 - iter 0/438 - loss 0.88764000 - samples/sec: 954.27
2019-09-26 03:13:22,889 epoch 50 - iter 43/438 - loss 0.97223109 -

{'test_score': 0.8006,
 'dev_score_history': [0.7096,
  0.7675,
  0.8021,
  0.7665,
  0.7982,
  0.7932,
  0.7886,
  0.823,
  0.8261,
  0.8256,
  0.8256,
  0.8238,
  0.8356,
  0.8302,
  0.8292,
  0.8321,
  0.8321,
  0.8387,
  0.8342,
  0.8279,
  0.8339,
  0.832,
  0.8342,
  0.8289,
  0.8349,
  0.8359,
  0.835,
  0.8351,
  0.8345,
  0.8335,
  0.8344,
  0.8347,
  0.8339,
  0.8346,
  0.8342,
  0.8347,
  0.8341,
  0.8344,
  0.8351,
  0.8343,
  0.8338,
  0.8344,
  0.8336,
  0.834,
  0.8339,
  0.8343,
  0.8344,
  0.8355,
  0.8347,
  0.8345],
 'train_loss_history': [6.252826037472242,
  3.132222697342912,
  2.52318527763837,
  2.2148423477939274,
  2.012907232323738,
  1.8805517293002507,
  1.7696477278862914,
  1.5106114340971595,
  1.4301416322793046,
  1.3886819007598101,
  1.3300228848304922,
  1.3098057652036894,
  1.2549194575853,
  1.2104461077413602,
  1.210687918545993,
  1.1786689737207814,
  1.1332555707186869,
  1.0375049700067467,
  1.0205778025056673,
  0.9872307731136339,
  0.98

In [10]:
from flair.models import TextClassifier
from flair.data import Sentence, Token
from flair.models import SequenceTagger

from nltk.tokenize import wordpunct_tokenize

# Load the trained model. This load must run in this cell: `flair_model` is not
# assigned anywhere else in the notebook, so with the line commented out the
# cell raises NameError on a fresh kernel (Restart & Run All).
flair_model = SequenceTagger.load('/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI/best-model.pt')

# create example sentence
# text= 'Conditions for production, and some characteristics, of mycobacterial growth inhibitory factor produced by spleen cells from mice immunized with viable cells of the attenuated H37Ra strain of Mycobacterium tuberculosis.'
text = 'For example, the chemical inhibition of IL4I1 activity may represent a new adjuvant strategy for the treatment of cancer by restoring specific anti-tumor immune responses.'

# Split words and punctuation with NLTK, then re-join with single spaces so the
# Sentence receives a whitespace-separated token string.
sentence = Sentence(' '.join(wordpunct_tokenize(text)))

# predict() annotates `sentence` in place; the spans are read back from it below.
flair_model.predict(sentence)

# Last expression: the predicted 'ner' spans, displayed as the cell output.
sentence.get_spans('ner')

2019-09-26 16:06:59,100 loading file /nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI/best-model.pt


[<GP-span (8): "IL4I1">, <DS-span (20): "cancer">, <DS-span (26): "tumor">]

In [None]:
# Text of the first predicted entity, or None when nothing was tagged.
# The text is read from the Span object instead of parsing str() of the span
# list, which was fragile:
#   * with several spans, split(':')[1] ran into the next span's repr
#     (the three-span example above would yield 'IL4I1, <DS-span (20)');
#   * with no spans, split(':')[1] raised IndexError;
#   * '"', ']' and '>' characters were stripped out of genuine entity text.
# For every entity instead of just the first: [span.text for span in ner_spans]
ner_spans = sentence.get_spans('ner')
ner_spans[0].text if ner_spans else None