In [1]:
import os
from pathlib import Path
from typing import List

import flair.datasets
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, BertEmbeddings,  ELMoEmbeddings, FlairEmbeddings, WordEmbeddings, PooledFlairEmbeddings, StackedEmbeddings, CharacterEmbeddings, CharLMEmbeddings
from flair.models import LanguageModel
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from gensim.models.keyedvectors import KeyedVectors

In [2]:
# Disease
EBI_data_folder = '/nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard-IOB'
# define columns
columns = {0: 'text', 1: 'ner'}

In [3]:
EBI = ColumnCorpus(EBI_data_folder, columns, 
                              train_file='train.csv',  test_file='test.csv', dev_file='dev.csv', in_memory=False)



2019-10-07 15:42:52,514 Reading data from /nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard-IOB
2019-10-07 15:42:52,518 Train: /nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard-IOB/train.csv
2019-10-07 15:42:52,518 Dev: /nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard-IOB/dev.csv
2019-10-07 15:42:52,518 Test: /nfs/gns/literature/Santosh_Tirunagari/NER_Datasets/EBI_standard-IOB/test.csv


In [4]:
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = EBI.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

[b'<unk>', b'O', b'B-OG', b'I-OG', b'B-GP', b'I-GP', b'B-DS', b'I-DS', b'<START>', b'<STOP>']


In [5]:
embedding_types: List[TokenEmbeddings] = [
                   WordEmbeddings('glove'),
                   FlairEmbeddings('news-forward'),
                   FlairEmbeddings('news-backward'),
                   CharacterEmbeddings(), 
                   PooledFlairEmbeddings('pubmed-backward', pooling='max'),
                   PooledFlairEmbeddings('pubmed-forward', pooling='max'),
                   ]

In [6]:
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [7]:
# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

In [8]:
# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, EBI)

In [9]:
trainer.train('/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)

2019-10-07 15:50:04,170 ----------------------------------------------------------------------------------------------------
2019-10-07 15:50:04,172 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_3): CharacterEmbeddings(
      (char_embedding): Embedding(275, 25)
      (char_rnn): LSTM(25, 25, bidirectional=True)
    )
    (list_embedding_4): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbed

2019-10-07 16:44:39,804 epoch 4 - iter 270/456 - loss 1.95536621 - samples/sec: 17.60
2019-10-07 16:46:03,993 epoch 4 - iter 315/456 - loss 1.96126956 - samples/sec: 17.43
2019-10-07 16:47:22,083 epoch 4 - iter 360/456 - loss 1.96208022 - samples/sec: 18.96
2019-10-07 16:48:30,893 epoch 4 - iter 405/456 - loss 1.94061827 - samples/sec: 21.88
2019-10-07 16:49:55,354 epoch 4 - iter 450/456 - loss 1.92470867 - samples/sec: 17.40
2019-10-07 16:50:06,612 ----------------------------------------------------------------------------------------------------
2019-10-07 16:50:06,614 EPOCH 4 done: loss 1.9216 - lr 0.1000
2019-10-07 16:52:12,474 DEV : loss 2.140122652053833 - score 0.7964
2019-10-07 16:52:19,074 BAD EPOCHS (no improvement): 0
2019-10-07 16:52:28,158 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-10-07 16:52:31,028 epoch 5 - iter 0/456 - loss 3.16306829 - sample

2019-10-07 18:10:24,547 epoch 10 - iter 45/456 - loss 1.38721107 - samples/sec: 17.32
2019-10-07 18:11:50,862 epoch 10 - iter 90/456 - loss 1.31397347 - samples/sec: 17.11
2019-10-07 18:13:06,030 epoch 10 - iter 135/456 - loss 1.29076505 - samples/sec: 19.64
2019-10-07 18:14:12,069 epoch 10 - iter 180/456 - loss 1.31307249 - samples/sec: 22.61
2019-10-07 18:15:25,149 epoch 10 - iter 225/456 - loss 1.30262106 - samples/sec: 20.34
2019-10-07 18:16:49,430 epoch 10 - iter 270/456 - loss 1.29569068 - samples/sec: 17.47
2019-10-07 18:18:08,659 epoch 10 - iter 315/456 - loss 1.30676623 - samples/sec: 18.65
2019-10-07 18:19:16,457 epoch 10 - iter 360/456 - loss 1.31884042 - samples/sec: 21.85
2019-10-07 18:20:38,821 epoch 10 - iter 405/456 - loss 1.31988329 - samples/sec: 17.93
2019-10-07 18:21:49,043 epoch 10 - iter 450/456 - loss 1.32234167 - samples/sec: 21.21
2019-10-07 18:22:01,158 ----------------------------------------------------------------------------------------------------
2019-10

2019-10-07 19:37:29,538 EPOCH 15 done: loss 1.0957 - lr 0.1000
2019-10-07 19:39:36,689 DEV : loss 2.2867133617401123 - score 0.8134
2019-10-07 19:39:42,621 BAD EPOCHS (no improvement): 2
2019-10-07 19:39:42,629 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-10-07 19:39:46,453 epoch 16 - iter 0/456 - loss 1.47653079 - samples/sec: 661.89
2019-10-07 19:40:53,352 epoch 16 - iter 45/456 - loss 1.25017958 - samples/sec: 22.05
2019-10-07 19:42:01,315 epoch 16 - iter 90/456 - loss 1.15448587 - samples/sec: 21.79
2019-10-07 19:43:30,103 epoch 16 - iter 135/456 - loss 1.13304041 - samples/sec: 16.64
2019-10-07 19:44:39,498 epoch 16 - iter 180/456 - loss 1.15119070 - samples/sec: 21.22
2019-10-07 19:45:51,886 epoch 16 - iter 225/456 - loss 1.13928588 - samples/sec: 20.48
2019-10-07 19:47:02,936 epoch 16 - iter 270/456 - loss 1.12415864 - samples/sec: 20.83
2019-10-07 19:48:2

2019-10-07 21:01:37,030 epoch 21 - iter 270/456 - loss 0.89754135 - samples/sec: 22.01
2019-10-07 21:02:49,257 epoch 21 - iter 315/456 - loss 0.87804291 - samples/sec: 20.50
2019-10-07 21:04:21,017 epoch 21 - iter 360/456 - loss 0.87886730 - samples/sec: 16.03
2019-10-07 21:05:45,088 epoch 21 - iter 405/456 - loss 0.88160300 - samples/sec: 17.55
2019-10-07 21:06:52,267 epoch 21 - iter 450/456 - loss 0.87498086 - samples/sec: 22.12
2019-10-07 21:07:07,671 ----------------------------------------------------------------------------------------------------
2019-10-07 21:07:07,674 EPOCH 21 done: loss 0.8756 - lr 0.0500
2019-10-07 21:09:08,312 DEV : loss 2.1231839656829834 - score 0.825
2019-10-07 21:09:14,415 BAD EPOCHS (no improvement): 3
2019-10-07 21:09:14,421 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-10-07 21:09:17,431 epoch 22 - iter 0/456 - loss 1.07967091 -

train mode resetting embeddings
train mode resetting embeddings
2019-10-07 22:24:23,152 epoch 27 - iter 0/456 - loss 1.25379276 - samples/sec: 1088.40
2019-10-07 22:25:27,366 epoch 27 - iter 45/456 - loss 0.79671194 - samples/sec: 22.96
2019-10-07 22:26:40,854 epoch 27 - iter 90/456 - loss 0.74080349 - samples/sec: 20.29
2019-10-07 22:27:57,916 epoch 27 - iter 135/456 - loss 0.71781085 - samples/sec: 19.21
2019-10-07 22:29:10,321 epoch 27 - iter 180/456 - loss 0.70325023 - samples/sec: 20.58
2019-10-07 22:30:23,606 epoch 27 - iter 225/456 - loss 0.71074245 - samples/sec: 20.17
2019-10-07 22:31:29,232 epoch 27 - iter 270/456 - loss 0.71546454 - samples/sec: 22.62
2019-10-07 22:32:48,180 epoch 27 - iter 315/456 - loss 0.71408668 - samples/sec: 18.69
2019-10-07 22:34:08,255 epoch 27 - iter 360/456 - loss 0.70840241 - samples/sec: 18.54
2019-10-07 22:35:29,317 epoch 27 - iter 405/456 - loss 0.70425840 - samples/sec: 18.11
2019-10-07 22:36:55,327 epoch 27 - iter 450/456 - loss 0.70859727 - 

2019-10-07 23:53:25,613 epoch 32 - iter 450/456 - loss 0.67270741 - samples/sec: 16.54
2019-10-07 23:53:35,613 ----------------------------------------------------------------------------------------------------
2019-10-07 23:53:35,615 EPOCH 32 done: loss 0.6726 - lr 0.0125
2019-10-07 23:55:52,326 DEV : loss 2.1185576915740967 - score 0.8295
2019-10-07 23:55:58,742 BAD EPOCHS (no improvement): 1
2019-10-07 23:55:58,753 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-10-07 23:56:07,267 epoch 33 - iter 0/456 - loss 0.90238845 - samples/sec: 1113.71
2019-10-07 23:57:32,321 epoch 33 - iter 45/456 - loss 0.81601547 - samples/sec: 17.39
2019-10-07 23:59:05,110 epoch 33 - iter 90/456 - loss 0.74550254 - samples/sec: 15.94
2019-10-08 00:00:30,903 epoch 33 - iter 135/456 - loss 0.73542568 - samples/sec: 17.24
2019-10-08 00:01:49,197 epoch 33 - iter 180/456 - loss 0.70774171 

2019-10-08 01:24:51,609 epoch 38 - iter 180/456 - loss 0.67606408 - samples/sec: 16.94
2019-10-08 01:26:11,368 epoch 38 - iter 225/456 - loss 0.68392146 - samples/sec: 18.71
2019-10-08 01:27:35,614 epoch 38 - iter 270/456 - loss 0.67592189 - samples/sec: 17.60
2019-10-08 01:28:56,661 epoch 38 - iter 315/456 - loss 0.67750228 - samples/sec: 18.37
2019-10-08 01:30:22,173 epoch 38 - iter 360/456 - loss 0.66578428 - samples/sec: 17.43
2019-10-08 01:31:53,724 epoch 38 - iter 405/456 - loss 0.65826360 - samples/sec: 16.23
2019-10-08 01:33:13,519 epoch 38 - iter 450/456 - loss 0.65752016 - samples/sec: 18.49
2019-10-08 01:33:22,375 ----------------------------------------------------------------------------------------------------
2019-10-08 01:33:22,377 EPOCH 38 done: loss 0.6557 - lr 0.0125
2019-10-08 01:35:40,910 DEV : loss 2.108098268508911 - score 0.8274
Epoch    37: reducing learning rate of group 0 to 6.2500e-03.
2019-10-08 01:35:48,306 BAD EPOCHS (no improvement): 4
2019-10-08 01:35:4

2019-10-08 02:58:32,258 DEV : loss 2.1796014308929443 - score 0.8283
2019-10-08 02:58:38,806 BAD EPOCHS (no improvement): 1
2019-10-08 02:58:38,813 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-10-08 02:58:43,427 epoch 44 - iter 0/456 - loss 1.15151811 - samples/sec: 535.00
2019-10-08 03:00:05,641 epoch 44 - iter 45/456 - loss 0.67674232 - samples/sec: 17.93
2019-10-08 03:01:19,042 epoch 44 - iter 90/456 - loss 0.64233043 - samples/sec: 20.45
2019-10-08 03:02:36,548 epoch 44 - iter 135/456 - loss 0.64369907 - samples/sec: 19.17
2019-10-08 03:03:54,835 epoch 44 - iter 180/456 - loss 0.63887780 - samples/sec: 18.83
2019-10-08 03:05:15,810 epoch 44 - iter 225/456 - loss 0.64486166 - samples/sec: 18.66
2019-10-08 03:06:42,566 epoch 44 - iter 270/456 - loss 0.62043053 - samples/sec: 17.12
2019-10-08 03:07:58,583 epoch 44 - iter 315/456 - loss 0.62222733 - samples/sec: 

2019-10-08 04:30:11,812 epoch 49 - iter 315/456 - loss 0.62726007 - samples/sec: 16.75
2019-10-08 04:31:36,472 epoch 49 - iter 360/456 - loss 0.61687392 - samples/sec: 17.28
2019-10-08 04:32:59,363 epoch 49 - iter 405/456 - loss 0.62055255 - samples/sec: 17.88
2019-10-08 04:34:29,010 epoch 49 - iter 450/456 - loss 0.62074983 - samples/sec: 16.64
2019-10-08 04:34:38,797 ----------------------------------------------------------------------------------------------------
2019-10-08 04:34:38,800 EPOCH 49 done: loss 0.6188 - lr 0.0016
2019-10-08 04:36:51,704 DEV : loss 2.1806163787841797 - score 0.8285
2019-10-08 04:36:58,622 BAD EPOCHS (no improvement): 3
2019-10-08 04:36:58,627 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-10-08 04:37:02,161 epoch 50 - iter 0/456 - loss 0.62928748 - samples/sec: 917.14
2019-10-08 04:38:17,368 epoch 50 - iter 45/456 - loss 0.72006619 

2019-10-08 05:56:32,090 epoch 55 - iter 0/456 - loss 0.56281543 - samples/sec: 488.84
2019-10-08 05:57:41,854 epoch 55 - iter 45/456 - loss 0.70817811 - samples/sec: 21.25
2019-10-08 05:58:59,887 epoch 55 - iter 90/456 - loss 0.66984853 - samples/sec: 19.02
2019-10-08 06:00:27,548 epoch 55 - iter 135/456 - loss 0.67321863 - samples/sec: 16.84
2019-10-08 06:01:41,997 epoch 55 - iter 180/456 - loss 0.67021270 - samples/sec: 19.77
2019-10-08 06:03:05,804 epoch 55 - iter 225/456 - loss 0.67232278 - samples/sec: 17.68
2019-10-08 06:04:17,446 epoch 55 - iter 270/456 - loss 0.66111263 - samples/sec: 20.68
2019-10-08 06:05:40,231 epoch 55 - iter 315/456 - loss 0.65552532 - samples/sec: 17.82
2019-10-08 06:07:04,163 epoch 55 - iter 360/456 - loss 0.65225200 - samples/sec: 17.89
2019-10-08 06:08:26,785 epoch 55 - iter 405/456 - loss 0.64861587 - samples/sec: 17.88
2019-10-08 06:09:39,003 epoch 55 - iter 450/456 - loss 0.63941417 - samples/sec: 20.55
2019-10-08 06:09:47,996 ----------------------

2019-10-08 07:26:32,481 epoch 60 - iter 450/456 - loss 0.61406680 - samples/sec: 19.93
2019-10-08 07:26:39,941 ----------------------------------------------------------------------------------------------------
2019-10-08 07:26:39,943 EPOCH 60 done: loss 0.6126 - lr 0.0002
2019-10-08 07:28:51,615 DEV : loss 2.1760828495025635 - score 0.828
2019-10-08 07:28:57,577 BAD EPOCHS (no improvement): 2
2019-10-08 07:28:57,584 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2019-10-08 07:29:00,312 epoch 61 - iter 0/456 - loss 1.38991749 - samples/sec: 1225.48
2019-10-08 07:30:08,046 epoch 61 - iter 45/456 - loss 0.68566445 - samples/sec: 22.12
2019-10-08 07:31:18,145 epoch 61 - iter 90/456 - loss 0.67579664 - samples/sec: 20.96
2019-10-08 07:32:39,159 epoch 61 - iter 135/456 - loss 0.65860909 - samples/sec: 18.28
2019-10-08 07:34:05,179 epoch 61 - iter 180/456 - loss 0.64750816 -

{'test_score': 0.8302,
 'dev_score_history': [0.7024,
  0.7586,
  0.7931,
  0.7964,
  0.7928,
  0.775,
  0.7891,
  0.7993,
  0.8086,
  0.8213,
  0.7984,
  0.8084,
  0.8291,
  0.8208,
  0.8134,
  0.8279,
  0.8161,
  0.8308,
  0.8222,
  0.8252,
  0.825,
  0.8294,
  0.8309,
  0.8279,
  0.8276,
  0.8314,
  0.8306,
  0.8282,
  0.8301,
  0.826,
  0.8321,
  0.8295,
  0.8244,
  0.8323,
  0.8295,
  0.8287,
  0.8295,
  0.8274,
  0.8271,
  0.8278,
  0.8259,
  0.8278,
  0.8283,
  0.826,
  0.8296,
  0.8292,
  0.8287,
  0.829,
  0.8285,
  0.8276,
  0.8277,
  0.8274,
  0.8275,
  0.8274,
  0.8276,
  0.828,
  0.8281,
  0.8279,
  0.828,
  0.828,
  0.828,
  0.8281],
 'train_loss_history': [5.715853870437856,
  2.6763975607198582,
  2.177585772124299,
  1.9215936441170542,
  1.7384305758434428,
  1.600583697554835,
  1.5187676390142817,
  1.433124551255452,
  1.376603912300708,
  1.3189236416497774,
  1.2372313434617561,
  1.2379289431530132,
  1.1819464059775335,
  1.1420014627408563,
  1.095679702792774

In [1]:
from flair.models import TextClassifier
from flair.data import Sentence, Token
from flair.models import SequenceTagger

from nltk.tokenize import wordpunct_tokenize

# load the model you trained

flair_model = SequenceTagger.load('/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI/best-model.pt')

# create example sentence
# text= 'Conditions for production, and some characteristics, of mycobacterial growth inhibitory factor produced by spleen cells from mice immunized with viable cells of the attenuated H37Ra strain of Mycobacterium tuberculosis.'
text = 'For example, the chemical inhibition of IL4I1 activity may represent a new adjuvant strategy for the treatment of cancer by restoring specific anti-tumor immune responses.'
sentence = Sentence(' '.join(wordpunct_tokenize(text)))
flair_model.predict(sentence)

sentence.get_spans('ner')

# [<GP-span (8): "IL4I1">, <DS-span (20): "cancer">, <DS-span (26): "tumor">]

2019-10-08 12:32:35,022 loading file /nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/multi_bio_ner_model/EBI/best-model.pt


[<GP-span (8): "IL4I1">, <DS-span (20): "cancer">, <DS-span (26): "tumor">]

In [2]:
str(sentence.get_spans('ner')).split(':')[1].replace('"','').replace(']','').replace('>','').strip()

'IL4I1, <DS-span (20)'