In [1]:
import apex
from flair.data import TaggedCorpus 
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, CharacterEmbeddings
from typing import List

def train(data_folder, model_output_folder, hidden_size=256, mini_batch_size=256, max_epochs=150):
    """Train a Flair NER sequence tagger and plot its training curves.

    The corpus is loaded with the ``NLPTask.CONLL_03`` layout rooted at
    ``data_folder``. In the run logged in this notebook the fetcher read
    ``<data_folder>/conll_03/eng.train``, ``eng.testa`` and ``eng.testb``.

    Args:
        data_folder: Base path handed to ``NLPTaskDataFetcher.load_corpus``.
        model_output_folder: Directory passed to ``ModelTrainer.train``; the
            ``loss.tsv`` and ``weights.txt`` files plotted at the end are
            read from this directory. May be a ``str`` or a path-like object.
        hidden_size: Hidden size given to ``SequenceTagger`` (default 256).
        mini_batch_size: Mini-batch size given to the trainer (default 256).
        max_epochs: Maximum number of training epochs (default 150).

    Returns:
        None. Results are produced as side effects (files in
        ``model_output_folder`` and the plots).
    """
    import os

    from flair.models import SequenceTagger
    from flair.trainers import ModelTrainer
    from flair.visual.training_curves import Plotter

    # 1. load the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path=data_folder)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=hidden_size,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train(model_output_folder,
                  mini_batch_size=mini_batch_size,
                  max_epochs=max_epochs)

    # 8. plot training curves (optional)
    # os.path.join accepts str and path-like folders alike and always yields a
    # str, so a pathlib.Path output folder no longer fails with a TypeError
    # (Path + str) only after the whole training run has already finished.
    plotter = Plotter()
    plotter.plot_training_curves(os.path.join(model_output_folder, 'loss.tsv'))
    plotter.plot_weights(os.path.join(model_output_folder, 'weights.txt'))


In [None]:

# Locations used by this run.
#   data_folder         -> base path given to the corpus loader; the log output
#                          of this cell shows the train/dev/test files being
#                          read from its `conll_03` subfolder
#   model_output_folder -> directory handed to the trainer; the testing cell
#                          loads `final-model.pt` from here
data_folder = '/home/vtssogari/project/final_corps'
model_output_folder = '/home/vtssogari/project/medicine-ner'

# Start the training run.
train(data_folder=data_folder, model_output_folder=model_output_folder)

2019-03-04 17:47:12,485 Reading data from /home/vtssogari/project/final_corps/conll_03
2019-03-04 17:47:12,486 Train: /home/vtssogari/project/final_corps/conll_03/eng.train
2019-03-04 17:47:12,486 Dev: /home/vtssogari/project/final_corps/conll_03/eng.testa
2019-03-04 17:47:12,486 Test: /home/vtssogari/project/final_corps/conll_03/eng.testb
[b'<unk>', b'O', b'S-MEDICINE', b'B-ORG', b'I-ORG', b'E-ORG', b'S-ORG', b'S-CARDINAL', b'B-CARDINAL', b'E-CARDINAL', b'S-PERSON', b'I-CARDINAL', b'B-LAW', b'I-LAW', b'E-LAW', b'S-NORP', b'B-PERSON', b'I-PERSON', b'E-PERSON', b'B-WORK_OF_ART', b'I-WORK_OF_ART', b'E-WORK_OF_ART', b'B-NORP', b'I-NORP', b'E-NORP', b'S-GPE', b'B-QUANTITY', b'I-QUANTITY', b'E-QUANTITY', b'B-DATE', b'I-DATE', b'E-DATE', b'S-PRODUCT', b'S-DATE', b'B-PERCENT', b'I-PERCENT', b'E-PERCENT', b'S-ORDINAL', b'B-MEDICINE', b'I-MEDICINE', b'E-MEDICINE', b'B-GPE', b'E-GPE', b'B-LOC', b'E-LOC', b'I-GPE', b'B-TIME', b'I-TIME', b'E-TIME', b'S-LOC', b'I-LOC', b'B-FAC', b'E-FAC', b'B-PRODU

In [None]:
# Smoke-test the trained tagger on a hand-written sentence.
# Relies on `model_output_folder` from the training cell above.
from flair.data import Sentence
from flair.models import SequenceTagger

# Restore the final model written into the output folder.
tagger = SequenceTagger.load_from_file(model_output_folder + '/final-model.pt')

# Build the example input.
sentence = Sentence('I love Berlin . another cd medicine is not so effective')

# Tag the sentence in place, then show it followed by the result header.
tagger.predict(sentence)
print(sentence, 'The following NER tags are found:', sep='\n')

# One line per 'ner' span returned by the sentence.
for entity in sentence.get_spans('ner'):
    print(entity)