<a href="https://colab.research.google.com/github/ted19b/saki_ss19/blob/oss-saki-ss19-exercice-2/uebung_02/flair_nlp_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Resume NER Part 4: Working with Flair NLP**

---

In this part we will use flair NLP to train a model on our data and evaluate the results. Please make sure you have set up your Google account and uploaded your files to Google drive. This Notebook should run on Google Colab. Let's change the working directory to the Google drive where our training data is, and install flair nlp.

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import os
os.chdir("/content/gdrive/My Drive/Saki_2019/data/flair")

In [0]:
# download flair library #
! pip install flair

In [0]:
# imports 
from flair.datasets import Corpus
from flair.data_fetcher import NLPTaskDataFetcher

## make sure this describes your file structure
columns = {0: 'text', 1: 'ner'}

# folder where training and test data are
data_folder = '/content/gdrive/My Drive/Saki_2019/data/flair'

# 1.0 is full data, try a much smaller number like 0.1 to test run the code
downsample = 1 

## your train file name
train_file = 'train_res_bilou.txt'

## your test file name
test_file = 'test_res_bilou.txt'

# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file=train_file, test_file=test_file, dev_file=None).downsample(downsample)
print(corpus)

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
print(tag_dictionary.idx2item)

2019-06-18 16:31:56,408 Reading data from /content/gdrive/My Drive/Saki_2019/data/flair
2019-06-18 16:31:56,410 Train: /content/gdrive/My Drive/Saki_2019/data/flair/train_res_bilou.txt
2019-06-18 16:31:56,411 Dev: None
2019-06-18 16:31:56,420 Test: /content/gdrive/My Drive/Saki_2019/data/flair/test_res_bilou.txt


  train_file, column_format
  test_file, column_format


Corpus: 167449 train + 18606 dev + 48803 test sentences
[b'<unk>', b'O', b'Skills', b'-', b'Degree', b'Companies', b'<START>', b'<STOP>']


In [0]:
from typing import List

# 4. initialize embeddings. Experiment with different embedding types to see what gets the best results
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings,FlairEmbeddings

embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings (needs a LONG time to train :-)
    #FlairEmbeddings('news-forward'),
    #FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type='ner',
                                        use_crf=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

## give your model a name and folder of your choice. Your model will be saved there for loading later 
## you can run this notebook many times with different embeddings/params and save the models with different names
model_name = 'resources/taggers/resume-ner'

# 7. start training - you can experiment with batch size if you get memory errors
# how many epochs does it take before the model stops showing improvement? Start with a big number like 150, and stop the code cell
# from running at any time - the framework will persist the best model even if you interrupt training. 
trainer.train(model_name,
              learning_rate=0.1,
              mini_batch_size=32,
              #anneal_with_restarts=True,
              max_epochs=150)

2019-06-18 16:34:25,410 ----------------------------------------------------------------------------------------------------
2019-06-18 16:34:25,412 Evaluation method: MICRO_F1_SCORE
2019-06-18 16:34:25,739 ----------------------------------------------------------------------------------------------------
2019-06-18 16:34:26,196 epoch 1 - iter 0/5233 - loss 2.14313650
2019-06-18 16:35:02,066 epoch 1 - iter 523/5233 - loss 0.40119791
2019-06-18 16:35:38,095 epoch 1 - iter 1046/5233 - loss 0.37913694
2019-06-18 16:36:13,841 epoch 1 - iter 1569/5233 - loss 0.37753708
2019-06-18 16:36:49,658 epoch 1 - iter 2092/5233 - loss 0.37006825
2019-06-18 16:37:25,528 epoch 1 - iter 2615/5233 - loss 0.36868961
2019-06-18 16:38:01,291 epoch 1 - iter 3138/5233 - loss 0.36435227
2019-06-18 16:38:37,123 epoch 1 - iter 3661/5233 - loss 0.36310076
2019-06-18 16:39:13,012 epoch 1 - iter 4184/5233 - loss 0.35998051
2019-06-18 16:39:48,741 epoch 1 - iter 4707/5233 - loss 0.35746186
2019-06-18 16:40:24,503 ep

{'dev_loss_history': [tensor(0.3289, device='cuda:0'),
  tensor(0.3259, device='cuda:0'),
  tensor(0.3213, device='cuda:0'),
  tensor(0.3191, device='cuda:0'),
  tensor(0.3153, device='cuda:0'),
  tensor(0.3122, device='cuda:0'),
  tensor(0.3113, device='cuda:0'),
  tensor(0.3080, device='cuda:0'),
  tensor(0.3106, device='cuda:0'),
  tensor(0.3121, device='cuda:0'),
  tensor(0.3065, device='cuda:0'),
  tensor(0.3078, device='cuda:0'),
  tensor(0.3042, device='cuda:0'),
  tensor(0.3016, device='cuda:0'),
  tensor(0.3008, device='cuda:0'),
  tensor(0.3017, device='cuda:0'),
  tensor(0.3005, device='cuda:0'),
  tensor(0.2983, device='cuda:0'),
  tensor(0.2988, device='cuda:0'),
  tensor(0.2979, device='cuda:0'),
  tensor(0.2978, device='cuda:0'),
  tensor(0.2976, device='cuda:0'),
  tensor(0.2963, device='cuda:0'),
  tensor(0.2972, device='cuda:0'),
  tensor(0.2953, device='cuda:0'),
  tensor(0.2954, device='cuda:0'),
  tensor(0.2957, device='cuda:0'),
  tensor(0.2955, device='cuda:0'),


In [0]:
# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/resume-ner/loss.tsv')
plotter.plot_weights('resources/taggers/resume-ner/weights.txt')