# **Create Corpus**

In [0]:
pip install flair

In [13]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

# NOTE: store the split CSV files inside `data_folder` under the names
#       train.csv, dev.csv and test.csv. The corpus initializer searches the
#       folder for the train, dev and test splits automatically.

# folder that contains the train, dev and test files
data_folder = 'data'

# maps a CSV column index to its role: column 1 holds the text and
# column 3 holds a label (registered under the name "label_topic")
column_name_map = {1: "text", 3: "label_topic"}

# build the corpus from the three splits; skip_header=True skips the header
# row of the CSV files, and the fields are comma-delimited
corpus: Corpus = CSVClassificationCorpus(
    data_folder,
    column_name_map,
    skip_header=True,
    delimiter=',',
)

2019-11-18 00:17:22,412 Reading data from data
2019-11-18 00:17:22,414 Train: data/train.csv
2019-11-18 00:17:22,415 Dev: data/dev.csv
2019-11-18 00:17:22,416 Test: data/test.csv


# **Train Model**

In [14]:
from flair.data import Corpus
from flair.datasets import IMDB
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# 1. get the corpus
# The `corpus` built from the CSV files in the "Create Corpus" section is reused here.
# Disabled alternative: replace it with a 10% downsample of the IMDB dataset.
#corpus: Corpus = IMDB().downsample(0.1)
# Printing the corpus shows how many sentences are in the train, dev and test splits.
print(corpus)

Corpus: 6000 train + 2000 dev + 2000 test sentences


In [15]:
# 2. create the label dictionary
# Computes the dictionary of class labels from the corpus; it is handed to
# the TextClassifier in step 5 as its label set.
label_dict = corpus.make_label_dictionary()

2019-11-18 00:18:13,765 Computing label dictionary. Progress:


100%|██████████| 6000/6000 [00:19<00:00, 300.55it/s]

2019-11-18 00:18:33,879 [b'1', b'0']





In [16]:
# 3. make a list of word embeddings
# Only the GloVe word embeddings are enabled by default.
word_embeddings = [
    WordEmbeddings('glove'),
    # uncomment the two flair embeddings below for state-of-the-art results
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

2019-11-18 00:18:43,102 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpsahmaka1


100%|██████████| 160000128/160000128 [00:10<00:00, 15001658.61B/s]

2019-11-18 00:18:54,461 copying /tmp/tmpsahmaka1 to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2019-11-18 00:18:54,687 removing temp file /tmp/tmpsahmaka1
2019-11-18 00:18:55,316 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmp0kz75q9c


100%|██████████| 21494764/21494764 [00:02<00:00, 9315156.17B/s]

2019-11-18 00:18:58,324 copying /tmp/tmp0kz75q9c to cache at /root/.flair/embeddings/glove.gensim
2019-11-18 00:18:58,347 removing temp file /tmp/tmp0kz75q9c



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# 4. initialize document embedding by passing list of word embeddings
# The RNN type is GRU by default; pass the rnn_type parameter to choose
# a different one.
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

In [0]:
# 5. create the text classifier
# It is built from the document embeddings (step 4) and the label
# dictionary (step 2).
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
)

In [0]:
# 6. initialize the text classifier trainer
# The trainer receives the classifier from step 5 and the corpus created
# in the "Create Corpus" section.
trainer = ModelTrainer(classifier, corpus)

In [0]:
# 7. start the training
# The first argument, 'resources/taggers/ag-news', is the location where the
# model files are saved; the remaining keyword arguments are the training
# hyperparameters.
trainer.train(
    'resources/taggers/ag-news',
    learning_rate=0.1,
    mini_batch_size=32,
    anneal_factor=0.5,
    patience=5,
    max_epochs=10,
)

# **Test Model**

In [23]:
from flair.data import Sentence
from flair.models import TextClassifier

# load the final model from the directory the training step saved into
classifier = TextClassifier.load('resources/taggers/ag-news/final-model.pt')

# wrap the example text in a Sentence object
sentence = Sentence('France is the current world cup winner.')

# run the prediction, then print the labels now attached to the sentence
classifier.predict(sentence)
print(sentence.labels)

2019-11-18 00:39:43,165 loading file resources/taggers/ag-news/final-model.pt
[1 (0.7662146687507629)]
