# **Classifier #2**
Train: Yelp  
Test: Amazon

# **Create Corpus**

In [0]:
pip install flair # must install package if first execution

In [1]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

# Folder that holds the corpus CSV files.
# NOTE(review): the saved output below reports reading from 'data', not 'data/S1' —
# the output looks stale; confirm which folder is intended.
data_folder = 'data/S1'

# CSV column index -> role: column 0 is the review text, column 1 the sentiment label.
column_name_map = {
    0: "text",
    1: "label_topic",
}

# Load the corpus from comma-separated files, skipping the header row.
corpus: Corpus = CSVClassificationCorpus(
    data_folder,
    column_name_map,
    skip_header=True,
    delimiter=',',
)

2019-11-22 02:29:10,792 Reading data from data
2019-11-22 02:29:10,793 Train: data/train.csv
2019-11-22 02:29:10,794 Dev: data/dev.csv
2019-11-22 02:29:10,795 Test: data/test.csv


# **Train Model**

In [0]:
from flair.data import Corpus
from flair.datasets import IMDB
from flair.embeddings import DocumentRNNEmbeddings, XLNetEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# Print the corpus summary (train / dev / test distribution) as a sanity check
# before training starts.
print(corpus)

In [3]:
# Build the label dictionary from the corpus; it is handed to the
# TextClassifier below as its `label_dictionary` argument.
label_dict = corpus.make_label_dictionary()

2019-11-22 02:29:20,781 Computing label dictionary. Progress:


100%|██████████| 8000/8000 [00:35<00:00, 225.17it/s]

2019-11-22 02:29:56,465 [b'1', b'0']





In [0]:
# Word-level embeddings: a one-element list holding XLNet embeddings for the
# 'xlnet-base-cased' model. The list is consumed by DocumentRNNEmbeddings below.
word_embeddings = [XLNetEmbeddings('xlnet-base-cased')]

In [0]:
# Document-level embedding: an RNN (hidden size 512) over the XLNet word
# embeddings defined above, with words re-projected to 256 dimensions first.
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

In [0]:
# Build the text classifier from the document embeddings and the label
# dictionary computed from the corpus.
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

In [0]:
# Build the trainer that pairs the classifier with the corpus; training itself
# is started in the next cell.
trainer = ModelTrainer(classifier, corpus)

In [8]:
# Start the training. 'resources/taggers/sentiment-yelp' is the location where
# the model files are saved.
# The call is kept as the cell's last expression so that its return value
# (test score plus dev/train history) is displayed as the cell output.
# NOTE(review): patience (5) equals max_epochs (5), so the learning-rate annealing
# (anneal_factor=0.5) presumably never takes effect within this run — confirm intent.
trainer.train('resources/taggers/sentiment-yelp',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=5)

2019-11-22 02:30:18,016 ----------------------------------------------------------------------------------------------------
2019-11-22 02:30:18,022 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): XLNetEmbeddings(
        model=0-xlnet-base-cased
        (model): XLNetModel(
          (word_embedding): Embedding(32000, 768)
          (layer): ModuleList(
            (0): XLNetLayer(
              (rel_attn): XLNetRelativeAttention(
                (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (ff): XLNetFeedForward(
                (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (layer_1): Linear(in_features=768, out_features=3072, bias=True)
                (layer_2): Linear(in_features=3072, out_features=768, bias=True)
                (dropout): Dropout(p

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-22 02:50:15,709 ----------------------------------------------------------------------------------------------------
2019-11-22 02:50:22,220 epoch 2 - iter 0/250 - loss 0.51095915 - samples/sec: 175.25
2019-11-22 02:51:55,317 epoch 2 - iter 25/250 - loss 0.59434005 - samples/sec: 8.82
2019-11-22 02:53:31,992 epoch 2 - iter 50/250 - loss 0.58716238 - samples/sec: 8.48
2019-11-22 02:55:06,705 epoch 2 - iter 75/250 - loss 0.59185657 - samples/sec: 8.63
2019-11-22 02:56:44,651 epoch 2 - iter 100/250 - loss 0.57491104 - samples/sec: 8.36
2019-11-22 02:58:17,451 epoch 2 - iter 125/250 - loss 0.57212733 - samples/sec: 8.82
2019-11-22 02:59:49,118 epoch 2 - iter 150/250 - loss 0.56161584 - samples/sec: 8.99
2019-11-22 03:01:24,078 epoch 2 - iter 175/250 - loss 0.56614225 - samples/sec: 8.62
2019-11-22 03:02:57,164 epoch 2 - iter 200/250 - loss 0.56147406 - samples/sec: 8.80
2019-11-22 03:04:35,712 epoch 2 - iter 225/250 - loss 0.56306811 - samples/sec: 8.34
2019-11-22 03:06:02,678 ----

{'test_score': 0.7114,
 'dev_score_history': [0.7185, 0.7095, 0.7305, 0.782, 0.8035],
 'train_loss_history': [0.6281571127176285,
  0.5601590470075607,
  0.5360901998281479,
  0.5124937475323678,
  0.4897082913517952],
 'dev_loss_history': [tensor(0.5552, device='cuda:0'),
  tensor(0.6023, device='cuda:0'),
  tensor(0.5346, device='cuda:0'),
  tensor(0.4765, device='cuda:0'),
  tensor(0.4672, device='cuda:0')]}