# **Classifier #2**
Train: Yelp  
Test: Amazon

Atulya Shetty and Payton Walker

## **Create Corpus:**

In [0]:
pip install flair # must install package if first execution

In [0]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

# Directory that holds the corpus CSV files.
data_folder = 'data/S1'

# Column layout of the CSV files: column 0 is the review text,
# column 1 is the label attached to that review.
column_name_map = {
    0: "text",
    1: "label_topic",
}

# Load the classification corpus from the CSV files, skipping the header
# row and splitting fields on commas.
corpus: Corpus = CSVClassificationCorpus(
    data_folder,
    column_name_map,
    skip_header=True,
    delimiter=',',
)

## **Build and Train Model:**

In [0]:
from flair.data import Corpus
from flair.datasets import IMDB
from flair.embeddings import DocumentRNNEmbeddings, XLNetEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# print corpus distribution (train, dev, test) as a quick sanity check
# that the corpus built in the "Create Corpus" section loaded as expected
print(corpus)

In [0]:
# build the label dictionary from the corpus; it is handed to the
# TextClassifier as its label_dictionary when the model is built
label_dict = corpus.make_label_dictionary()

In [0]:
# XLNet ('xlnet-base-cased') word embeddings, wrapped in a list because the
# document embedding below takes a list of word embeddings.
word_embeddings = [XLNetEmbeddings('xlnet-base-cased')]

# RNN document embedding computed on top of the XLNet word embeddings.
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Text classifier that uses the document embedding and the label dictionary.
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# Trainer object that will fit the classifier on the corpus.
trainer = ModelTrainer(classifier, corpus)

In [0]:
# Train the sentiment classifier model.
# The first argument is the directory where the model files are saved.
# NOTE(review): patience equals max_epochs here, so learning-rate annealing
# (anneal_factor) presumably never triggers within this run — confirm intended.
trainer.train(
    'resources/taggers/sentiment-yelp',
    learning_rate=0.1,
    mini_batch_size=32,
    anneal_factor=0.5,
    patience=5,
    max_epochs=5,
)


# The train() function automatically generates performance stats by testing the model on the test.csv file.