# **Classifier #1**
Train: IMDB  
Test: Amazon

# **Create Corpus**

In [0]:
pip install flair

In [1]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

##Note: You will need to save your split CSV data files in the data_folder path with each file titled appropriately 
#       i.e. train.csv test.csv dev.csv. This is because the corpus initializers will automatically search for the 
#       train, dev, test splits in a folder.

# this is the folder in which train, test and dev files reside
data_folder = 'data'

# column format indicating which columns hold the text and label(s)
column_name_map = {0: "text", 2: "label_topic"}

# load corpus containing training, test and dev data and if CSV has a header, you can skip it
corpus: Corpus = CSVClassificationCorpus(data_folder,
                                         column_name_map,
                                         skip_header=True,
                                         delimiter=',',
)

2019-11-21 23:28:34,282 Reading data from data
2019-11-21 23:28:34,283 Train: data/train.csv
2019-11-21 23:28:34,284 Dev: data/dev.csv
2019-11-21 23:28:34,285 Test: data/test.csv


# **Train Model**

In [11]:
from flair.data import Corpus
from flair.datasets import IMDB
from flair.embeddings import DocumentRNNEmbeddings, RoBERTaEmbeddings, XLNetEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# 1. Print the corpus
print(corpus)

Corpus: 6831 train + 1708 dev + 37958 test sentences


In [12]:
# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

2019-11-21 23:41:13,106 Computing label dictionary. Progress:


100%|██████████| 6831/6831 [00:23<00:00, 295.29it/s]

2019-11-21 23:41:36,598 [b'1', b'0']





In [0]:
# Initialize RoBERTa Embeddings
#roberta_embedding = RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-base", layers="0,1,2,3,4,5,6,7,8,9,10,11,12", pooling_operation="first", use_scalar_mix=True)

# Initialize XLNet
#xlnet_embedding = XLNetEmbeddings('xlnet-base-cased')

# 3. make a list of word embeddings
word_embeddings = [XLNetEmbeddings('xlnet-base-cased'), RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-base", layers="0,1,2,3,4,5,6,7,8,9,10,11,12", pooling_operation="first", use_scalar_mix=True)]

#stacked_embeddings = StackedEmbeddings(embeddings=[roberta_embedding, xlnet_embedding])

In [0]:
# 4. initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                                     hidden_size=512,
                                                                     reproject_words=True,
                                                                     reproject_words_dimension=256,
                                                                     )

In [0]:
# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

In [0]:
# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

In [25]:
# 7. start the training
  # 'resources/taggers/sentiment-yelp' is the location where you want to save your model files.
trainer.train('resources/taggers/sentiment-yelp',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=10)

2019-11-21 23:49:23,980 ----------------------------------------------------------------------------------------------------
2019-11-21 23:49:23,988 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): XLNetEmbeddings(
        model=0-xlnet-base-cased
        (model): XLNetModel(
          (word_embedding): Embedding(32000, 768)
          (layer): ModuleList(
            (0): XLNetLayer(
              (rel_attn): XLNetRelativeAttention(
                (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (ff): XLNetFeedForward(
                (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (layer_1): Linear(in_features=768, out_features=3072, bias=True)
                (layer_2): Linear(in_features=3072, out_features=768, bias=True)
                (dropout): Dropout(p

Token indices sequence length is longer than the specified maximum sequence length for this model (824 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: ignored

# **Test:**

In [0]:
from flair.data import Sentence
from flair.models import TextClassifier

classifier = TextClassifier.load('resources/taggers/sentiment-yelp/final-model.pt')

# create example sentence
sentence = Sentence('France is the current world cup winner.')

# predict class and print
classifier.predict(sentence)

print(sentence.labels)

2019-11-18 00:39:43,165 loading file resources/taggers/ag-news/final-model.pt
[1 (0.7662146687507629)]


# **10-Fold Cross Validation**

In [0]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [0]:
# Read in training and test data
input_data = pd.read_csv('data/TrainingData.csv',encoding = "ISO-8859-1",low_memory=False)

# Get index of "Botometer_Score" column
idx_bot = input_data.columns.get_loc("Botometer_Score")

# Generate list of bot labels for all rows in train and test data
## 1 = Not-Bot, 0 = Bot
bot_labels = list(map(lambda x: 1 if x <= 1 else (0 if x >= 3.5 else np.nan), input_data['Botometer_Score']))

# Insert the new column of bot labels at the position immediately after the "Botometer_Score" column
input_data.insert(idx_bot+1, "Bot_or_Not_Bot", bot_labels, True)

In [0]:
# Filter data to only include rows that are labelled "Bot" or "Not-Bot"
input_data_filtered = input_data[(input_data["Bot_or_Not_Bot"] == 1) | (input_data["Bot_or_Not_Bot"] == 0)]

# Filter data to remove rows with nan values in verified and default_profile columns
input_data_filtered = input_data_filtered[(input_data_filtered["verified"] == True) | (input_data_filtered["verified"] == False)]
input_data_filtered = input_data_filtered[(input_data_filtered["default_profile"] == True) | (input_data_filtered["default_profile"] == False)]

In [0]:
# Split data into train and test sets
test_data, train_data = train_test_split(input_data_filtered, test_size=0.33, stratify=input_data_filtered["Bot_or_Not_Bot"])

# Extract attribute columns into new table.
train_attr = train_data[['Botometer_Score', 'Bot_or_Not_Bot', 'followers_count', 'friends_count', 'listed_count', 'favourites_count', 'verified', 'default_profile']]
test_attr = test_data[['Botometer_Score', 'Bot_or_Not_Bot', 'followers_count', 'friends_count', 'listed_count', 'favourites_count', 'verified', 'default_profile']]

# Extract label column into new table
train_label = train_data[['Bot_or_Not_Bot']]
test_label = test_data[['Bot_or_Not_Bot']]

**Testing:**

In [0]:
X = train_attr.values
Y = train_label.values

actual = np.array(test_label)
actual

In [0]:
clf_DT = tree.DecisionTreeClassifier()
clf_DT = clf_DT.fit(X, Y)

scores = cross_val_score(clf_DT, X, Y, cv=10)
fscore = scores.mean()

predicted = clf_DT.predict(test_attr)
pred = np.array(predicted)
pred

In [0]:
accuracy = accuracy_score(actual, pred) * 100
precision = precision_score(actual, pred) * 100
recall = recall_score(actual, pred) * 100
f1 = f1_score(actual, pred)

print ('Accuracy is {:.4f}'.format(accuracy))
print('Precision is {:.4f}'.format(precision))
print('Recall is {:.4f}'.format(recall))
print('F1 Score is {:.4f}'.format(f1))
print('10-Fold CV Score is {:.4f}'.format(fscore))