In [1]:
!pip install transformers
!pip install emoji
!pip install wordsegment



In [2]:
import random as python_random
import json
import argparse
import numpy as np

from keras.initializers import Constant
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf

from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from tensorflow.keras.losses import CategoricalCrossentropy

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

import emoji
from wordsegment import load, segment

In [3]:
# Mounting google drive to import files later
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
# Setting random seeds
np.random.seed(1234)
tf.random.set_seed(1234)
python_random.seed(1234)

In [14]:
def create_arg_parser():
    """Function that builds up all the arguments used in this script.
    :return: the parsed arguments
    """
    drive_path = "/content/gdrive/MyDrive/lfd_fp_data/" # File path in drive, can be removed when using as a solo python script.
    parser = argparse.ArgumentParser()

    parser.add_argument('-f') # needed to run in google collab, can be removed outside collab.
    parser.add_argument("-i",
                        "--train_file",
                        default=drive_path + 'train.tsv',
                        type=str,
                        help="Input file to learn from (default train.tsv)"
    )
    parser.add_argument("-d",
                        "--dev_file",
                        type=str,
                        default=drive_path + 'dev.tsv',
                        help="Separate dev set to read in (default dev.tsv)"
    )
    parser.add_argument("-t",
                        "--test_file",
                        type=str,
                        default=drive_path + 'test.tsv',
                        help="If added, use trained model to predict on test set"
    )
    parser.add_argument(
        "-gr",
        "--grid",
        action="store_true",
        help="Use grid_search to determine optimal hyperparameters",
    )
    parser.add_argument(
        "-hy",
        "--hyperparameter",
        action="store_true",
        help="Use the optimal hyperparameters determined prior",
    )
    parser.add_argument(
        "-vec",
        "--vectorizer",
        default="tfidf",
        type=str,
        help="Vectorizer to use (tfidf or count) (default tfidf)",
    )
    parser.add_argument(
        "-ngram",
        "--ngram_range",
        default="1,1",
        type=str,
        help="N-gram range (e.g., '1,1' for unigrams, '1,2' for unigrams and bigrams) (default 1,1)",
    )
    parser.add_argument(
        "-maxdf",
        "--max_df",
        default=1.0,
        type=float,
        help="Maximum document frequency for feature selection (default 1)",
    )
    parser.add_argument(
        "-mindf",
        "--min_df",
        default=0.0,
        type=float,
        help="Minimum document frequency for feature selection (default 1)",
    )
    parser.add_argument(
        "-lem",
        "--lemmatize",
        action="store_true",
        help="Lemmatize input words using NLTK",
    )
    parser.add_argument(
        "-tag",
        "--tagger",
        action="store_true",
        help="Use POS tagging as additional input features",
    )
    # Emoji to textual representation
    parser.add_argument(
        "-dem",
        "--demojize",
        action="store_true",
        help="Demojize the input to rewrite emoji's to their textual representation e.g.,  ❤ -> :heart: ",

    )
    # Emoji to natural language
    parser.add_argument(
        "-demclean",
        "--demojize_clean",
        action="store_true",
        help="Demojize the input to rewrite emoji's to natural language in order to preserve semantic meaning eg., "
             "❤ -> heart",

    )

    parser.add_argument(
        "-seg",
        "--wordsegment",
        action="store_true",
        help="Perform wordsegmentation on hashtags to better detect profanity and other offensive language ",

    )

    args = parser.parse_args()

    return args

In [23]:
def read_corpus(corpus_file):
    """
    Reads the corpus file and gets the documents with labels.
    :param str corpus_file: Path to the corpus file.
    :return: the document
    :return: the labels
    """
    documents = []
    labels = []

    with open(corpus_file, encoding="utf-8") as in_file:
        for line in in_file:
            if args.demojize:
                line = emoji.demojize(line)

            elif args.demojize_clean:
                line = emoji.demojize(line)
                for word in line.split():
                    if word[0] == ":" and word[-1] == ":":
                        line = line.replace(word, " ".join(segment(word)))

            if args.wordsegment:
                for word in line.split():
                    if "#" in word:
                        line = line.replace(word, " ".join(segment(word)))


            documents.append(' '.join(line.split()[:-1]))
            labels.append(line.split()[-1])

    return documents, labels

In [24]:
def create_model(X_train, X_dev, Y_train_bin, Y_dev_bin, X_test, Y_test_bin):
  '''Create and train the LM'''

  # A different LM can be specified here
  lm = "bert-base-uncased"

  tokenizer = AutoTokenizer.from_pretrained(lm)

  model = TFAutoModelForSequenceClassification.from_pretrained(lm, num_labels=2)

  tokens_train = tokenizer(X_train, padding=True, max_length=100, truncation=True, return_tensors="np").data
  tokens_dev = tokenizer(X_dev, padding=True, max_length=100, truncation=True, return_tensors="np").data
  tokens_test = tokenizer(X_test, padding=True, max_length=100, truncation=True, return_tensors="np").data

  loss_function = CategoricalCrossentropy(from_logits=True)
  optim = Adam(learning_rate=5e-5)

  model.compile(loss=loss_function, optimizer=optim, metrics=['accuracy'])
  model.fit(tokens_train, Y_train_bin, verbose=1, epochs=3, batch_size=32, validation_data=(tokens_dev, Y_dev_bin))

  Y_pred = model.predict(tokens_test)["logits"]



  return Y_pred

In [25]:
args = create_arg_parser()

# Read in the data and embeddings
X_train, Y_train = read_corpus(args.train_file)
X_dev, Y_dev = read_corpus(args.dev_file)
X_test, Y_test = read_corpus(args.test_file)

print(X_dev)
print(Y_dev)

# Transform string labels to one-hot encodings
encoder = LabelBinarizer()
Y_train_bin = encoder.fit_transform(Y_train)  # Use encoder.classes_ to find mapping back
Y_dev_bin = encoder.fit_transform(Y_dev)
Y_test_bin = encoder.fit_transform(Y_test)

predictions = create_model(X_train, X_dev, Y_train_bin, Y_dev_bin, X_test, Y_test_bin)



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


ValueError: ignored

In [None]:
# This is where the test-set scores are calculated as well as
# it creates the confusion matrix for the test set

true_labels = []
for i in Y_test_bin:

  true_labels.append(np.where(i==1)[0][0])

true_preds = []
for p in predictions:
  x = np.argmax(p)

  true_preds.append(x)

print("Accuracy score for test set: {}".format(accuracy_score(true_labels, true_preds)))
print("f1 score for test set macro: {}".format(f1_score(true_labels, true_preds, average='macro')))
print("f1 score for test set micro: {}".format(f1_score(true_labels, true_preds, average='micro')))

# Printing confusion matrix
cf_matrix = confusion_matrix(true_labels, true_preds)
index = ["NOT", "OFF"]
columns  = ["NOT", "OFF"]
cm_df = pd.DataFrame(cf_matrix,columns,index)

fig = plt.figure(figsize=(10,6))
sns.heatmap(cm_df, annot=True, fmt='g')

fig.savefig('matrix_bert_run')