In this notebook we will implement a simple neural network to classify texts.

The dataset we will use is Offensive Language Identification (OLID), where short texts in English are labeled for offensiveness. We focus on subtask A: binary classification of offensiveness.

In [None]:
!wget https://sites.google.com/site/offensevalsharedtask/olid/OLIDv1.0.zip
!unzip OLIDv1.0.zip

In [None]:
import csv
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenize_tweet(text):
    """Split a tweet into NLTK word tokens and lowercase each of them."""
    return [token.lower() for token in word_tokenize(text)]


# All files are opened with an explicit UTF-8 encoding so that they are decoded
# the same way on every platform instead of with the locale's default codec;
# newline="" is what the csv module asks for when it is given a file object.

# Training set: the tweet text is in the "tweet" column and the subtask A label
# in the "subtask_a" column of a tab-separated file with a header row.
data_train = []
labels_train = []
with open("olid-training-v1.0.tsv", encoding="utf-8", newline="") as f:
    for row in csv.DictReader(f, delimiter="\t"):
        data_train.append(tokenize_tweet(row["tweet"]))
        labels_train.append(row["subtask_a"])

# Test set: the tweets and their labels are stored in two separate files.
data_test = []
with open("testset-levela.tsv", encoding="utf-8", newline="") as f:
    for row in csv.DictReader(f, delimiter="\t"):
        data_test.append(tokenize_tweet(row["tweet"]))

# The label file is read without a header row; its columns are named explicitly.
labels_test = []
with open("labels-levela.csv", encoding="utf-8", newline="") as f:
    for row in csv.DictReader(f, fieldnames=["id", "label"]):
        labels_test.append(row["label"])

# Tweets and labels are paired purely by position, so the two files must at
# least contain the same number of rows.
assert len(data_test) == len(labels_test), "test tweets and test labels are misaligned"


We use Keras' tokenizer only to compute the vocabulary on the training set. Sentences are truncated at 100 tokens, and padding is added to shorter sentences.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Build the vocabulary on the training set only.
# filters='' switches off Keras' own character filtering: the sentences are
# already lists of lowercased NLTK tokens and every token is kept as it is.
tokenizer = Tokenizer(filters='', lower=True, split=' ')
tokenizer.fit_on_texts(data_train)
word_index = tokenizer.word_index

# Every sentence is cut or padded to this many tokens.
MAX_LEN = 100

# Turn each sentence into the sequence of its word indices.
# texts_to_sequences is required here: texts_to_matrix would return one
# bag-of-words row per sentence (as wide as the vocabulary), and padding that
# to 100 columns would feed the embedding layer 0/1 flags instead of word ids.
X_train = tokenizer.texts_to_sequences(data_train)
X_train = pad_sequences(X_train, maxlen=MAX_LEN, padding='post', truncating='post')

# Encode the string labels as integers.
encoder = LabelEncoder()
encoder.fit(labels_train)
y_train = encoder.transform(labels_train)

# Vectorize the test set with the vocabulary and label mapping learned on the
# training set; words unseen during training are dropped by the tokenizer.
X_test = tokenizer.texts_to_sequences(data_test)
X_test = pad_sequences(X_test, maxlen=MAX_LEN, padding='post', truncating='post')
y_test = encoder.transform(labels_test)


The first layer of the neural network is an embedding layer, which takes the token indices as input and outputs their embeddings. The embeddings are then concatenated by the Flatten layer and passed on to a smaller fully connected hidden layer. The output layer is a single neuron with sigmoid activation for binary classification (offensive/not offensive).

In [None]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten
import tensorflow as tf

# Build the classifier: embedding lookup -> flatten -> hidden layer -> sigmoid output.
model = Sequential()
# One 300-dimensional vector per vocabulary entry; the extra row accounts for
# index 0, which word_index never uses. Each input is a sequence of 100 ids.
model.add(Embedding(len(word_index)+1, 300, input_shape=(100,)))
# Concatenate the 100 token vectors into a single 100 * 300 = 30000-d vector.
model.add(Flatten())
# Hidden layer. Its input size is inferred from the Flatten output, so no
# input_shape is passed: the previous value (X_train.shape[1], i.e. 100) was
# ignored on a non-first layer and did not match the real 30000-d input.
model.add(Dense(50))
model.add(Activation('sigmoid'))
model.add(Dropout(0.1))
# A single sigmoid unit: a score in [0, 1] for the binary decision.
model.add(Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=["accuracy"],
)
model.summary()

# Train for 10 epochs; the last 10% of the training data is held out by Keras
# for validation, and the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=10,
    shuffle=True,
    validation_split=0.1,
    verbose=1,
)

Scikit-learn has useful functions that provide evaluation metrics such as precision, recall and F1-score.

In [None]:
from sklearn.metrics import classification_report

# model.predict returns one sigmoid score per test sentence in an array of
# shape (n_samples, 1). Flatten it and threshold the whole array at once:
# calling int() on each one-element row is an array-to-scalar conversion that
# recent NumPy versions deprecate.
probabilities = model.predict(X_test).ravel()
pred = (probabilities >= 0.5).astype(int).tolist()

# target_names makes the report show the original label strings instead of the
# integer codes produced by the LabelEncoder.
print(classification_report(y_test, pred, target_names=encoder.classes_))

Let's try to initialize the weights of the first layer with pre-trained embeddings from GloVe.

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np

# Convert the raw 300-dimensional GloVe text file to word2vec text format so
# that gensim's word2vec loader can read it.
glove_path = "glove.6B.300d.txt"
word2vec_path = "glove_gensim.6B.300d.txt"
glove2word2vec(glove_path, word2vec_path)

# Load the converted (plain-text, non-binary) file as a KeyedVectors lookup.
embedding_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)

In [None]:
# Row i of the matrix holds the pre-trained vector of the word whose tokenizer
# index is i. The matrix has len(word_index) + 1 rows so that the largest index
# is a valid row; rows that are never assigned stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))

for word, i in word_index.items():
    try:
        embedding_matrix[i] = embedding_model[word]
    except KeyError:
        # The word has no pre-trained vector: keep its all-zero row.
        # Only KeyError is caught, so any other problem (for example a
        # dimension mismatch) is reported instead of being silently ignored.
        continue


The embedding layer can be set to be trainable, or the weights can be kept frozen.

In [None]:
# Same architecture as before, but the embedding layer starts from the
# pre-trained vectors collected in embedding_matrix.
model = Sequential()
# trainable=True lets the pre-trained vectors be updated during training;
# set it to False to keep them frozen.
model.add(Embedding(len(word_index)+1, 300, input_shape=(100,), weights=[embedding_matrix], trainable=True))
# Concatenate the 100 token vectors into a single 100 * 300 = 30000-d vector.
model.add(Flatten())
# Hidden layer. Its input size is inferred from the Flatten output, so no
# input_shape is passed: the previous value (X_train.shape[1], i.e. 100) was
# ignored on a non-first layer and did not match the real 30000-d input.
model.add(Dense(50))
model.add(Activation('sigmoid'))
model.add(Dropout(0.1))
# A single sigmoid unit: a score in [0, 1] for the binary decision.
model.add(Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=["accuracy"],
)
model.summary()

# Train for 10 epochs; the last 10% of the training data is held out by Keras
# for validation, and the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=10,
    shuffle=True,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# model.predict returns one sigmoid score per test sentence in an array of
# shape (n_samples, 1). Flatten it and threshold the whole array at once:
# calling int() on each one-element row is an array-to-scalar conversion that
# recent NumPy versions deprecate.
probabilities = model.predict(X_test).ravel()
pred = (probabilities >= 0.5).astype(int).tolist()

# target_names makes the report show the original label strings instead of the
# integer codes produced by the LabelEncoder.
print(classification_report(y_test, pred, target_names=encoder.classes_))