In [85]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import pandas as pd
from xml.dom import minidom
from sklearn.model_selection import train_test_split

In [86]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Both sub-layers are followed by dropout, a residual addition and layer
    normalization, in that order.

    Args:
        embed_dim: Used as the per-head ``key_dim`` of the attention layer and as
            the output width of the feed-forward network. The residual additions
            in ``call`` require the input's last axis to have this size.
        num_heads: Number of attention heads.
        ff_dim: Number of units of the hidden ``Dense`` layer in the feed-forward
            network.
        rate: Dropout rate applied to the attention output and to the
            feed-forward output.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Remember the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the self-attention.
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the layer can also be called without an explicit flag.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Self-attention: the same tensor is passed as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows of the position-embedding table.
        vocab_size: Number of rows of the token-embedding table.
        embed_dim: Output dimension of both embedding tables.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Remember the constructor arguments so get_config() can reproduce the layer.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed ``x`` and add the position embeddings.

        Args:
            x: Tensor of token ids; the size of its last axis is taken as the
                sequence length. Presumably ids are below ``vocab_size`` and the
                sequence length does not exceed ``maxlen`` - verify against caller.

        Returns:
            Token embeddings plus the embeddings of positions
            ``0 .. sequence_length - 1``.
        """
        # Named seq_len (not maxlen) to avoid confusion with the constructor argument.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [93]:
data_directory = "./data/pan21-author-profiling-training-2021-03-14/en"

# Build one row per <document> element: [identifier, text, label].
corpus_list = []
with open(data_directory + "/truth.txt", encoding="utf-8") as truth_file:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in truth_file:
        line = line.rstrip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line) instead of failing on fields[1].
            continue
        # Each line has the form "<user_id>:::<label>".
        fields = line.split(":::")
        user_id = fields[0]
        label = int(fields[1])
        xml_file = minidom.parse(data_directory + "/" + user_id + ".xml")
        document_list = xml_file.getElementsByTagName('document')
        for i, doc in enumerate(document_list):
            # An empty <document/> has no child node; keep it as an empty string
            # rather than raising AttributeError on None.nodeValue.
            text = doc.firstChild.nodeValue if doc.firstChild is not None else ""
            # Every document inherits the label of the user it belongs to.
            corpus_list.append([user_id + "-" + str(i), text, label])

corpus_df = pd.DataFrame(corpus_list, columns=["identifier", "text", "label"])
print(len(corpus_df), "Total tweets")

corpus_df

40000 Total tweets


Unnamed: 0,identifier,text,label
0,639b8e5e6a527d494c85d8f5704b1a01-0,RT #USER#: Funny how “15 days to slow the spre...,0
1,639b8e5e6a527d494c85d8f5704b1a01-1,RT #USER#: Why did Minneapolis just give Georg...,0
2,639b8e5e6a527d494c85d8f5704b1a01-2,"RT #USER#: To be fair, he has done a lot of un...",0
3,639b8e5e6a527d494c85d8f5704b1a01-3,RT #USER#: President Trump got us the #HASHTAG...,0
4,639b8e5e6a527d494c85d8f5704b1a01-4,RT #USER#: Is the case against former officer ...,0
...,...,...,...
39995,41501686277ace6b5fd7dcfe9284fe1d-195,You got to fuck her like you miss her everytim...,1
39996,41501686277ace6b5fd7dcfe9284fe1d-196,i'm solid. been solid. will forever be solid. ...,1
39997,41501686277ace6b5fd7dcfe9284fe1d-197,You nap town bitches really stepped on .... ta...,1
39998,41501686277ace6b5fd7dcfe9284fe1d-198,REPEAT AFTER ME : I don't owe nobody shit !,1


In [88]:
vocab_size = 20000  # passed to the tokenizer as num_words
maxlen = 200  # length every sequence is padded/truncated to
test_split = 0.2  # fraction of the corpus held out as the test set
val_split = 0.1  # passed to model.fit as validation_split

# NOTE(review): rows are split per tweet, but labels are assigned per user when
# the corpus is built, so tweets of one author can end up in both the training
# and the test set. Consider a group-aware split on the user id - confirm intent.
x_train_r, x_test_r, y_train, y_test = train_test_split(
    corpus_df["text"].tolist(),
    corpus_df["label"].to_numpy(),
    test_size=test_split,
    random_state=42,
)

tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")

# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(x_train_r)

# Separate names for the raw sequences so x_train / x_test are only ever bound
# to the padded arrays.
x_train_seq = tokenizer.texts_to_sequences(x_train_r)
x_test_seq = tokenizer.texts_to_sequences(x_test_r)

print(len(x_train_seq), "Training sequences")
# This is the held-out test split; validation data is carved out of the
# training set later via validation_split.
print(len(x_test_seq), "Test sequences")

x_train = keras.preprocessing.sequence.pad_sequences(x_train_seq, maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(x_test_seq, maxlen=maxlen)

32000 Training sequences
8000 Validation sequences


In [89]:
# Model hyperparameters.
embed_dim = 128  # size of each token / position embedding vector
num_heads = 8  # attention heads in the transformer block
ff_dim = 128  # hidden units of the transformer's feed-forward network

# Encoder: token ids -> token + position embeddings -> one transformer block.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: pool over the sequence axis, then a small dense stack
# with dropout before and after the hidden layer.
for head_layer in (
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.1),
):
    x = head_layer(x)

# Single sigmoid unit, matching the binary cross-entropy loss below.
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Model: "model_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        [(None, 200)]             0         
_________________________________________________________________
token_and_position_embedding (None, 200, 128)          2585600   
_________________________________________________________________
transformer_block_13 (Transf (None, 200, 128)          561024    
_________________________________________________________________
global_average_pooling1d_13  (None, 128)  

In [90]:
# Stop once the validation loss has not improved for several epochs, so the run
# ends on its own instead of being interrupted by hand (an interrupted fit()
# never assigns `history`). The best weights seen are restored at the end.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=100,  # upper bound; early stopping may end the run sooner
    validation_split=val_split,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
  3/900 [..............................] - ETA: 55s - loss: 0.6933 - accuracy: 0.3993

KeyboardInterrupt: 