In [1]:
from transformers import AutoTokenizer

# Define the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Example text
text = "The movie was fantastic!"

# Tokenize the text
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
print(inputs.input_ids)


  from .autonotebook import tqdm as notebook_tqdm


tensor([[ 1996,  3185,  2001, 10392,   999]])


In [2]:
import torch.nn as nn
from transformers import AutoConfig

# Load model configuration
config = AutoConfig.from_pretrained('bert-base-uncased')

# Define the embedding layer
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)

# Generate embeddings for the input tokens
input_embs = token_emb(inputs.input_ids)
print(input_embs.shape)




torch.Size([1, 5, 768])


In [3]:
import torch
import torch.nn.functional as F
from math import sqrt

def scaled_dot_product_attention(query, key, value):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

query = key = value = input_embs
weighted_value = scaled_dot_product_attention(query, key, value)
print(weighted_value.shape)


torch.Size([1, 5, 768])


In [4]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(input_embs)
print(attn_output.size())


torch.Size([1, 5, 768])


In [5]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(input_embs)
print(attn_output.size())


torch.Size([1, 5, 768])


In [6]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_output)
print(ff_outputs.size())


torch.Size([1, 5, 768])


In [7]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        hidden_state = self.layer_norm_1(x)
        x = x + self.attention(hidden_state)
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x

encoder_layer = TransformerEncoderLayer(config)
print(encoder_layer(input_embs).size())


torch.Size([1, 5, 768])


In [8]:
class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

embedding_layer = Embeddings(config)
print(embedding_layer(inputs.input_ids).size())


torch.Size([1, 5, 768])


In [9]:
class TransformerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerEncoderLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x

encoder = TransformerEncoder(config)
print(encoder(inputs.input_ids).size())


torch.Size([1, 5, 768])


In [10]:
class TransformerForSequenceClassification(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        x = self.encoder(x)[:, 0, :]
        x = self.dropout(x)
        x = self.classifier(x)
        return x

config.num_labels = 2
encoder_classifier = TransformerForSequenceClassification(config)
print(encoder_classifier(inputs.input_ids).size())


torch.Size([1, 2])


In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from transformers import BertTokenizer, BertConfig

# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example text
text = "The movie was fantastic!"

# Tokenize the text
inputs = tokenizer(text, return_tensors="tf", add_special_tokens=False)
print(inputs['input_ids'])

# Load model configuration
config = BertConfig.from_pretrained('bert-base-uncased')

# Define the embedding layer
class Embeddings(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = layers.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = layers.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layer_norm = layers.LayerNormalization(epsilon=1e-12)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)

    def call(self, input_ids):
        seq_length = tf.shape(input_ids)[1]
        position_ids = tf.range(seq_length)[tf.newaxis, :]
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

# Scaled Dot-Product Attention
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = tf.cast(tf.shape(key)[-1], tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(dim_k)
    if mask is not None:
        scores += (mask * -1e9)
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

# Attention Head
class AttentionHead(layers.Layer):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = layers.Dense(head_dim)
        self.k = layers.Dense(head_dim)
        self.v = layers.Dense(head_dim)

    def call(self, hidden_state, mask=None):
        query = self.q(hidden_state)
        key = self.k(hidden_state)
        value = self.v(hidden_state)
        attn_output = scaled_dot_product_attention(query, key, value, mask)
        return attn_output

# Multi-Head Attention
class MultiHeadAttention(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.attention_heads = [AttentionHead(config.hidden_size, self.head_dim) for _ in range(self.num_heads)]
        self.output_linear = layers.Dense(config.hidden_size)

    def call(self, hidden_state, mask=None):
        attn_outputs = [head(hidden_state, mask) for head in self.attention_heads]
        concat_attn = tf.concat(attn_outputs, axis=-1)
        output = self.output_linear(concat_attn)
        return output

# Feed Forward Layer
class FeedForward(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = layers.Dense(config.intermediate_size, activation='gelu')
        self.linear_2 = layers.Dense(config.hidden_size)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)

    def call(self, x):
        x = self.linear_1(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

# Transformer Encoder Layer
class TransformerEncoderLayer(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = layers.LayerNormalization(epsilon=1e-12)
        self.layer_norm_2 = layers.LayerNormalization(epsilon=1e-12)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def call(self, x, mask=None):
        hidden_state = self.layer_norm_1(x)
        attention_output = self.attention(hidden_state, mask)
        x = x + attention_output
        feed_forward_output = self.feed_forward(self.layer_norm_2(x))
        x = x + feed_forward_output
        return x

# Full Transformer Encoder
class TransformerEncoder(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.encoder_layers = [TransformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]

    def call(self, input_ids, mask=None):
        x = self.embeddings(input_ids)
        for layer in self.encoder_layers:
            x = layer(x, mask)
        return x

# Sequence Classification Model
class TransformerForSequenceClassification(tf.keras.Model):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)
        self.classifier = layers.Dense(config.num_labels)

    def call(self, input_ids, mask=None):
        x = self.encoder(input_ids, mask)
        x = self.dropout(x[:, 0, :])
        x = self.classifier(x)
        return x




tf.Tensor([[ 1996  3185  2001 10392   999]], shape=(1, 5), dtype=int32)


ValueError: Exception encountered when calling TransformerForSequenceClassification.call().

[1mOnly input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: None (of type <class 'NoneType'>)[0m

Arguments received by TransformerForSequenceClassification.call():
  • input_ids=tf.Tensor(shape=(1, 5), dtype=int32)
  • mask=None

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, optimizers, losses
from transformers import BertTokenizer, BertConfig
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the IMDB dataset
imdb = tf.keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=20000)

# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and pad sequences
def tokenize_and_pad(texts, tokenizer, max_len=128):
    tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=max_len) for text in texts]
    padded_texts = pad_sequences(tokenized_texts, maxlen=max_len, padding='post', truncating='post')
    return padded_texts

# Convert IMDB dataset to text
word_index = imdb.get_word_index()
index_word = {index + 3: word for word, index in word_index.items()}
index_word[0] = '[PAD]'
index_word[1] = '[START]'
index_word[2] = '[UNK]'
index_word[3] = '[UNUSED]'

def decode_review(encoded_review):
    return ' '.join([index_word.get(i, '?') for i in encoded_review])

x_train_texts = [decode_review(x) for x in x_train]
x_test_texts = [decode_review(x) for x in x_test]

# Tokenize and pad sequences
max_len = 512
x_train_padded = tokenize_and_pad(x_train_texts, tokenizer, max_len)
x_test_padded = tokenize_and_pad(x_test_texts, tokenizer, max_len)

# Load model configuration
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = 2




In [16]:
# Define the embedding layer
class Embeddings(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = layers.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = layers.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layer_norm = layers.LayerNormalization(epsilon=1e-12)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)

    def call(self, input_ids):
        seq_length = tf.shape(input_ids)[1]
        position_ids = tf.range(seq_length)[tf.newaxis, :]
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [17]:
# Scaled Dot-Product Attention
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = tf.cast(tf.shape(key)[-1], tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(dim_k)
    if mask is not None:
        scores += (mask * -1e9)
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

# Attention Head
class AttentionHead(layers.Layer):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = layers.Dense(head_dim)
        self.k = layers.Dense(head_dim)
        self.v = layers.Dense(head_dim)

    def call(self, hidden_state, mask=None):
        query = self.q(hidden_state)
        key = self.k(hidden_state)
        value = self.v(hidden_state)
        attn_output = scaled_dot_product_attention(query, key, value, mask)
        return attn_output

# Multi-Head Attention
class MultiHeadAttention(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.attention_heads = [AttentionHead(config.hidden_size, self.head_dim) for _ in range(self.num_heads)]
        self.output_linear = layers.Dense(config.hidden_size)

    def call(self, hidden_state, mask=None):
        attn_outputs = [head(hidden_state, mask) for head in self.attention_heads]
        concat_attn = tf.concat(attn_outputs, axis=-1)
        output = self.output_linear(concat_attn)
        return output

# Feed Forward Layer
class FeedForward(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = layers.Dense(config.intermediate_size, activation='gelu')
        self.linear_2 = layers.Dense(config.hidden_size)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)

    def call(self, x):
        x = self.linear_1(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

# Transformer Encoder Layer
class TransformerEncoderLayer(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = layers.LayerNormalization(epsilon=1e-12)
        self.layer_norm_2 = layers.LayerNormalization(epsilon=1e-12)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def call(self, x, mask=None):
        hidden_state = self.layer_norm_1(x)
        attention_output = self.attention(hidden_state, mask)
        x = x + attention_output
        feed_forward_output = self.feed_forward(self.layer_norm_2(x))
        x = x + feed_forward_output
        return x

# Full Transformer Encoder
class TransformerEncoder(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.encoder_layers = [TransformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]

    def call(self, input_ids, mask=None):
        x = self.embeddings(input_ids)
        for layer in self.encoder_layers:
            x = layer(x, mask)
        return x


In [18]:
# Sequence Classification Model
class TransformerForSequenceClassification(tf.keras.Model):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)
        self.classifier = layers.Dense(config.num_labels)

    def call(self, input_ids, mask=None):
        x = self.encoder(input_ids, mask)
        x = self.dropout(x[:, 0, :])
        x = self.classifier(x)
        return x


In [30]:
# Instantiate the model
model = TransformerForSequenceClassification(config)

# Compile the model
optimizer = optimizers.Adam(learning_rate=3e-5)
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [27]:
x_train_padded

array([[ 101, 1031, 2707, ...,    0,    0,    0],
       [ 101, 1031, 2707, ...,    0,    0,    0],
       [ 101, 1031, 2707, ...,    0,    0,    0],
       ...,
       [ 101, 1031, 2707, ...,    0,    0,    0],
       [ 101, 1031, 2707, ...,    0,    0,    0],
       [ 101, 1031, 2707, ...,    0,    0,    0]], dtype=int32)

In [31]:
history = model.fit(train_data, y_train, batch_size=32, epochs=3)

Epoch 1/3


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Only input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: None (of type <class 'NoneType'>)''


ValueError: Exception encountered when calling TransformerForSequenceClassification.call().

[1mOnly input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: None (of type <class 'NoneType'>)[0m

Arguments received by TransformerForSequenceClassification.call():
  • input_ids={'input_ids': 'tf.Tensor(shape=(None, 512), dtype=int32)'}
  • mask={'input_ids': 'None'}

In [None]:
# Evaluate the model
y_pred_probs = model.predict(x_test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

In [34]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, optimizers, losses
from transformers import BertTokenizer, BertConfig
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the IMDB dataset
imdb = tf.keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=20000)

# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and pad sequences
def tokenize_and_pad(texts, tokenizer, max_len=128):
    tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=max_len) for text in texts]
    padded_texts = pad_sequences(tokenized_texts, maxlen=max_len, padding='post', truncating='post')
    return padded_texts

# Convert IMDB dataset to text
word_index = imdb.get_word_index()
index_word = {index + 3: word for word, index in word_index.items()}
index_word[0] = '[PAD]'
index_word[1] = '[START]'
index_word[2] = '[UNK]'
index_word[3] = '[UNUSED]'

def decode_review(encoded_review):
    return ' '.join([index_word.get(i, '?') for i in encoded_review])

x_train_texts = [decode_review(x) for x in x_train]
x_test_texts = [decode_review(x) for x in x_test]

# Tokenize and pad sequences
max_len = 512
x_train_padded = tokenize_and_pad(x_train_texts, tokenizer, max_len)
x_test_padded = tokenize_and_pad(x_test_texts, tokenizer, max_len)

# Load model configuration
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = 2

# Define the embedding layer
class Embeddings(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = layers.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = layers.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layer_norm = layers.LayerNormalization(epsilon=1e-12)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, input_ids):
        seq_length = tf.shape(input_ids)[1]
        position_ids = tf.range(seq_length)[tf.newaxis, :]
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

# Scaled Dot-Product Attention
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = tf.cast(tf.shape(key)[-1], tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(dim_k)
    if mask is not None:
        scores += (mask * -1e9)
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, value)

# Attention Head
class AttentionHead(layers.Layer):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = layers.Dense(head_dim)
        self.k = layers.Dense(head_dim)
        self.v = layers.Dense(head_dim)

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, hidden_state, mask=None):
        query = self.q(hidden_state)
        key = self.k(hidden_state)
        value = self.v(hidden_state)
        attn_output = scaled_dot_product_attention(query, key, value, mask)
        return attn_output

# Multi-Head Attention
class MultiHeadAttention(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.attention_heads = [AttentionHead(config.hidden_size, self.head_dim) for _ in range(self.num_heads)]
        self.output_linear = layers.Dense(config.hidden_size)

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, hidden_state, mask=None):
        attn_outputs = [head(hidden_state, mask) for head in self.attention_heads]
        concat_attn = tf.concat(attn_outputs, axis=-1)
        output = self.output_linear(concat_attn)
        return output

# Feed Forward Layer
class FeedForward(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = layers.Dense(config.intermediate_size, activation='gelu')
        self.linear_2 = layers.Dense(config.hidden_size)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, x):
        x = self.linear_1(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

# Transformer Encoder Layer
class TransformerEncoderLayer(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = layers.LayerNormalization(epsilon=1e-12)
        self.layer_norm_2 = layers.LayerNormalization(epsilon=1e-12)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, x, mask=None):
        hidden_state = self.layer_norm_1(x)
        attention_output = self.attention(hidden_state, mask)
        x = x + attention_output
        feed_forward_output = self.feed_forward(self.layer_norm_2(x))
        x = x + feed_forward_output
        return x

# Full Transformer Encoder
class TransformerEncoder(layers.Layer):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.encoder_layers = [TransformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, input_ids, mask=None):
        x = self.embeddings(input_ids)
        for layer in self.encoder_layers:
            x = layer(x, mask)
        return x

# Sequence Classification Model
class TransformerForSequenceClassification(tf.keras.Model):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.dropout = layers.Dropout(config.hidden_dropout_prob)
        self.classifier = layers.Dense(config.num_labels)

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, inputs):
        input_ids, mask = inputs['input_ids'], inputs.get('mask', None)
        x = self.encoder(input_ids=input_ids, mask=mask)
        x = self.dropout(x[:, 0, :])
        x = self.classifier(x)
        return x

# Prepare the data for the model
train_data = {'input_ids': tf.convert_to_tensor(x_train_padded)}
test_data = {'input_ids': tf.convert_to_tensor(x_test_padded)}

# Instantiate the model
model = TransformerForSequenceClassification(config)

# Compile the model
optimizer = optimizers.Adam(learning_rate=3e-5)
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model
history = model.fit(train_data, y_train, batch_size=32, epochs=3, validation_data=(test_data, y_test))

# Evaluate the model
y_pred_probs = model.predict(test_data)
y_pred = np.argmax(y_pred_probs, axis=1)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Epoch 1/3


ValueError: Exception encountered when calling TransformerEncoder.call().

[1mOnly input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: None (of type <class 'NoneType'>)[0m

Arguments received by TransformerEncoder.call():
  • input_ids=tf.Tensor(shape=(None, 512), dtype=int32)
  • mask=None