In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Layer, Dropout, Reshape, GlobalAveragePooling1D, GlobalMaxPooling1D, Add
from tensorflow.keras.models import Model
from tensorflow import keras
from transformers import TFBertModel, AutoTokenizer
import tensorflow.keras.backend as K

# Document processing imports
from docx import Document
from PyPDF2 import PdfReader
import textract
import json
from pprint import pprint

# System and utility imports
import os
import subprocess
import chardet
import time

2024-08-03 19:35:59.869618: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# File Paths
tfrecord_file = '/home/vignes/Patent_Files/tfrecords/Dataset_json.tfrecord'
patent_folder = '/home/vignes/Patent_Files/zip-Patents/'
standard_folder = '/home/vignes/Patent_Files/Mount_Std/'

In [3]:
def extract_patent_text(patent_file):
    patent_text = ""
    json_file = patent_file + ".json"
    if os.path.isfile(json_file):
        try:
            with open(json_file, 'r', encoding='utf-8') as file:
                data = json.load(file)
                # print(f"Type of data in {json_file}: {type(data)}")
                if isinstance(data, dict):
                    if 'text' in data:
                        patent_text = data['text']
                    else:
                        print(f"Available keys in {json_file}: {data.keys()}")
                elif isinstance(data, list):
                    print(f"Data is a list with {len(data)} items")
                    if len(data) > 0:
                        print(f"Type of first item: {type(data[0])}")
                elif isinstance(data, str):
                    print(f"Data is a string of length {len(data)}")
                    patent_text = data
                else:
                    print(f"Unexpected data type in {json_file}: {type(data)}")
        except json.JSONDecodeError as e:
            print(f"JSON decoding error in {json_file}: {e}")
        except Exception as e:
            print(f"Error reading {json_file}: {e}")
    else:
        print(f"File not found: {json_file}")
    
    return patent_text

def extract_docx_text(docx_file):
    try:
        doc = Document(docx_file)
        return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
    except Exception as e:
        print(f"Exception during DOCX text extraction: {e}")
    return ""

def extract_doc_text(doc_file):
    try:
        command = f"antiword '{doc_file}'"
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        if result.returncode == 0:
            return result.stdout
        else:
            raise Exception(f"antiword failed: {result.stderr}")
    except Exception as e:
        print(f"Exception during DOC text extraction: {e}")
    return ""

def extract_text(file_path):
    if file_path.endswith('.docx'):
        return extract_docx_text(file_path)
    elif file_path.endswith('.doc'):
        return extract_doc_text(file_path)
    else:
        return ""

def ensure_directory_path(directory_path):
    if not directory_path.endswith('/'):
        directory_path += '/'
    return directory_path

def extract_standard_text(standard_dir):
    standard_text = ""
    standard_dir = ensure_directory_path(os.path.splitext(standard_dir)[0])  # Remove the .zip extension
    if os.path.exists(standard_dir):
        for root, _, files in os.walk(standard_dir):
            for file in files:
                file_path = os.path.join(root, file)
                text = extract_text(file_path)
                standard_text += text + "\n"
    else:
        print(f"Standard directory {standard_dir} does not exist.")
    return standard_text

In [4]:
bert_model_name = 'bert-base-uncased'
bert_model = TFBertModel.from_pretrained(bert_model_name)
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

2024-08-03 19:36:04.628182: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-03 19:36:04.629233: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-03 19:36:04.631191: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [5]:
def parse_tfrecord(example):
    feature_description = {
        'patent': tf.io.FixedLenFeature([], tf.string),
        'standard': tf.io.FixedLenFeature([], tf.string),
        'match': tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(example, feature_description)
    return example['patent'], example['standard'], example['match']

# Load the TFRecord dataset
dataset = tf.data.TFRecordDataset(tfrecord_file)
parsed_dataset = dataset.map(parse_tfrecord)

In [6]:

num_samples = 0
for _ in parsed_dataset:
    num_samples += 1
print(f"Total number of samples in the dataset: {num_samples}")

# Define ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Calculate sizes
train_size = int(train_ratio * num_samples)
val_size = int(val_ratio * num_samples)
test_size = num_samples - train_size - val_size  # Ensure we use all remaining samples

print(f"Number of samples in the training set: {train_size}")
print(f"Number of samples in the validation set: {val_size}")
print(f"Number of samples in the test set: {test_size}")

# Split the dataset into training and validation sets
train_dataset = parsed_dataset.take(train_size)
val_dataset = parsed_dataset.take(val_size)
test_dataset = parsed_dataset.take(test_size)


Total number of samples in the dataset: 130476
Number of samples in the training set: 91333
Number of samples in the validation set: 19571
Number of samples in the test set: 19572


2024-08-03 19:36:24.582992: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
def chunk_text(text, max_length):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_length):
        chunk = words[i:i + max_length]
        chunk = chunk + [''] * (max_length - len(chunk))  # Pad with empty strings
        chunks.append(' '.join(chunk))
    return chunks 

In [8]:
max_length = 100
def data_generator(dataset, tokenizer, max_length=100, max_chunks=10):
    for patent, standard, match in dataset:
        patent_text = extract_patent_text(os.path.join(patent_folder, patent.numpy().decode()))
        standard_text = extract_standard_text(os.path.join(standard_folder, standard.numpy().decode()))

        if not patent_text.strip() or not standard_text.strip():
            print(f"\nSkipping sample: {patent.numpy().decode()},Standard={standard.numpy().decode()}die to empty text.")
            continue 
        
        patent_chunks = chunk_text(patent_text, max_length)
        standard_chunks = chunk_text(standard_text, max_length)

        if not patent_chunks or not standard_chunks:
            print(f"\nSkipping sample: Patent={patent.numpy().decode()}, Standard={standard.numpy().decode()} due to empty chunks.")
            continue
        
        patent_inputs = tokenizer(patent_chunks, padding=True, truncation=True, return_tensors='tf', max_length=max_length)
        standard_inputs = tokenizer(standard_chunks, padding=True, truncation=True, return_tensors='tf', max_length=max_length)
        
        patent_input_ids = patent_inputs['input_ids']
        patent_attention_mask = patent_inputs['attention_mask']
        standard_input_ids = standard_inputs['input_ids']
        standard_attention_mask = standard_inputs['attention_mask']
        
        patent_input_ids = patent_input_ids[:max_chunks]
        patent_attention_mask = patent_attention_mask[:max_chunks]
        standard_input_ids = standard_input_ids[:max_chunks]
        standard_attention_mask = standard_attention_mask[:max_chunks]
        
        patent_input_ids = tf.pad(patent_input_ids, [[0, max_chunks - tf.shape(patent_input_ids)[0]], [0, 0]], constant_values=0)
        patent_attention_mask = tf.pad(patent_attention_mask, [[0, max_chunks - tf.shape(patent_attention_mask)[0]], [0, 0]], constant_values=0)
        standard_input_ids = tf.pad(standard_input_ids, [[0, max_chunks - tf.shape(standard_input_ids)[0]], [0, 0]], constant_values=0)
        standard_attention_mask = tf.pad(standard_attention_mask, [[0, max_chunks - tf.shape(standard_attention_mask)[0]], [0, 0]], constant_values=0)
        
        # print("Patent input IDs shape:", patent_input_ids.shape)
        # print("Patent attention mask shape:", patent_attention_mask.shape)
        # print("Standard input IDs shape:", standard_input_ids.shape)
        # print("Standard attention mask shape:", standard_attention_mask.shape)
        # print("Match value:", match.numpy())
        # print("---")
        
        yield (
            {
                'patent_input_ids': tf.reshape(patent_input_ids, [1, max_chunks * max_length]),
                'patent_attention_mask': tf.reshape(patent_attention_mask, [1, max_chunks * max_length]),
                'standard_input_ids': tf.reshape(standard_input_ids, [1, max_chunks * max_length]),
                'standard_attention_mask': tf.reshape(standard_attention_mask, [1, max_chunks * max_length])
            },
            tf.reshape(tf.constant(match, dtype=tf.int64), [1])
        )

max_chunks = 50# Define the max_chunks variable
batch_size= 2
train_tf_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(train_dataset, tokenizer, max_length=100, max_chunks=max_chunks),
    output_signature=(
        {
            'patent_input_ids': tf.TensorSpec(shape=(1, max_chunks*max_length), dtype=tf.int32),
            'patent_attention_mask': tf.TensorSpec(shape=(1,max_chunks*max_length), dtype=tf.int32),
            'standard_input_ids': tf.TensorSpec(shape=(1,max_chunks*max_length), dtype=tf.int32),
            'standard_attention_mask': tf.TensorSpec(shape=(1,max_chunks* max_length), dtype=tf.int32),
        },
        tf.TensorSpec(shape=(1,), dtype=tf.int32)
    )
).batch(batch_size)

val_tf_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(val_dataset, tokenizer, max_length=100, max_chunks=max_chunks),
    output_signature=(
        {
            'patent_input_ids': tf.TensorSpec(shape=(1,max_chunks* max_length), dtype=tf.int32),
            'patent_attention_mask': tf.TensorSpec(shape=(1,max_chunks* max_length), dtype=tf.int32),
            'standard_input_ids': tf.TensorSpec(shape=(1,max_chunks* max_length), dtype=tf.int32),
            'standard_attention_mask': tf.TensorSpec(shape=(1,max_chunks* max_length), dtype=tf.int32),
        },
        tf.TensorSpec(shape=(1,), dtype=tf.int32)
    )
).batch(batch_size)

test_tf_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_dataset, tokenizer, max_length = 100, max_chunks= max_chunks),
    output_signature = (
        {
            'patent_input_ids' : tf.TensorSpec(shape=(1, max_chunks * max_length), dtype = tf.int32),
            'patent_attention_mask' : tf.TensorSpec(shape =(1, max_chunks * max_length), dtype = tf.int32), 
            'standard_input_ids' : tf.TensorSpec(shape=(1,max_chunks*max_length), dtype = tf.int32),
            'standard_attention_mask': tf.TensorSpec(shape = (1, max_chunks * max_length), dtype= tf.int32),
        },
        tf.TensorSpec(shape=(1,),dtype= tf.int32)
    )
).batch(batch_size)

In [9]:
class BertModelLayer(keras.layers.Layer):
    def __init__(self, bert_model, **kwargs):
        super(BertModelLayer, self).__init__(**kwargs)
        self.bert_model = bert_model

    def call(self, inputs):
        input_ids, attention_mask = inputs
        input_ids = tf.cast(input_ids, tf.int32)  # Cast input_ids to int32
        attention_mask = tf.cast(attention_mask, tf.int32)  # Cast attention_mask to int32
        outputs = self.bert_model(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state

    def get_config(self):
        base_config = super(BertModelLayer, self).get_config()
        config = {
            "bert_model": tf.keras.legacy.saving.serialize_keras_object(self.bert_model),
        }
        return {**base_config, **config}

    @classmethod
    def from_config(cls, config):
        bert_model_config = config.pop("bert_model")
        bert_model = keras.saving.deserialize_keras_object(bert_model_config)
        return cls(bert_model=bert_model, **config)

class ReshapeLayer(Layer):
    def __init__(self, target_shape, **kwargs):
        super(ReshapeLayer, self).__init__(**kwargs)
        self.target_shape = target_shape

    def call(self, inputs):
        return tf.reshape(inputs, shape=self.target_shape)

class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        at = K.softmax(et)
        at = K.expand_dims(at, axis=-1)
        output = x * at
        return K.sum(output, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

class CrossAttention(Layer):
    def __init__(self, **kwargs):
        super(CrossAttention, self).__init__(**kwargs)

    def call(self, inputs):
        query, key, value = inputs
        attention_scores = tf.matmul(query, key, transpose_b=True)
        attention_scores = tf.nn.softmax(attention_scores, axis=-1)
        return tf.matmul(attention_scores, value)

# def cross_attention(query, key, value):
#     attention_scores = tf.matmul(query, key, transpose_b=True)
#     attention_scores = tf.nn.softmax(attention_scores, axis=-1)
#     return tf.matmul(attention_scores, value)

class ResidualBlock(Layer):
    def __init__(self, units, **kwargs):
        super(ResidualBlock, self).__init__(**kwargs)
        self.units = units
        self.dense1 = Dense(units, activation='relu')
        self.dense2 = Dense(units, activation='linear')

    def build(self, input_shape):
        self.dense2 = Dense(input_shape[-1], activation='linear')
        super(ResidualBlock, self).build(input_shape)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return Add()([inputs, x])

    def get_config(self):
        config = super(ResidualBlock, self).get_config()
        config.update({"units": self.units})
        return config
        
def create_model(max_length, max_chunks):
    # Input layers
    patent_input_ids = Input(shape=(1, max_chunks*max_length), dtype=tf.int32, name='patent_input_ids')
    patent_attention_mask = Input(shape=(1, max_chunks*max_length), dtype=tf.int32, name='patent_attention_mask')
    standard_input_ids = Input(shape=(1, max_chunks*max_length), dtype=tf.int32, name='standard_input_ids')
    standard_attention_mask = Input(shape=(1, max_chunks*max_length), dtype=tf.int32, name='standard_attention_mask')

    # Reshape layers
    reshape_layer = ReshapeLayer(target_shape=(-1, max_chunks*max_length))
    patent_input_ids_reshaped = reshape_layer(patent_input_ids)
    patent_attention_mask_reshaped = reshape_layer(patent_attention_mask)
    standard_input_ids_reshaped = reshape_layer(standard_input_ids)
    standard_attention_mask_reshaped = reshape_layer(standard_attention_mask)

    # BERT layers
    patent_bert = BertModelLayer(TFBertModel.from_pretrained('bert-base-uncased'), trainable=True)
    standard_bert = BertModelLayer(TFBertModel.from_pretrained('bert-base-uncased'), trainable=True)

    # BERT embeddings
    patent_embeddings = patent_bert([patent_input_ids_reshaped, patent_attention_mask_reshaped])
    standard_embeddings = standard_bert([standard_input_ids_reshaped, standard_attention_mask_reshaped])

    # Cross-attention
    cross_attention_layer = CrossAttention()
    patent_cross = cross_attention_layer([patent_embeddings, standard_embeddings, standard_embeddings])
    standard_cross = cross_attention_layer([standard_embeddings, patent_embeddings, patent_embeddings])

    patent_combined = Concatenate()([patent_embeddings, patent_cross])
    standard_combined = Concatenate()([standard_embeddings, standard_cross])

    # Pooling
    patent_max_pooled = GlobalMaxPooling1D()(patent_combined)
    patent_avg_pooled = GlobalAveragePooling1D()(patent_combined)
    patent_attention = AttentionLayer()(patent_combined)
    patent_pooled = Concatenate()([patent_max_pooled, patent_avg_pooled, patent_attention])

    standard_max_pooled = GlobalMaxPooling1D()(standard_combined)
    standard_avg_pooled = GlobalAveragePooling1D()(standard_combined)
    standard_attention = AttentionLayer()(standard_combined)
    standard_pooled = Concatenate()([standard_max_pooled, standard_avg_pooled, standard_attention])

    # Concatenate pooled outputs
    concatenated = Concatenate()([patent_pooled, standard_pooled])

    # Dense layers with residual connections
    x = Dense(1024, activation='relu')(concatenated)
    x = ResidualBlock(512)(x)
    x = Dropout(0.5)(x)
    x = ResidualBlock(256)(x)
    x = Dropout(0.5)(x)
    x = ResidualBlock(128)(x)
    x = Dropout(0.5)(x)

    # Final dense layers
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)

    # Model
    model = Model(inputs=[patent_input_ids, patent_attention_mask, standard_input_ids, standard_attention_mask], outputs=output)

    return model

# Create and compile the model
model = create_model(max_length, max_chunks)

initial_learning_rate = 0.001  # Assuming this was your initial rate
new_learning_rate = initial_learning_rate * 2  # Double it

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    new_learning_rate,
    decay_steps=10000,
    decay_rate=0.96,
    staircase=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [10]:
num_epochs=2

# Create a directory for saving models if it doesn't exist
model_dir = 'saved_models_BERTcomplex'
os.makedirs(model_dir, exist_ok=True)

# Define a custom callback for periodic saving
class PeriodicSaver(tf.keras.callbacks.Callback):
    def __init__(self, save_freq=300, save_path='saved_models_BERTcomplex'):
        super(PeriodicSaver, self).__init__()
        self.save_freq = save_freq  # Save frequency in seconds
        self.save_path = save_path
        self.last_save_time = time.time()

    def on_batch_end(self, batch, logs=None):
        if time.time() - self.last_save_time > self.save_freq:
            print("\nSaving periodic model checkpoint...")
            self.model.save(os.path.join(self.save_path, f'periodic_model_{int(time.time())}.keras'))
            self.last_save_time = time.time()

# Define the callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
    tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(model_dir, 'model.{epoch:02d}--{val_loss:.2f}.keras'), 
        monitor='val_accuracy', 
        save_best_only=True
    ),
    PeriodicSaver(save_freq=300, save_path=model_dir)  # Save every 5 minutes
]

final_model_path = None # Initialize the variable

try:
    # Start training
    model.fit(train_tf_dataset,
              epochs=num_epochs,
              callbacks=callbacks,
              validation_data=val_tf_dataset)

    # If training completes successfully, save the final model
    final_model_path = os.path.join(model_dir, 'final_model.keras')
    model.save(final_model_path)
    print("Training completed. Final model saved.")

except Exception as e:
    print(f"An error occurred during training: {e}")
    # The last periodic save will be the most recent model state
    periodic_saves = [f for f in os.listdir(model_dir) if f.startswith('periodic_model_')]
    if periodic_saves:
        latest_save = max(periodic_saves, key=lambda x: int(x.split('_')[2].split('.')[0]))
        final_model_path = os.path.join(model_dir, latest_save)
        print(f"Using the most recent periodic save: {final_model_path}")
    else:
        print("No periodic saves found. Unable to evaluate the model.")

finally:
    # This block will be executed whether there's an exception or not
    print("Training process finished.") 

Epoch 1/2
Data is a string of length 74466
Data is a string of length 70203
Data is a string of length 66217


I0000 00:00:1722728231.043387 2293736 service.cc:145] XLA service 0x2bf85930 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722728231.043438 2293736 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce GTX 1080 Ti, Compute Capability 6.1
I0000 00:00:1722728231.043443 2293736 service.cc:153]   StreamExecutor device (1): NVIDIA GeForce GTX 1080 Ti, Compute Capability 6.1


Data is a string of length 41435


2024-08-03 19:37:12.291046: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
W0000 00:00:1722728232.847978 2293736 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_model_layer_1_1/tf_bert_model_2/bert/embeddings/assert_less/Assert/Assert
W0000 00:00:1722728232.852398 2293736 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_model_layer_2/tf_bert_model_1/bert/embeddings/assert_less/Assert/Assert
2024-08-03 19:37:17.177609: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8902
I0000 00:00:1722728252.396650 2293736 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


      1/Unknown [1m47s[0m 47s/step - accuracy: 0.0000e+00 - loss: 5.7392Data is a string of length 79689
      2/Unknown [1m49s[0m 2s/step - accuracy: 0.0000e+00 - loss: 7.5344 Data is a string of length 116738
Data is a string of length 76258
      3/Unknown [1m54s[0m 3s/step - accuracy: 0.0556 - loss: 9.0320    Data is a string of length 92048
Data is a string of length 99918
      4/Unknown [1m57s[0m 3s/step - accuracy: 0.1354 - loss: 9.0291Data is a string of length 115843
Data is a string of length 80657
Data is a string of length 119004
      5/Unknown [1m60s[0m 3s/step - accuracy: 0.1883 - loss: 28.4280Data is a string of length 232200
      6/Unknown [1m64s[0m 3s/step - accuracy: 0.2403 - loss: 38.4155Data is a string of length 51929
Data is a string of length 103652
Data is a string of length 49709
      7/Unknown [1m66s[0m 3s/step - accuracy: 0.2774 - loss: 46.8673Data is a string of length 58826
      8/Unknown [1m69s[0m 3s/step - accuracy: 0.3052 - loss: 55.

KeyboardInterrupt: 

Data is a string of length 79145
Data is a string of length 71634
Data is a string of length 113746
Data is a string of length 85087
Data is a string of length 68539
Data is a string of length 48134
Data is a string of length 31306


In [11]:
predictions = model.predict(test_tf_dataset)

Data is a string of length 74466
Data is a string of length 70203
Data is a string of length 66217
Data is a string of length 41435


W0000 00:00:1722747076.526418 2293736 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_model_layer_1_1/tf_bert_model_2/bert/embeddings/assert_less/Assert/Assert
W0000 00:00:1722747076.529821 2293736 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_model_layer_2/tf_bert_model_1/bert/embeddings/assert_less/Assert/Assert


      1/Unknown [1m10s[0m 10s/stepData is a string of length 79689
Data is a string of length 116738
      2/Unknown [1m12s[0m 2s/step Data is a string of length 76258
      3/Unknown [1m14s[0m 2s/stepData is a string of length 92048
Data is a string of length 99918
      4/Unknown [1m17s[0m 3s/stepData is a string of length 115843
Data is a string of length 80657
      5/Unknown [1m21s[0m 3s/stepData is a string of length 119004
Data is a string of length 232200
      6/Unknown [1m24s[0m 3s/stepData is a string of length 51929
Data is a string of length 103652
      7/Unknown [1m26s[0m 3s/stepData is a string of length 49709
Data is a string of length 58826
      8/Unknown [1m29s[0m 3s/stepData is a string of length 105344
Data is a string of length 76603
      9/Unknown [1m31s[0m 3s/stepData is a string of length 92526
Data is a string of length 100970
     10/Unknown [1m36s[0m 3s/stepData is a string of length 135458
Data is a string of length 64801
     11/Unkno

2024-08-04 10:22:12.800518: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
W0000 00:00:1722781338.875325 2293734 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_model_layer_1_1/tf_bert_model_2/bert/embeddings/assert_less/Assert/Assert
W0000 00:00:1722781338.877485 2293734 assert_op.cc:38] Ignoring Assert operator functional_1_1/bert_model_layer_2/tf_bert_model_1/bert/embeddings/assert_less/Assert/Assert


[1m9683/9683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34274s[0m 4s/step


  self.gen.throw(value)
2024-08-04 10:22:23.882764: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [12]:
predictions

array([[0.39790002],
       [0.39790002],
       [0.39790002],
       ...,
       [0.39790002],
       [0.39790002],
       [0.39790002]], dtype=float32)