In [7]:
import random
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras  
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
)


# === CONFIGURATION CONSTANTS ===
# Language of the documents; selects which pretrained checkpoint is used.
LANGUAGE = 'en'
# English gets the uncased English DistilBERT, anything else the multilingual one.
MODEL_TYPE = 'distilbert-base-uncased' if LANGUAGE == 'en' else 'distilbert-base-multilingual-cased'
# Tokenizer outputs that are kept by encode() and declared in the dataset signature.
MODEL_INPUTS = frozenset(['input_ids', 'attention_mask'])

EPOCHS = 2       # keep low for demo

NUM_DOCS = 10   # mock dataset size

BATCH_SIZE = 4        # training batch size
EVAL_BATCH_SIZE = 8   # batch size for validation / evaluation
MAX_SEQ_LENGTH = 64   # every text is padded/truncated to this many tokens

# === GPU MEMORY SETUP ===
# By default TensorFlow reserves (almost) all free GPU memory as soon as the
# first tensor is created, which starves every other notebook kernel sharing
# the device and surfaces as ResourceExhaustedError when a model is loaded.
# Memory growth makes this process allocate only what it needs. It has to be
# configured before the GPU is first used, hence its place ahead of any
# tokenization. NOTE: this cannot reclaim memory that another process already
# holds; shut down stale kernels if the device is still full.
_gpus = tf.config.list_physical_devices('GPU')
if _gpus:
    try:
        for _gpu in _gpus:
            tf.config.experimental.set_memory_growth(_gpu, True)
    except RuntimeError as err:
        # Raised when the GPU was already initialized (e.g. the cell is re-run
        # in a live kernel); the existing allocation policy stays in effect.
        print(f"Could not enable GPU memory growth: {err}")

# === MOCK Avro-Like Data Generator ===
def mock_avro_reader(num_docs=NUM_DOCS):
    """Yield ``num_docs`` mock documents drawn at random from a fixed pool.

    Every yielded document is a dict with the raw HTML under ``'body'`` and a
    ``'labels'`` dict whose ``'contract'`` entry is the boolean class flag.
    Documents are sampled with replacement, so duplicates are expected.
    """
    # (html body, is-a-contract flag) templates to sample from.
    templates = (
        ("<p>This agreement is made between the company and the contractor.</p>", True),
        ("<h1>Invoice</h1><p>Date: 2024-09-12</p>", False),
        ("<div>Employment contract for the following individual...</div>", True),
        ("<p>Purchase order for materials...</p>", False),
        ("<div>Lease agreement details: terms, parties, rent.</div>", True),
        ("<p>Meeting notes: Project timeline, deliverables, next steps...</p>", False),
    )
    picks = (random.choice(templates) for _ in range(num_docs))
    for body, contract_flag in picks:
        yield {'body': body, 'labels': {'contract': contract_flag}}

# === MOCK html_labels2text ===
def html_labels2text(doc):
    """Return ``{'text': ...}`` holding ``doc['body']`` with HTML tags removed.

    This is a deliberately naive mock: anything that looks like ``<...>`` with
    at least one character between the brackets is dropped; entities and
    whitespace are left untouched.
    """
    tag_pattern = re.compile('<[^<]+?>')
    stripped = tag_pattern.sub('', doc['body'])
    return dict(text=stripped)

# === PREPARE LABELLED DATA ===
# Stream the mock documents and collect (label, plain_text) pairs:
# label is 1 for contracts and 0 otherwise, text is the HTML-stripped body.
avro_reader = mock_avro_reader(NUM_DOCS)
labels_with_text = []
for document in avro_reader:
    document_text = html_labels2text({'body': document['body']})['text']
    # A missing or falsy 'contract' entry counts as the negative class.
    label = int(bool(document['labels'].get('contract')))
    labels_with_text.append((label, document_text))

# === LOAD TOKENIZER ===
# Tokenizer matching the checkpoint selected by MODEL_TYPE.
tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)

def encode(text):
    """Tokenize ``text`` and keep only the entries named in MODEL_INPUTS.

    The text is truncated or padded to exactly MAX_SEQ_LENGTH tokens and the
    tokenizer is asked for TensorFlow tensors. The result maps each kept input
    name to the tensor the tokenizer produced for it.
    """
    encoded = tokenizer.encode_plus(
        text,
        return_tensors='tf',
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding='max_length',      # always pad up to max_length
        add_special_tokens=True,
    )
    wanted = {}
    for name, tensor in encoded.items():
        if name in MODEL_INPUTS:
            wanted[name] = tensor
    return wanted

# === ENCODE DATA ===
# One (features, label) pair per document. encode() is called one text at a
# time, so index 0 selects that single example from each returned tensor.
data = [
    (
        dict((name, tensor[0]) for name, tensor in encode(text).items()),
        label,
    )
    for label, text in tqdm(labels_with_text, desc="Encoding texts")
]

# === TF DATASET CREATION ===
def _data():
    """Yield the pre-encoded (features, label) pairs from ``data`` in order."""
    for example in data:
        yield example

# Size the shuffle buffer and the split from the data actually encoded rather
# than assuming it equals NUM_DOCS.
total_size = len(data)
train_size = int(0.9 * total_size)
test_size = total_size - train_size
if train_size < 1 or test_size < 1:
    raise ValueError(
        f"Need at least 2 documents for a train/test split, got {total_size}"
    )

dataset = tf.data.Dataset.from_generator(
    _data,
    output_signature=(
        {k: tf.TensorSpec(shape=(MAX_SEQ_LENGTH,), dtype=tf.int32) for k in MODEL_INPUTS},
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
)

# A buffer covering the whole dataset gives a full shuffle. The order must NOT
# change between iterations, otherwise take()/skip() below would hand out a
# different train/test partition on every epoch and leak test examples.
dataset = dataset.shuffle(
    buffer_size=total_size, seed=0, reshuffle_each_iteration=False
)

# === SPLIT TRAIN AND TEST ===
train_dataset = dataset.take(train_size)
test_dataset = dataset.skip(train_size)
# Training data repeats forever, so fit() needs an explicit steps_per_epoch.
train_dataset = train_dataset.batch(BATCH_SIZE).repeat(-1)
test_dataset = test_dataset.batch(EVAL_BATCH_SIZE)

# === LOAD AND COMPILE MODEL ===
# NOTE: loading places the [vocab, hidden] embedding matrix on the GPU; a
# ResourceExhaustedError raised here means the device had no free memory left
# (for example, held by another process or a stale notebook kernel).
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels=2, from_pt=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# The model outputs raw logits, hence from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)

# Ceiling division: the final, smaller batch still counts as a step. Floor
# division gave validation_steps == 0 whenever the test split was smaller than
# EVAL_BATCH_SIZE (1 // 8 with the defaults), so nothing was ever evaluated,
# and it cut each training epoch short of a full pass over the data.
steps_per_epoch = -(-train_size // BATCH_SIZE)
validation_steps = -(-test_size // EVAL_BATCH_SIZE)

# === TRAIN MODEL ===
model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_dataset,
    validation_steps=validation_steps
)

# === EVALUATE MODEL ===
# results is [loss, accuracy], matching the compiled loss and metrics.
results = model.evaluate(test_dataset, steps=validation_steps)
print(f"\nTest Loss: {results[0]:.4f} | Test Accuracy: {results[1]:.4f}")



Encoding texts: 100%|██████████| 10/10 [00:00<00:00, 2625.38it/s]
2025-10-06 10:34:43.811717: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:501] Allocator (GPU_0_bfc) ran out of memory trying to allocate 89.42MiB (rounded to 93763584)requested by op StatelessTruncatedNormalV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2025-10-06 10:34:43.811753: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1049] BFCAllocator dump for GPU_0_bfc
2025-10-06 10:34:43.811761: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1056] Bin (256): 	Total Chunks: 72, Chunks in use: 72. 18.0KiB allocated for chunks. 18.0KiB in use in bin. 413B client-requested in use in bin.
2025-10-06 10:34:43.811765: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1056] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for ch

ResourceExhaustedError: {{function_node __wrapped__StatelessTruncatedNormalV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[30522,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StatelessTruncatedNormalV2] name: 

/task:0/device:GPU:0 by allocator GPU_0_bfc
