In [1]:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import tensorflow as tf


2024-12-31 15:14:58.303420: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-31 15:14:58.303517: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-31 15:14:58.304775: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-31 15:14:58.314242: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import sys
import os

# Add the scripts folder to the system path
scripts_path = os.path.abspath(os.path.join('..', 'scripts/python'))
sys.path.append(scripts_path)


In [4]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from utils import (
    create_diffusion_model,
    train_diffusion_model,
    create_transformer_model,
    train_transformer_model,
    string_with_tashkeel_vectorizer_per_batch,
    calculate_meter_accuracy
)

# Configure visualization settings
%matplotlib inline

In [5]:
# Paths
processed_data_path = '../data/processed/processed_taweel_data.csv'
diffusion_output_dir = '../models/diffusion'
transformer_output_dir = '../models/transformers'
os.makedirs(diffusion_output_dir, exist_ok=True)
os.makedirs(transformer_output_dir, exist_ok=True)

# Load processed data
print("Loading processed data...")
processed_df = pd.read_csv(processed_data_path, encoding='utf-8-sig')
print(f"Processed data loaded with {len(processed_df)} records.")

# For testing, use a subset or full dataset
subset = True  # Toggle this for subset testing or full dataset training
if subset:
    print("Using subset for testing...")
    train_df, valid_df = train_test_split(processed_df, test_size=0.2, random_state=42)
    train_subset = train_df.sample(n=100, random_state=42)  # Adjust as needed
    valid_subset = valid_df.sample(n=20, random_state=42)
else:
    train_df, valid_df = train_test_split(processed_df, test_size=0.2, random_state=42)
    train_subset, valid_subset = train_df, valid_df

print(f"Training records: {len(train_subset)}; Validation records: {len(valid_subset)}")


Loading processed data...
Processed data loaded with 103441 records.
Using subset for testing...
Training records: 100; Validation records: 20


In [6]:
# Define model parameters
diffusion_model_params = {
    'num_transformer_blocks': 4,
    'num_heads': 8,
    'key_dim': 64,
    'ffn_units': 512
}

# Input shape
max_bayt_len = 1000  # Matches preprocessing
encoding_dim = 8
input_shape = (max_bayt_len, encoding_dim)

In [7]:
# =============================================================================
# Diffusion Model
# =============================================================================

print("Creating diffusion model...")
diffusion_model = create_diffusion_model(input_shape, diffusion_model_params)

# Prepare data
X_train = train_subset['text'].tolist()
Y_train = X_train.copy()  # Target is clean data

X_valid = valid_subset['text'].tolist()
Y_valid = X_valid.copy()

# Vectorize data
print("Vectorizing training and validation data...")
X_train_enc = string_with_tashkeel_vectorizer_per_batch(pd.Series(X_train), max_bayt_len)
Y_train_enc = string_with_tashkeel_vectorizer_per_batch(pd.Series(Y_train), max_bayt_len)
X_valid_enc = string_with_tashkeel_vectorizer_per_batch(pd.Series(X_valid), max_bayt_len)
Y_valid_enc = string_with_tashkeel_vectorizer_per_batch(pd.Series(Y_valid), max_bayt_len)

# Train diffusion model
print("Training diffusion model...")
diffusion_history = diffusion_model.fit(
    X_train_enc, Y_train_enc,
    validation_data=(X_valid_enc, Y_valid_enc),
    epochs=50,
    batch_size=32,
    verbose=1
)

# Save diffusion model
diffusion_final_path = os.path.join(diffusion_output_dir, 'diffusion_model_taweel_final.h5')
diffusion_model.save(diffusion_final_path)
print(f"Diffusion model saved to {diffusion_final_path}")

Creating diffusion model...


2024-12-31 15:16:40.482471: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-31 15:16:40.840986: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-31 15:16:40.841199: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-31 15:16:40.852375: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-31 15:16:40.852920: I external/local_xla/xla/stream_executor

Model: "DiffusionModel"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_layer (InputLayer)    [(None, 1000, 8)]            0         []                            
                                                                                                  
 layer_normalization (Layer  (None, 1000, 8)              16        ['input_layer[0][0]']         
 Normalization)                                                                                   
                                                                                                  
 multi_head_attention (Mult  (None, 1000, 8)              17928     ['layer_normalization[0][0]', 
 iHeadAttention)                                                     'layer_normalization[0][0]'] 
                                                                                     

2024-12-31 15:16:49.968899: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-12-31 15:17:00.725506: W external/local_tsl/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 976.56MiB (rounded to 1024000000)requested by op DiffusionModel/multi_head_attention_2/softmax/Softmax
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-12-31 15:17:00.725623: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-12-31 15:17:00.725641: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 114, Chunks in use: 113. 28.5KiB allocated for chunks. 28.2KiB in use in bin. 3.4KiB client-requested in use in bin.
2024-12-31 15:17:00.725648: I external/local_tsl/tsl/framework/bfc_allocator.cc:104

ResourceExhaustedError: Graph execution error:

Detected at node DiffusionModel/multi_head_attention_2/softmax/Softmax defined at (most recent call last):
  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/runpy.py", line 197, in _run_module_as_main

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/runpy.py", line 87, in _run_code

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/asyncio/base_events.py", line 601, in run_forever

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/asyncio/events.py", line 80, in _run

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3048, in run_cell

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3103, in _run_cell

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3308, in run_cell_async

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3490, in run_ast_nodes

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code

  File "/tmp/ipykernel_939/809680305.py", line 24, in <module>

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/training.py", line 1807, in fit

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/training.py", line 1150, in train_step

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/training.py", line 590, in __call__

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/functional.py", line 515, in call

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/layers/attention/multi_head_attention.py", line 600, in call

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/layers/attention/multi_head_attention.py", line 533, in _compute_attention

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/layers/attention/multi_head_attention.py", line 499, in _masked_softmax

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/layers/activation/softmax.py", line 107, in call

  File "/home/tarek/miniconda3/envs/baytdiffuser_env/lib/python3.9/site-packages/keras/src/backend.py", line 5448, in softmax

OOM when allocating tensor with shape[32,8,1000,1000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node DiffusionModel/multi_head_attention_2/softmax/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_9258]

In [None]:
# =============================================================================
# Transformer Model
# =============================================================================

transformer_model_name = 'aubmindlab/bert-base-arabertv2'
max_length = 1000

print("Creating transformer model...")
transformer_model, tokenizer = create_transformer_model(transformer_model_name, max_length)

train_texts = train_subset['text'].tolist()
valid_texts = valid_subset['text'].tolist()

# Train transformer model
print("Training transformer model...")
transformer_history = train_transformer_model(
    model=transformer_model,
    tokenizer=tokenizer,
    train_data=train_subset.to_dict('records'),
    valid_data=valid_subset.to_dict('records'),
    epochs=50,
    batch_size=16,
    output_dir=transformer_output_dir,
    max_length=max_length
)

# Save transformer model
transformer_final_path = os.path.join(transformer_output_dir, 'transformer_model_taweel_final.h5')
transformer_model.save(transformer_final_path)
print(f"Transformer model saved to {transformer_final_path}")

# Save tokenizer
tokenizer_final_path = os.path.join(transformer_output_dir, 'tokenizer_taweel_final.json')
tokenizer.save_pretrained(transformer_output_dir)
print(f"Tokenizer saved to {tokenizer_final_path}")

In [None]:
# Save diffusion model
diffusion_final_path = os.path.join(diffusion_output_dir, 'diffusion_model_test_final.h5')
diffusion_model.save(diffusion_final_path)
print(f"Diffusion model saved to {diffusion_final_path}")

# Save transformer model
transformer_final_path = os.path.join(transformer_output_dir, 'transformer_model_test_final.h5')
transformer_model.save(transformer_final_path)
print(f"Transformer model saved to {transformer_final_path}")


In [None]:
# =============================================================================
# Plot Training Histories
# =============================================================================

# Diffusion Model
plt.figure(figsize=(12, 6))
plt.plot(diffusion_history.history['loss'], label='Train Loss')
plt.plot(diffusion_history.history['val_loss'], label='Validation Loss')
plt.title('Diffusion Model Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Transformer Model
plt.figure(figsize=(12, 6))
plt.plot(transformer_history.history['loss'], label='Train Loss')
plt.plot(transformer_history.history['val_loss'], label='Validation Loss')
plt.title('Transformer Model Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
