In [None]:
# model_training.ipynb

# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import functions from utils.py
from utils import (
    create_diffusion_model,
    train_diffusion_model,
    create_transformer_model,
    train_transformer_model,
    load_encoder,
    string_with_tashkeel_vectorizer_per_batch
)

# Configure visualization settings: show matplotlib figures inline in the cell output
%matplotlib inline


In [None]:
# Define paths
processed_data_path = '../data/processed/processed_data.csv'
diffusion_output_dir = '../models/diffusion'
transformer_output_dir = '../models/transformers'

# Load processed data
print("Loading processed data...")
processed_df = pd.read_csv(processed_data_path, encoding='utf-8-sig')
print(f"Processed data loaded with {len(processed_df)} records.")

# Split into train/validation (80/20) with a fixed seed.
# The split is done with pandas/numpy only: the original call to
# `train_test_split` referenced a name that is never imported in this notebook
# and therefore raised NameError on a fresh kernel.
valid_fraction = 0.2
shuffled_df = processed_df.sample(frac=1.0, random_state=42)
n_valid = int(np.ceil(valid_fraction * len(shuffled_df)))
# Positional slicing keeps the split correct even if the index has duplicates.
valid_df = shuffled_df.iloc[:n_valid]
train_df = shuffled_df.iloc[n_valid:]

# For testing, use a small subset
subset_size = 100  # Adjust as needed
valid_subset_size = 20  # Adjust as needed
# Clamp the requested sizes: DataFrame.sample raises ValueError when `n`
# exceeds the number of available rows (and replace=False).
train_subset = train_df.sample(n=min(subset_size, len(train_df)), random_state=42)
valid_subset = valid_df.sample(n=min(valid_subset_size, len(valid_df)), random_state=42)
print(f"Using a subset of {len(train_subset)} training records and {len(valid_subset)} validation records for testing.")


In [None]:
# Input shape fed to the diffusion model: (max_bayt_len, encoding_dim)
max_bayt_len = 1000  # Ensure this matches preprocessing
encoding_dim = 8
input_shape = (max_bayt_len, encoding_dim)

# Hyperparameters forwarded as-is to create_diffusion_model
diffusion_model_params = dict(
    num_transformer_blocks=2,
    num_heads=4,
    key_dim=64,
    ffn_units=256,
)

# Build the diffusion model from the shape and hyperparameters above
print("Creating diffusion model...")
diffusion_model = create_diffusion_model(input_shape, diffusion_model_params)


In [None]:
# Collect the 'text' column of each subset as NumPy arrays.
# Inputs and targets are the same strings (the target is a copy of the input).
X_train = np.array(train_subset['text'].tolist())
Y_train = X_train.copy()  # For diffusion models, target is often the clean data

X_valid = np.array(valid_subset['text'].tolist())
Y_valid = X_valid.copy()

# Encode every array with the project vectorizer, using max_bayt_len as the
# length argument. Evaluation order is X first, then Y, for each split.
print("Vectorizing training data...")
X_train_enc, Y_train_enc = (
    string_with_tashkeel_vectorizer_per_batch(pd.Series(texts), max_bayt_len)
    for texts in (X_train, Y_train)
)

print("Vectorizing validation data...")
X_valid_enc, Y_valid_enc = (
    string_with_tashkeel_vectorizer_per_batch(pd.Series(texts), max_bayt_len)
    for texts in (X_valid, Y_valid)
)

# Fit on the small subset; few epochs because this is only a smoke test
print("Training diffusion model on subset...")
history = diffusion_model.fit(
    X_train_enc,
    Y_train_enc,
    validation_data=(X_valid_enc, Y_valid_enc),
    epochs=5,
    batch_size=16,
    verbose=1,
)

# Plot train vs. validation loss per epoch
fig, ax = plt.subplots(figsize=(12, 6))
for metric_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_title('Diffusion Model Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
# Transformer configuration
transformer_model_name = 'aubmindlab/bert-base-arabertv2'  # Ensure this model is installed
# NOTE(review): BERT-base checkpoints usually accept at most 512 positions;
# confirm that the utils helpers truncate/handle a max_length of 1000.
max_length = 1000

# Build the model together with its tokenizer
print("Creating transformer model...")
transformer_model, tokenizer = create_transformer_model(transformer_model_name, max_length)

# Plain text lists for each subset (not consumed by the training call below)
train_texts = train_subset['text'].tolist()
valid_texts = valid_subset['text'].tolist()

# The training helper receives each subset as a list of row dictionaries
train_records = train_subset.to_dict('records')
valid_records = valid_subset.to_dict('records')

# Train on the small subset; few epochs because this is only a smoke test
print("Training transformer model on subset...")
transformer_history = train_transformer_model(
    model=transformer_model,
    tokenizer=tokenizer,
    train_data=train_records,
    valid_data=valid_records,
    epochs=5,
    batch_size=8,
    output_dir=transformer_output_dir,
    max_length=max_length,
)

# Plot train vs. validation loss per epoch
fig, ax = plt.subplots(figsize=(12, 6))
for metric_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(transformer_history.history[metric_key], label=curve_label)
ax.set_title('Transformer Model Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
# Make sure both output directories exist before writing; nothing earlier in
# this notebook is guaranteed to have created them.
os.makedirs(diffusion_output_dir, exist_ok=True)
os.makedirs(transformer_output_dir, exist_ok=True)

# Save diffusion model
diffusion_final_path = os.path.join(diffusion_output_dir, 'diffusion_model_test_final.h5')
diffusion_model.save(diffusion_final_path)
print(f"Diffusion model saved to {diffusion_final_path}")

# Save transformer model
# NOTE(review): assumes the object returned by create_transformer_model
# supports `.save()` to an HDF5 path — confirm against utils.py.
transformer_final_path = os.path.join(transformer_output_dir, 'transformer_model_test_final.h5')
transformer_model.save(transformer_final_path)
print(f"Transformer model saved to {transformer_final_path}")
