<a href="https://colab.research.google.com/github/tameemtantawy/Cliff-Diver-AI/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
###############################################################################
# 1. ENVIRONMENT SETUP
###############################################################################
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Set random seeds for reproducibility
SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)

###############################################################################
# 2. DATA LOADING (IMDB) + TRAIN/VAL SPLIT
###############################################################################
(ds_train, ds_test), ds_info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    as_supervised=True,
    with_info=True
)

train_texts, train_labels = [], []
for text, label in tfds.as_numpy(ds_train):
    train_texts.append(text.decode('utf-8'))
    train_labels.append(label)
train_labels = np.array(train_labels)

# Create validation set (20% from train)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels,
    test_size=0.2,
    random_state=SEED
)

test_texts, test_labels = [], []
for text, label in tfds.as_numpy(ds_test):
    test_texts.append(text.decode('utf-8'))
    test_labels.append(label)
test_labels = np.array(test_labels)

print("Train samples:", len(train_texts))
print("Validation samples:", len(val_texts))
print("Test samples:", len(test_texts))

###############################################################################
# 3. TOKENIZATION (CHAR-LEVEL & WORD-LEVEL)
###############################################################################
# -- 3.1 Character-level Tokenizer --
char_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True,  # tokenize by characters
    lower=True
)
char_tokenizer.fit_on_texts(train_texts)

X_train_char = char_tokenizer.texts_to_matrix(train_texts, mode='binary')
X_val_char   = char_tokenizer.texts_to_matrix(val_texts,   mode='binary')
X_test_char  = char_tokenizer.texts_to_matrix(test_texts,  mode='binary')

print("Character-level vocab size:", len(char_tokenizer.word_index)+1)
print("X_train_char shape:", X_train_char.shape)

# -- 3.2 Word-level Tokenizer --
#    IMPORTANT: Limit the vocabulary to avoid huge input dimensions.
#    This helps prevent memory issues or crashes in Colab.
word_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=5000,  # Limit to top 5000 words
    char_level=False,
    lower=True
)
word_tokenizer.fit_on_texts(train_texts)

X_train_word = word_tokenizer.texts_to_matrix(train_texts, mode='binary')
X_val_word   = word_tokenizer.texts_to_matrix(val_texts,   mode='binary')
X_test_word  = word_tokenizer.texts_to_matrix(test_texts,  mode='binary')

# The actual vocabulary size is "num_words" or smaller
print("Word-level vocab size (limit=5000):", len(word_tokenizer.word_index)+1)
print("X_train_word shape:", X_train_word.shape)

# -- 3.3 Convert Labels to One-Hot --
def one_hot_encode(labels, num_classes=2):
    return np.eye(num_classes)[labels]

y_train_char = one_hot_encode(train_labels)
y_val_char   = one_hot_encode(val_labels)
y_test_char  = one_hot_encode(test_labels)

# For word-level, labels are the same
y_train_word = y_train_char
y_val_word   = y_val_char
y_test_word  = y_test_char

###############################################################################
# 4. MLP MODEL DEFINITION (Keras) & HYPERPARAMETERS
###############################################################################
def build_mlp(input_dim, hidden_sizes=[128, 64], output_dim=2, activation='relu'):
    """
    Build a Sequential MLP model in Keras, returning uncompiled model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.InputLayer(shape=(input_dim,)))
    for size in hidden_sizes:
        model.add(tf.keras.layers.Dense(size, activation=activation))
    # Final layer (logits)
    model.add(tf.keras.layers.Dense(output_dim))
    return model

def compile_model(model, learning_rate=0.001, optimizer_name='adam'):
    """
    Compile model with given optimizer and learning rate. Returns the compiled model.
    """
    if optimizer_name.lower() == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate)
    elif optimizer_name.lower() == 'sgd':
        optimizer = tf.keras.optimizers.SGD(learning_rate)
    elif optimizer_name.lower() == 'rmsprop':
        optimizer = tf.keras.optimizers.RMSprop(learning_rate)
    else:
        raise ValueError("Unsupported optimizer. Use 'adam', 'sgd', or 'rmsprop'.")

    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        optimizer=optimizer,
        metrics=['accuracy']
    )
    return model

###############################################################################
# 5. TRAIN & EVALUATE (CHAR-LEVEL)
###############################################################################
print("\n--- CHAR-LEVEL TRAINING ---")
input_dim_char = X_train_char.shape[1]

# Build model (char-level)
char_model = build_mlp(
    input_dim=input_dim_char,
    hidden_sizes=[128, 64],   # Adjust as needed
    output_dim=2,
    activation='relu'
)
# Compile
char_model = compile_model(
    char_model,
    learning_rate=0.001,
    optimizer_name='adam'
)

# Train
history_char = char_model.fit(
    X_train_char, y_train_char,
    validation_data=(X_val_char, y_val_char),
    epochs=5,
    batch_size=128,   # Larger batch is usually okay for smaller char vocab
    verbose=1
)

# Evaluate
test_loss_char, test_acc_char = char_model.evaluate(
    X_test_char, y_test_char, verbose=0
)
print(f"Char-level Test Accuracy: {test_acc_char:.4f}")

###############################################################################
# 6. TRAIN & EVALUATE (WORD-LEVEL)
###############################################################################
print("\n--- WORD-LEVEL TRAINING ---")
input_dim_word = X_train_word.shape[1]

# Build model (word-level)
# Use smaller hidden layers if needed, because the input dimension could be big
word_model = build_mlp(
    input_dim=input_dim_word,
    hidden_sizes=[64, 32],  # A bit smaller
    output_dim=2,
    activation='relu'
)

# Compile
word_model = compile_model(
    word_model,
    learning_rate=0.001,
    optimizer_name='adam'
)

# Train (Use smaller batch size to reduce memory usage)
history_word = word_model.fit(
    X_train_word, y_train_word,
    validation_data=(X_val_word, y_val_word),
    epochs=5,
    batch_size=64,   # Smaller batch size
    verbose=1
)

# Evaluate
test_loss_word, test_acc_word = word_model.evaluate(
    X_test_word, y_test_word, verbose=0
)
print(f"Word-level Test Accuracy: {test_acc_word:.4f}")


Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.SFJ0YO_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.SFJ0YO_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.SFJ0YO_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
Train samples: 20000
Validation samples: 5000
Test samples: 25000
Character-level vocab size: 141
X_train_char shape: (20000, 141)
Word-level vocab size (limit=5000): 80305
X_train_word shape: (20000, 5000)

--- CHAR-LEVEL TRAINING ---
Epoch 1/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.5609 - loss: 0.6818 - val_accuracy: 0.5712 - val_loss: 0.6782
Epoch 2/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6015 - loss: 0.6619 - val_accuracy: 0.5928 - val_loss: 0.6682
Epoch 3/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6030 - loss: 0.6588 - val_accuracy: 0.5996 - val_loss: 0.6662
Epoch 4/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6062 - loss: 0.6568 - val_accuracy: 0

In [7]:
import itertools

# Hyper-parameter grids
learning_rates = [0.001, 0.0005, 0.0001]
hidden_layer_configs = [
    [128],      # single hidden layer of 128
    [128, 64]   # two hidden layers: 128 -> 64
]
batch_sizes = [32, 64, 128]
optimizers = ['Adam', 'SGD', 'RMSProp']
activations = ['relu', 'tanh', 'leaky_relu']

best_val_acc = 0.0  # initialize to float
best_config = None

for lr in learning_rates:
    for hl in hidden_layer_configs:
        for bs in batch_sizes:
            for opt in optimizers:
                for act in activations:
                    print(f"Testing: LR={lr}, HL={hl}, BS={bs}, OPT={opt}, ACT={act}")

                    # Build the model
                    model = build_mlp(
                        input_dim_char,   # or input_dim_word if doing word-level
                        hidden_sizes=hl,
                        output_dim=2,
                        activation=act
                    )

                    # Compile the model
                    model = compile_model(model, lr, opt)

                    # Train the model
                    history = model.fit(
                        X_train_char, y_train_char,   # or X_train_word, y_train_word
                        validation_data=(X_val_char, y_val_char),  # or word-level data
                        epochs=5,       # short run for quick testing
                        batch_size=bs,
                        verbose=0       # set to 1 if you want progress
                    )

                    # Get final validation accuracy of these 3 epochs
                    final_val_acc = history.history['val_accuracy'][-1]

                    # If this is the best so far, record it
                    if final_val_acc > best_val_acc:
                        best_val_acc = final_val_acc
                        best_config = (lr, hl, bs, opt, act)

print("Best char-level config:", best_config, "with val acc:", best_val_acc)

best_val_acc_word = 0.0
best_config_word = None

for lr in learning_rates:
    for hl in hidden_layer_configs:
        for bs in batch_sizes:
            for opt in optimizers:
                for act in activations:
                    print(f"Testing (Word-Level): LR={lr}, HL={hl}, BS={bs}, OPT={opt}, ACT={act}")

                    model = build_mlp(
                        input_dim_word,
                        hidden_sizes=hl,
                        output_dim=2,
                        activation=act
                    )
                    model = compile_model(model, lr, opt)

                    history = model.fit(
                        X_train_word, y_train_word,
                        validation_data=(X_val_word, y_val_word),
                        epochs=5,
                        batch_size=bs,
                        verbose=0
                    )

                    final_val_acc = history.history['val_accuracy'][-1]
                    if final_val_acc > best_val_acc_word:
                        best_val_acc_word = final_val_acc
                        best_config_word = (lr, hl, bs, opt, act)

print("Best word-level config:", best_config_word, "with val acc:", best_val_acc_word)

Testing: LR=0.001, HL=[128], BS=32, OPT=Adam, ACT=relu
Testing: LR=0.001, HL=[128], BS=32, OPT=Adam, ACT=tanh
Testing: LR=0.001, HL=[128], BS=32, OPT=Adam, ACT=leaky_relu
Testing: LR=0.001, HL=[128], BS=32, OPT=SGD, ACT=relu
Testing: LR=0.001, HL=[128], BS=32, OPT=SGD, ACT=tanh
Testing: LR=0.001, HL=[128], BS=32, OPT=SGD, ACT=leaky_relu
Testing: LR=0.001, HL=[128], BS=32, OPT=RMSProp, ACT=relu
Testing: LR=0.001, HL=[128], BS=32, OPT=RMSProp, ACT=tanh
Testing: LR=0.001, HL=[128], BS=32, OPT=RMSProp, ACT=leaky_relu
Testing: LR=0.001, HL=[128], BS=64, OPT=Adam, ACT=relu
Testing: LR=0.001, HL=[128], BS=64, OPT=Adam, ACT=tanh
Testing: LR=0.001, HL=[128], BS=64, OPT=Adam, ACT=leaky_relu
Testing: LR=0.001, HL=[128], BS=64, OPT=SGD, ACT=relu
Testing: LR=0.001, HL=[128], BS=64, OPT=SGD, ACT=tanh
Testing: LR=0.001, HL=[128], BS=64, OPT=SGD, ACT=leaky_relu
Testing: LR=0.001, HL=[128], BS=64, OPT=RMSProp, ACT=relu
Testing: LR=0.001, HL=[128], BS=64, OPT=RMSProp, ACT=tanh
Testing: LR=0.001, HL=[128