<a href="https://colab.research.google.com/github/trbndev/wikibert/blob/main/wikibert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install tensorflow



In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os

In [None]:
# Read the whole training corpus. The context manager guarantees the file
# handle is closed even if reading fails.
file_path = './all_articles.txt'
with open(file_path, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Fold everything to lowercase so the vocabulary has no separate upper-case entries
text = text.lower()

# The vocabulary is every distinct character in the corpus, in sorted order
vocab = sorted(set(text))

# Lookup tables: character -> integer id, and integer id -> character
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = np.array(vocab)

# Encode the full corpus as an array of integer ids
text_as_int = np.array([char2idx[char] for char in text])

In [None]:
# Number of characters in each model input
seq_length = 100
# How many (seq_length + 1)-character chunks fit in the corpus
examples_per_epoch = len(text) // (seq_length + 1)

# Stream of individual character ids
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Cut the stream into chunks of seq_length + 1 ids; the extra id lets the
# target be the input shifted by one. A trailing partial chunk is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Function to split input and target
def split_input_target(chunk):
    """Return `(input, target)` for one chunk.

    The input is the chunk without its last element and the target is the
    chunk without its first element, i.e. the input shifted by one position.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(map_func=split_input_target)

In [None]:
# Number of sequences per training batch
BATCH_SIZE = 64

# Size of the buffer that shuffle() draws from
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs, then group them into full batches only
data = (
    dataset
    .shuffle(buffer_size=BUFFER_SIZE)
    .batch(batch_size=BATCH_SIZE, drop_remainder=True)
)

In [None]:
# One output unit (and one embedding row) per vocabulary character
vocab_size = len(vocab)

# Width of the character embeddings and of the GRU layer
embedding_dim = 256
rnn_units = 1024

# Training model: fixed batch size, variable sequence length.
# Embedding -> GRU (output at every timestep) -> Dense with one value per character.
# NOTE(review): stateful=True keeps the GRU state between batches even though
# the batches are shuffled — confirm this is intended for training.
model = tf.keras.Sequential([
    layers.Input(batch_shape=(BATCH_SIZE, None)),
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    layers.GRU(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    ),
    layers.Dense(units=vocab_size),
])

In [None]:
# Loss used for training
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is passed because the values given as `logits` are
    treated as unnormalized scores rather than probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [None]:
# Configure training: Adam optimizer with the custom cross-entropy loss
model.compile(loss=loss, optimizer='adam')

In [None]:
# Checkpoints go under this directory, which is created if it does not exist
checkpoint_dir = './training_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Path template for the saved weights; it contains an {epoch} placeholder
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")

# Callback that writes only the model weights to the path template above
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix,
    save_weights_only=True,
)

In [None]:
# How many passes to make over the training batches
EPOCHS = 20

# Fit the model, passing the checkpoint callback so weights are saved during training
model.fit(data, callbacks=[checkpoint_callback], epochs=EPOCHS)

Epoch 1/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 70ms/step - loss: 1.9979
Epoch 2/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 71ms/step - loss: 1.2927
Epoch 3/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 71ms/step - loss: 1.2291
Epoch 4/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 71ms/step - loss: 1.1979
Epoch 5/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 71ms/step - loss: 1.1795
Epoch 6/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 71ms/step - loss: 1.1686
Epoch 7/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 72ms/step - loss: 1.1612
Epoch 8/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 71ms/step - loss: 1.1579
Epoch 9/20
[1m1879/1879[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 71ms/step - loss: 1.1566
Epoch 10/20
[1m1879/1879[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d17bc30b3d0>

In [None]:
# Generation feeds a single sequence at a time, so the network is recreated
# with a batch size of 1; the layer stack and sizes match the training model
# so that the saved weights fit.
model = tf.keras.Sequential([
    layers.Input(batch_shape=(1, None)),
    layers.Embedding(vocab_size, embedding_dim),
    layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    layers.Dense(vocab_size),
])

# Restore trained weights from a saved checkpoint.
# NOTE(review): the epoch-9 file is hard-coded here — confirm it is the checkpoint intended.
best_checkpoint = os.path.join(checkpoint_dir, "ckpt_9.weights.h5")

model.load_weights(best_checkpoint)
model.build(tf.TensorShape([1, None]))

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=0.7):
    """Sample text from `model` one character at a time, seeded with `start_string`.

    The seed is lowercased and fed to the model in a single call; after that,
    only the most recently sampled character is fed back in. This therefore
    only makes sense for a model that keeps its recurrent state between calls.
    Character/id conversion uses the module-level `char2idx` and `idx2char`.

    Args:
        model: Callable model used with a batch of exactly one sequence. Its
            output is squeezed on axis 0 and treated as per-timestep logits.
        start_string: Non-empty seed text. It is lowercased before encoding.
        num_generate: Number of characters to sample.
        temperature: Positive divisor applied to the logits before sampling.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The lowercased seed followed by the generated characters.

    Raises:
        ValueError: If `temperature` is not positive or `start_string` is empty.
        KeyError: If the seed contains a character missing from `char2idx`.
    """
    # Validate up front so bad arguments fail with a clear message instead of
    # an obscure error (or NaN/inf logits) inside the sampling loop.
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature!r}")

    start_string = start_string.lower()
    if not start_string:
        raise ValueError("start_string must contain at least one character")

    unknown_chars = sorted({char for char in start_string if char not in char2idx})
    if unknown_chars:
        raise KeyError(
            f"start_string contains characters outside the vocabulary: {unknown_chars!r}"
        )

    # Convert the start string to numbers (vectorize) and add a batch axis
    input_eval = [char2idx[char] for char in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Characters sampled so far
    text_generated = []

    # Reset the states of the RNN layers so generation starts from a clean state.
    # Every layer exposing reset_states is reset, which covers multiple RNN layers.
    for layer in model.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch axis
        predictions = tf.squeeze(predictions, 0)

        # Apply temperature
        predictions = predictions / temperature

        # Sample one id per timestep and keep the sample for the last timestep
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted ID as the next (single-character) input
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [None]:
# Seed text that the model continues from
start_string = "Windows 8 was"

# Sample a continuation with the default settings and show it
generated_text = generate_text(model, start_string)
print(generated_text)

windows 8 was released in 1987 for development environments. it was introduced to note that the second alphabet is the oldest proceeds of ram in october 2007 with all of the sphere with a string list in common oriented language storage of a polyhedron is the form of syntax. since the result is the second schipping, a second sequence are typically available to a solaris or by control structures of a special line, such as a legal field by internet architecture and microsoft both installations, which are also an implementation of many other available versions of windows powers with updates to be ported to a single studio tables.

numerical and events and skills to the nature of a scaling a program within the x86 architecture of the test will be installed free safety for optional techniques used in components were also available to default variants or the program ased the classification of the fact that the first purpose of windows vista is also used as a security update to allow the progr