<a href="https://colab.research.google.com/github/twyeh/AI-in-education/blob/main/Transformer_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [202]:
!pip install requests bs4



In [203]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import requests
from bs4 import BeautifulSoup
import re
import numpy

In [204]:
# Web scraping configuration
BASE_URLS = ["https://en.wikipedia.org/wiki/Artificial_intelligence",
             "https://www.nytimes.com/section/technology",
             "https://en.wikipedia.org/wiki/Transformer"]  # Example URLs
VOCAB_SIZE = 30000
MAX_LEN = 200
EMBED_DIM = 128
NUM_HEADS = 4
FF_DIM = 256
BATCH_SIZE = 64
EPOCHS = 100

In [205]:
def scrape_article(url):
    """Scrape and clean article text from a webpage"""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove non-content elements
        for element in soup(['script', 'style', 'nav', 'footer']):
            element.decompose()

        # Extract and clean text
        text = ' '.join([p.get_text() for p in soup.find_all('p')])
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

In [206]:
# Scrape training data from multiple sources
print("Scraping training data...")
corpus = []
for url in BASE_URLS:
    article = scrape_article(url)
    if article:
        corpus.append(article)
#corpus

Scraping training data...


In [207]:
# Create text vectorization layer
text_vectorization = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1  # +1 for target sequence
)

In [208]:
# Prepare dataset
text_dataset = tf.data.Dataset.from_tensor_slices(corpus)
text_vectorization.adapt(text_dataset.batch(64))


In [209]:
def prepare_lm_dataset(text_batch):
    text_vectorized = text_vectorization(text_batch)
    return text_vectorized[:, :-1], text_vectorized[:, 1:]  # Input and target

In [210]:
dataset = text_dataset \
    .batch(BATCH_SIZE) \
    .map(prepare_lm_dataset, num_parallel_calls=tf.data.AUTOTUNE) \
    .prefetch(tf.data.AUTOTUNE)

In [211]:
# Transformer components (same as original)
class PositionalEncoding(layers.Layer):
    def __init__(self, max_len, embed_dim):
        super().__init__()
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        positions = self.pos_emb(positions)
        return x + positions

In [212]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [213]:
# Build language model
inputs = layers.Input(shape=(MAX_LEN,))
x = layers.Embedding(VOCAB_SIZE, EMBED_DIM)(inputs)
x = PositionalEncoding(MAX_LEN, EMBED_DIM)(x)
x = TransformerBlock(EMBED_DIM, NUM_HEADS, FF_DIM)(x, training=False) # Pass training=False here
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

In [214]:
# Train model
history = model.fit(
    dataset,
    epochs=EPOCHS
)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13s/step - accuracy: 0.0000e+00 - loss: 10.3095
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0117 - loss: 10.1687
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.1617 - loss: 10.0174
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.3517 - loss: 9.8502
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 909ms/step - accuracy: 0.4600 - loss: 9.6675
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 878ms/step - accuracy: 0.4717 - loss: 9.4747
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.4350 - loss: 9.2783
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 906ms/step - accuracy: 0.3550 - loss: 9.0789
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [224]:
# Generate text example
def generate_text(prompt, length=50):
    output_string={}
    tokens = text_vectorization([prompt]).numpy()[0]
    generated_tokens = []  # Store generated word tokens
    for _ in range(length):
        pred = model.predict(tokens[-MAX_LEN:].reshape(1, -1), verbose=0)
        next_token = tf.argmax(pred[0, -1, :]).numpy()

        # Check if next_token represents a word (alphanumeric)
        token_string = text_vectorization.get_vocabulary()[next_token.item()]
        if token_string.isalpha():  # Check if token is alphanumeric
            generated_tokens.append(next_token)

        tokens = numpy.append(tokens, next_token.item())

    # Convert generated tokens back to words
    generated_words = [text_vectorization.get_vocabulary()[token] for token in generated_tokens]
    output_string = ' '.join(generated_words)
    return output_string
   # return generated_words

In [223]:
generate_text('a transformer is a passive component', length=50)

[np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons'),
 np.str_('tons')]