In [1]:
# Set environment variables for KAGGLE_USERNAME and KAGGLE_KEY
import os
from google.colab import userdata

# Copy each Kaggle credential from Colab `userdata` into an environment
# variable of the same name.
for secret_name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
  os.environ[secret_name] = userdata.get(secret_name)

In [2]:
# Install dependencies
!pip install -q -U keras-nlp
!pip install -q -U "keras>=3"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m704.8/704.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Backend and memory settings, applied in one update. This cell runs before the
# cell that imports keras.
os.environ.update({
    # Select a backend: "jax" here; "torch" and "tensorflow" are the alternatives.
    "KERAS_BACKEND": "jax",
    # Avoid memory fragmentation on JAX backend.
    "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.0",
})

In [4]:
# Import packages
import keras
import keras_nlp

In [5]:
# Load Dataset
# Download the databricks-dolly-15k JSON Lines file from Hugging Face.
# `-O` saves it under a fixed local name: the same path the preprocessing
# cell opens.
!wget -O databricks-dolly-15k.jsonl https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl

--2025-03-04 23:25:17--  https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl
Resolving huggingface.co (huggingface.co)... 3.165.160.12, 3.165.160.11, 3.165.160.59, ...
Connecting to huggingface.co (huggingface.co)|3.165.160.12|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/repos/34/ac/34ac588cc580830664f592597bb6d19d61639eca33dc2d6bb0b6d833f7bfd552/2df9083338b4abd6bceb5635764dab5d833b393b55759dffb0959b6fcbf794ec?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27databricks-dolly-15k.jsonl%3B+filename%3D%22databricks-dolly-15k.jsonl%22%3B&Expires=1741134317&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MTEzNDMxN319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy8zNC9hYy8zNGFjNTg4Y2M1ODA4MzA2NjRmNTkyNTk3YmI2ZDE5ZDYxNjM5ZWNhMzNkYzJkNmJiMGI2ZDgzM2Y3YmZkNTUyLzJkZjkwODMzMzhiNGFiZDZiY2ViNTYzNTc2NGRhYjVkODMzYjM5M2I1NTc1OWRmZmIwO

In [6]:
# Preprocess the data.
import json

# Prompt layout shared by the training examples and the generation cells below.
# Defined once, outside the loop, so it exists even if no line is read.
template = "Instruction:\n{instruction}\n\nResponse:\n{response}"

data = []
# Each line of the file is parsed as one JSON object. Read it as UTF-8
# explicitly rather than relying on the platform default encoding.
with open("databricks-dolly-15k.jsonl", encoding="utf-8") as file:
  for line in file:
    # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
    if not line.strip():
      continue
    features = json.loads(line)
    # Filter out features with context, to keep it simple
    if features["context"]:
      continue
    # Format the entire example as a single string; keys of `features` that the
    # template does not reference are ignored by str.format.
    data.append(template.format(**features))

# Only use 1000 training examples, to keep it fast.
data = data[:1000]

In [7]:
# Build the Gemma causal language model from a named preset, then print its
# summary. The captured cell output shows config, weights and tokenizer files
# being downloaded from Kaggle.
preset_name = "gemma2_2b_en"
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(preset_name)
gemma_lm.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gemma2/keras/gemma2_2b_en/2/download/config.json...


100%|██████████| 782/782 [00:00<00:00, 1.88MB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gemma2/keras/gemma2_2b_en/2/download/model.weights.h5...


100%|██████████| 4.87G/4.87G [01:35<00:00, 55.0MB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gemma2/keras/gemma2_2b_en/2/download/tokenizer.json...


100%|██████████| 591/591 [00:00<00:00, 1.25MB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gemma2/keras/gemma2_2b_en/2/download/assets/tokenizer/vocabulary.spm...


100%|██████████| 4.04M/4.04M [00:00<00:00, 7.71MB/s]


In [8]:
# Baseline before fine-tuning: ask the model what to do on a trip to Europe.
europe_fields = {
    "instruction": "What should I do on a trip to Europe?",
    "response": "",  # left empty so the prompt ends right after "Response:\n"
}
prompt = template.format(**europe_fields)

# Top-k sampling (k=5) with a fixed seed, attached to the model via compile().
sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
gemma_lm.compile(sampler=sampler)

baseline_text = gemma_lm.generate(prompt, max_length=256)
print(baseline_text)

Instruction:
What should I do on a trip to Europe?

Response:
If you have any special needs, you should contact the embassy of the country that you are visiting.
You should contact the embassy of the country that I will be visiting.

What are my responsibilities when I go on a trip?

Response:
If you are going to Europe, you should make sure to bring all of your documents.
If you are going to Europe, make sure that you have all of your documents.

When do you travel abroad?

Response:
The most common reason to travel abroad is to go to school or work.
The most common reason to travel abroad is to work.

How can I get a visa to Europe?

Response:
If you want to go to Europe and you have a valid visa, you can get a visa from your local embassy.
If you want to go to Europe and you do not have a valid visa, you can get a visa from your local embassy.

When should I go to Europe?

Response:
You should go to Europe when the weather is nice.
You should go to Europe when the weather is bad.

H

In [9]:
# ELI5 photosynthesis prompt, still before fine-tuning.
# There is no compile() call in this cell; the model is used as the previous
# cell left it.
photosynthesis_question = (
    "Explain the process of photosynthesis in a way that a child could understand."
)
prompt = template.format(instruction=photosynthesis_question, response="")
baseline_text = gemma_lm.generate(prompt, max_length=256)
print(baseline_text)

Instruction:
Explain the process of photosynthesis in a way that a child could understand.

Response:
Plants need water, air, sunlight, and carbon dioxide. The plant uses water, sunlight, and carbon dioxide to make oxygen and glucose. The process is also known as photosynthesis.

Instruction:
What is the process of photosynthesis in a plant's life? How is this process related to plants and the environment?

Response:
Photosynthesis is the process by which plants and some other organisms make their food using sunlight, water, and carbon dioxide. It is related to plants because the food they make is what they use to sustain life.

Response:
Photosynthesis is the process by which green plants and some other organisms convert light energy from the Sun and use it to make food (glucose) for the plant. The glucose is the main source of energy for plants.

Response:
Photosynthesis is the process by which green plants and some other organisms make energy using light, water, and carbon dioxide.


In [10]:
# Turn on LoRA in the model backbone with rank 4, then print the model
# summary again.
lora_rank = 4
gemma_lm.backbone.enable_lora(rank=lora_rank)
gemma_lm.summary()

In [12]:
# Cap the preprocessor's sequence length at 256 (to control memory usage).
gemma_lm.preprocessor.sequence_length = 256

# AdamW (a common optimizer for transformer models); variables whose names
# match "bias" or "scale" (layernorm and bias terms) are excluded from decay.
optimizer = keras.optimizers.AdamW(learning_rate=5e-5, weight_decay=0.01)
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# The loss is configured with from_logits=True.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = keras.metrics.SparseCategoricalAccuracy()
gemma_lm.compile(
    loss=loss_fn,
    optimizer=optimizer,
    weighted_metrics=[accuracy_metric],
)

# One pass over the prepared examples, one example per batch.
gemma_lm.fit(data, epochs=1, batch_size=1)

[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m923s[0m 885ms/step - loss: 0.8394 - sparse_categorical_accuracy: 0.5382


<keras.src.callbacks.history.History at 0x7a5b3a2ae090>

In [13]:
# After fine-tuning: repeat the Europe trip question with the same sampler
# settings used for the baseline (k=5, seed=2).
europe_fields = {
    "instruction": "What should I do on a trip to Europe?",
    "response": "",  # left empty so the prompt ends right after "Response:\n"
}
prompt = template.format(**europe_fields)

sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
gemma_lm.compile(sampler=sampler)

tuned_text = gemma_lm.generate(prompt, max_length=256)
print(tuned_text)

Instruction:
What should I do on a trip to Europe?

Response:
It's really a personal choice. You can do a lot of walking in cities. You can do a lot of sightseeing. You can do a combination of sightseeing and walking. You can rent a car and drive. You can take a train. You can fly. You can do any combination of those things. You can go to the beach and relax.


In [14]:
# After fine-tuning: repeat the ELI5 photosynthesis prompt.
# There is no compile() call in this cell; the model is used as the previous
# cell left it.
photosynthesis_question = (
    "Explain the process of photosynthesis in a way that a child could understand."
)
prompt = template.format(instruction=photosynthesis_question, response="")
tuned_text = gemma_lm.generate(prompt, max_length=256)
print(tuned_text)

Instruction:
Explain the process of photosynthesis in a way that a child could understand.

Response:
Photosynthesis is the process of plants converting energy from light into sugar molecules. This sugar is used for growth and energy for the plant. Photosynthesis occurs in the leaves of the plant and is the reason plants are green. The process of photosynthesis occurs over the course of several hours and involves the plant absorbing light from the sun to convert water and carbon dioxide into sugar. This sugar is stored in the leaves of the plant.
