In [1]:
# Setup the environment
#!pip install --upgrade huggingface_hub transformers
"""
!pip install bitsandbytes
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
access_token_read = UserSecretsClient().get_secret("HUGGINGFACE_TOKEN")
login(token = access_token_read)
"""
#!pip install git+https://github.com/huggingface/transformers -U
#!pip install accelerate
#!pip install -i https://pypi.org/simple/ bitsandbytes

'\n!pip install bitsandbytes\nfrom huggingface_hub import login\nfrom kaggle_secrets import UserSecretsClient\naccess_token_read = UserSecretsClient().get_secret("HUGGINGFACE_TOKEN")\nlogin(token = access_token_read)\n'

In [2]:
import torch, os
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# Work around a bug in the version of PyTorch and GPU hardware curretnly on Kaggle. On other hardware, removing these lines may lead to a speed-up.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Load the model
USE_INSTRUCTION_TUNED = True
if USE_INSTRUCTION_TUNED:
    model_name = '/kaggle/input/gemma/transformers/1.1-2b-it/1'
    if not os.path.exists(model_name):
        print("Warning: loading model weights from the Internet. This might take a bit of extra time.")
        model_name = "google/gemma-1.1-2b-it"
else:
    model_name = "/kaggle/input/gemma/transformers/2b/2"
    if not os.path.exists(model_name):
        print("Warning: loading model weights from the Internet. This might take a bit of extra time.")
        model_name = "google/gemma-2b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.bfloat16)
streamer = TextStreamer(tokenizer)
# Silence a warning.
tokenizer.decode([tokenizer.eos_token_id]);

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2024-04-17 07:34:32.943794: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-17 07:34:32.943905: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-17 07:34:33.065057: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Check where the whole model is loaded and what data type it's using.
model.device, model.dtype

(device(type='cuda', index=0), torch.bfloat16)

In [3]:
# Check where parameters are loaded. If this is anything other than {'': 0}
# then probably some parts of the model got offloaded onto CPU and so will run slow.
model.hf_device_map

{'': 0}

## Warm-Up

In [4]:
%%time
doc = '''The capital of France is'''
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos>The capital of France is...

A. Paris
B. Lyon
C. Marseille
D. Bordeaux

The answer is A.

The capital of France is Paris. It is a major city and the political, economic, and cultural center of France.<eos>
CPU times: user 2.87 s, sys: 187 ms, total: 3.06 s
Wall time: 3.55 s


In [5]:
%%time
doc = '''Expression: 2 + 2. Result:'''
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos>Expression: 2 + 2. Result: 4

This is a simple mathematical expression that demonstrates the basic operations of addition.

**Questions:**

1. What are the different types of mathematical expressions?
2. How are mathematical expressions used in real life?
3. What are the benefits of using mathematical expressions?

**Additional Information:**

* Mathematical expressions can include numbers, variables, operators, and parentheses.
* Mathematical expressions are used to represent and solve mathematical problems.
* Mathematical expressions can be used in various fields, including science, engineering, finance, and education.

**Answers:**

**1. Different types of mathematical expressions:**

- Algebraic
CPU times: user 3.98 s, sys: 14.9 ms, total: 4 s
Wall time: 3.97 s


In [6]:
%%time
doc = '''An expert Spanish translation of 'Language models are statistical models that can generate text.' is'''
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos>An expert Spanish translation of 'Language models are statistical models that can generate text.' is:

* Modelo estadístico de generación de texto
* Modelo de traducción estadística
* Modelo de procesamiento del lenguaje estadístico
* Modelo de análisis de datos estadísticos

Can anyone please explain the difference between these options?

I understand that "Modelo estadístico de generación de texto" and "Modelo de procesamiento del lenguaje estadístico" are related to statistical language models. But I'm not sure about the other two options.

Could someone please explain the difference between these two options and provide additional context if necessary?<eos>
CPU times: user 3.28 s, sys: 23.7 ms, total: 3.31 s
Wall time: 3.29 s


In [7]:
%%time
doc = '''Please write me a function, called sum_evens, in python that takes in a list of number and outputs the sum of the even numbers in the list'''
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos>Please write me a function, called sum_evens, in python that takes in a list of number and outputs the sum of the even numbers in the list.

```python
def sum_evens(list1):
    sum = 0
    for num in list1:
        if num % 2 == 0:
            sum += num
    return sum
```

**Example Usage:**

```python
list1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

result = sum_evens(list1)

print(result)  # Output: 30
```

**Explanation:**

* The function takes in
CPU times: user 3.83 s, sys: 17.8 ms, total: 3.85 s
Wall time: 3.83 s


# Answer Questions Below

1. Write a brief summary of how well Gemma performed on each task
- USE_INSTRUCTION_TUNED = True
  - r
- USE_INSTRUCTION_TUNED = False
  - r
2. Notice that we expressed these tasks in the form of making reasonable completions of a document, not giving the model instructions. What implications does this have for how useful a language model is?
- By expressing these tasks in the form of documents rather than instructions, we are able to get more out of the model's ability to generate coherent text based on context. This shows the model's ability to understand and produce human-like language.
3. Notice that the model took some times to generate the first token, then generated the rest of the text more quickly(but still not instantly). Why do you think that is?
- I think that the model took some time to generate the first token, then generated the rest of the text more quickly because it needs a little bit of time ot set up its processes. Once the model gets starts it's internal processes, it has the ability to generate the rest of the text more quickly becasue it's in a state where it can understand and process the input more efficiently

# Few-Shot Learning

In [8]:
%%time
doc = '''The capital of Michigan is Lansing. The capital of England is London. The capital of France is'''
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos>The capital of Michigan is Lansing. The capital of England is London. The capital of France is Paris.

Which of these is the odd one out?

A) Lansing
B) London
C) Paris

The answer is B) London.

London is the capital of England, not Michigan.<eos>
CPU times: user 1.56 s, sys: 13.4 ms, total: 1.58 s
Wall time: 1.57 s


In [9]:
%%time
doc= '''
Request: capital("Michigan")
Response: "Lansing"
Request: capital("England")
Response: capital("London")
Request: capital("France")
Response:'''
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos>
Request: capital("Michigan")
Response: "Lansing"
Request: capital("England")
Response: capital("London")
Request: capital("France")
Response: capital("Paris"

I understand that the provided code is for finding the capital of a country based on the user's input. However, the responses provided are not the actual capitals of the countries.

To fix this, we need to retrieve the actual capital names from reliable sources like official websites or reputable news organizations.

**Modified Code:**

```python
def get_capital(country):
    """
    Returns the capital of a country.

    Args:
        country: The name of the country.

    Returns:
        The capital of the country.
    """

    # Retrieve the capital
CPU times: user 4.18 s, sys: 30.5 ms, total: 4.21 s
Wall time: 4.18 s


# Answer Question
Write a brief summary of how Gemma performed on this taks, as compared with not giving it any examples

# Chain of Thought

In [10]:
%%time
doc = '''I went to the market and bought 10 apples. I gave 2 apples to the neighbor and 2 to the repairman. I then went and bought 5 more apples and ate 1. How many apples did I remain with?'''
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos>I went to the market and bought 10 apples. I gave 2 apples to the neighbor and 2 to the repairman. I then went and bought 5 more apples and ate 1. How many apples did I remain with?

**Answer:** 4 apples

When you give away some apples, you don't actually receive them. You simply reduce the number you have.<eos>
CPU times: user 1.16 s, sys: 9.49 ms, total: 1.17 s
Wall time: 1.16 s


# Answer Question
What does Gemma predict?

In [11]:
%%time
doc = '''I went to the market and bought 10 apples. I gave 2 apples to the neighbor and 2 to the repairman. I then went and bought 5 more apples and ate 1. How many apples did I remain with? Let's think step by step. After I bought the apples, I had'''
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos>I went to the market and bought 10 apples. I gave 2 apples to the neighbor and 2 to the repairman. I then went and bought 5 more apples and ate 1. How many apples did I remain with? Let's think step by step. After I bought the apples, I had 10 apples. After I gave away 2 and gave to others, I had 8 apples. Then, I bought 5 more and ate 1, leaving me with 7 apples.<eos>
CPU times: user 1.49 s, sys: 3.68 ms, total: 1.5 s
Wall time: 1.49 s


# Answer Question
* How does the generated text change?

## Chat Templating

In [19]:
role = """You are a helpful 2nd-grade teacher. Help a 2nd grader to answer questions in a short and clear manner."""
task = """Explain why the sky is blue"""

messages = [
    {
        "role": "user",
        "content": f"{role}\n\n{task}",
    },
 ]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
print(tokenizer.batch_decode(tokenized_chat)[0])

<bos><start_of_turn>user
You are a helpful 2nd-grade teacher. Help a 2nd grader to answer questions in a short and clear manner.

Explain why the sky is blue<end_of_turn>
<start_of_turn>model



In [21]:
%%time
doc = tokenizer.batch_decode(tokenized_chat)[0]
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False,
    streamer=streamer)

<bos><bos><start_of_turn>user
You are a helpful 2nd-grade teacher. Help a 2nd grader to answer questions in a short and clear manner.

Explain why the sky is blue<end_of_turn>
<start_of_turn>model
The sky is blue because of a special type of particle called **cloud droplets**.

When sunlight enters a cloud, the blue light is scattered more than other colors because it has a shorter wavelength. This means that when we look up at the sky, we see more blue light than any other color.<eos>
CPU times: user 1.89 s, sys: 18.7 ms, total: 1.91 s
Wall time: 1.9 s


## Retrieval-Augmented Generation

In [13]:
import inspect
docstrings = {}
for name, obj in inspect.getmembers(torch.nn):
    if inspect.isfunction(obj) or inspect.isclass(obj):
        docstrings[name] = inspect.getdoc(obj)


In [14]:
docstrings.keys()

dict_keys(['AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d', 'AdaptiveLogSoftmaxWithLoss', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AlphaDropout', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'BCELoss', 'BCEWithLogitsLoss', 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'Bilinear', 'CELU', 'CTCLoss', 'ChannelShuffle', 'CircularPad1d', 'CircularPad2d', 'CircularPad3d', 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'Container', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d', 'CosineEmbeddingLoss', 'CosineSimilarity', 'CrossEntropyLoss', 'CrossMapLRN2d', 'DataParallel', 'Dropout', 'Dropout1d', 'Dropout2d', 'Dropout3d', 'ELU', 'Embedding', 'EmbeddingBag', 'FeatureAlphaDropout', 'Flatten', 'Fold', 'FractionalMaxPool2d', 'FractionalMaxPool3d', 'GELU', 'GLU', 'GRU', 'GRUCell', 'GaussianNLLLoss', 'GroupNorm', 'Hardshrink', 'Hardsigmoid', 'Hardswish', 'Hardtanh', 'HingeEmbeddingLoss', 'HuberLoss', 'Identity', 'InstanceNorm1

In [26]:
%%time

# Define the specific task/question
task = "Explain the options for TripletMarginLoss in PyTorch."

# Function to retrieve the specific context from the docstrings based on the task
def retrieve_context(query):
    relevant_context = []
    for name, doc in docstrings.items():
        if query.lower() in name.lower():
            relevant_context.append(f"{name}: {doc}")
    return ' '.join(relevant_context)  # Combine all relevant docstrings into a single string
incorrect_context = docstrings['CrossEntropyLoss']

# Retrieve the context for the given task
# Format the document to be used as input to the model
doc = f'''{task}

Answer using the following context:
{context}'''

# Generate a response from the model
model_out = model.generate(
    **tokenizer(doc, return_tensors='pt').to(model.device),
    max_new_tokens=128,
    do_sample=False
)

# Decode the generated tokens to text
response = tokenizer.decode(model_out[0], skip_special_tokens=True)

# Print the response
print(response)


Explain the options for TripletMarginLoss in PyTorch.

Answer using the following context:
TripletMarginLoss: Creates a criterion that measures the triplet loss given an input
tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
This is used for measuring a relative similarity between samples. A triplet
is composed by `a`, `p` and `n` (i.e., `anchor`, `positive examples` and `negative
examples` respectively). The shapes of all input tensors should be
:math:`(N, D)`.

The distance swap is described in detail in the paper `Learning shallow
convolutional feature descriptors with triplet losses`_ by
V. Balntas, E. Riba et al.

The loss function for each sample in the mini-batch is:

.. math::
    L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}


where

.. math::
    d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p

The norm is calculated using the specified p value and a small constant :math:`\varepsilon` is
added for 