In [1]:
# Microsoft's Phi 3.5 - The latest SLMs
# Sam Witteveen
# https://www.youtube.com/watch?v=wVuwTeW4Ow4

# GitHub - Phi-3CookBook
# https://github.com/microsoft/Phi-3CookBook

# List of Phi-3 models on HuggingFace
# https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3

# Deepseek RAG w Self-Cons + FAISS + Sent Transfmers
# https://www.kaggle.com/code/vbookshelf/deepseek-rag-w-self-cons-faiss-sent-transfmers/notebook

# KaggleBot - Gemma-7b-it RAG w few-shot prompting
# https://www.kaggle.com/code/vbookshelf/kagglebot-gemma-7b-it-rag-w-few-shot-prompting



In [2]:
!pip -q install accelerate
# !pip -q install --force-reinstall /content/transformers-4.42.0.dev0-py3-none-any.whl
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip install -q sentencepiece tokenizer

In [3]:
import pandas as pd
import numpy as np
import os

import gc
import re
from tqdm import tqdm


import torch
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          #BitsAndBytesConfig, 
                          AutoConfig)

# Silence every Python warning for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')

# Report the CUDA version exposed by torch.version and the installed PyTorch version
print("CUDA Version: {}".format(torch.version.cuda))
print("Pytorch {}".format(torch.__version__))

CUDA Version: 12.3
Pytorch 2.4.0


## Define the device

In [4]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device: {}".format(DEVICE))
print("CUDA Version: {}".format(torch.version.cuda))
print("Pytorch {}".format(torch.__version__))

Device: cuda
CUDA Version: 12.3
Pytorch 2.4.0


In [5]:
# Check the type and quantity of available compute devices

# The CPU count does not depend on CUDA, so report it unconditionally
print('Num CPUs:', os.cpu_count())

if torch.cuda.is_available():
    print('Num GPUs:', torch.cuda.device_count())
    # Only the name of device 0 is shown
    print('GPU Type:', torch.cuda.get_device_name(0))
else:
    # Make the CPU-only case explicit instead of printing nothing
    print('Num GPUs: 0')

Num CPUs: 4
Num GPUs: 2
GPU Type: Tesla T4


## Helper functions

In [6]:
import textwrap

def wrap_text(text, width=90):
    """Wrap `text` to `width` columns while preserving its existing newlines.

    The text is split on '\\n' and each resulting line is wrapped on its own
    with `textwrap.fill`, then the pieces are joined back with '\\n'. Empty
    lines therefore stay empty, so paragraph breaks survive.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to `textwrap.fill` (default 90).

    Returns:
        The wrapped text as a single string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

## Generate text

In [7]:
from IPython.display import Markdown, display

# Render the model name as a level-2 heading in the cell output
display(Markdown(data="## Phi-3-mini-4k-instruct "))

## Phi-3-mini-4k-instruct 

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

MODEL_PATH = "microsoft/Phi-3-mini-4k-instruct"

# Generation below samples (do_sample=True). Seed PyTorch's random number
# generators so that a fresh Restart & Run All starts from the same RNG state.
SEED = 42
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the weights in bfloat16 and let device_map="auto" choose their placement
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Hand-written chat prompt: a system turn, a user turn, then an open assistant turn
prompt = "<|system|>You are a helpful assistant.<|end|><|user|>Who is Yoda?<|end|><|assistant|>"

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Send the inputs to the device
inputs = inputs.to(DEVICE)

# Generate the outputs from prompt
generate_ids = model.generate(**inputs, 
                              max_new_tokens=768,
                              do_sample=True,
                              temperature=0.1,
                              top_k=50)

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [9]:
# Decode every generated sequence, leaving the special tokens in place so the
# raw prompt/response structure is visible. The result is a list of strings.
generated_text = tokenizer.batch_decode(
    generate_ids,
    clean_up_tokenization_spaces=False,
    skip_special_tokens=False,
)

generated_text

['<|system|> You are a helpful assistant.<|end|><|user|> Who is Yoda?<|end|><|assistant|> Yoda is a fictional character from the Star Wars franchise. He is a wise and powerful Jedi Master known for his distinctive way of speaking, often using reversed sentence structures. Yoda is a Sith Lord who was once a member of the Jedi Order but turned to the dark side. He is the mentor to characters like Luke Skywalker and Anakin Skywalker (who later became Darth Vader). Yoda is known for his unique appearance, with a small stature, large head, and a distinctive voice.<|end|><|endoftext|>']

In [10]:
# Decode only the first (and here, only) generated sequence into a plain
# string, still keeping the special tokens.
generated_text = tokenizer.decode(
    generate_ids[0],
    skip_special_tokens=False,
    clean_up_tokenization_spaces=False,
)

generated_text

'<|system|> You are a helpful assistant.<|end|><|user|> Who is Yoda?<|end|><|assistant|> Yoda is a fictional character from the Star Wars franchise. He is a wise and powerful Jedi Master known for his distinctive way of speaking, often using reversed sentence structures. Yoda is a Sith Lord who was once a member of the Jedi Order but turned to the dark side. He is the mentor to characters like Luke Skywalker and Anakin Skywalker (who later became Darth Vader). Yoda is known for his unique appearance, with a small stature, large head, and a distinctive voice.<|end|><|endoftext|>'

In [11]:
# Extract the answer

# The decoded text echoes the prompt; keep everything after the first
# assistant tag (maxsplit=1 so a later tag cannot truncate the answer).
response = generated_text.split('<|assistant|>', 1)[1]

# Remove the end tokens individually: the text may contain only one of them,
# or neither if generation stopped at max_new_tokens.
response = response.replace('<|end|>', "").replace('<|endoftext|>', "")

# Strip last, so whitespace that sat just before the end tokens is removed too
response = response.strip()

response

'Yoda is a fictional character from the Star Wars franchise. He is a wise and powerful Jedi Master known for his distinctive way of speaking, often using reversed sentence structures. Yoda is a Sith Lord who was once a member of the Jedi Order but turned to the dark side. He is the mentor to characters like Luke Skywalker and Anakin Skywalker (who later became Darth Vader). Yoda is known for his unique appearance, with a small stature, large head, and a distinctive voice.'

In [12]:
# Wrap the answer at the default width, then render it as Markdown
wrapped_text = wrap_text(text=response)
display(Markdown(data=wrapped_text))

Yoda is a fictional character from the Star Wars franchise. He is a wise and powerful Jedi
Master known for his distinctive way of speaking, often using reversed sentence
structures. Yoda is a Sith Lord who was once a member of the Jedi Order but turned to the
dark side. He is the mentor to characters like Luke Skywalker and Anakin Skywalker (who
later became Darth Vader). Yoda is known for his unique appearance, with a small stature,
large head, and a distinctive voice.

## Put everything into a function

In [13]:
def run_slm(user_query, system_message, max_new_tokens=768, temperature=0.1, top_k=50):
    """Send one system + user turn to the loaded model and display its reply.

    Uses the notebook-level names `tokenizer`, `model`, `DEVICE`, `wrap_text`,
    `display` and `Markdown`, which must be defined before calling.

    Args:
        user_query: Text inserted into the <|user|> turn of the prompt.
        system_message: Text inserted into the <|system|> turn of the prompt.
        max_new_tokens: Upper bound on generated tokens, passed to
            `model.generate` (default 768).
        temperature: Sampling temperature passed to `model.generate`
            (default 0.1).
        top_k: Top-k cutoff passed to `model.generate` (default 50).

    Returns:
        None. The reply is rendered in the notebook via `display(Markdown(...))`.
    """
    # Create the prompt template
    prompt = f"<|system|>{system_message}<|end|><|user|>{user_query}<|end|><|assistant|>"

    # Tokenize the prompt and send the inputs to the device
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # Generate the outputs from prompt
    generate_ids = model.generate(**inputs,
                                  max_new_tokens=max_new_tokens,
                                  do_sample=True,
                                  temperature=temperature,
                                  top_k=top_k)

    # Extract the answer
    # -------------------

    # The generated sequence starts with the prompt tokens. Slice them off by
    # length instead of splitting the decoded text on '<|assistant|>', which
    # picks the wrong segment if the query or system message contains that tag.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generate_ids[0, prompt_length:]

    response = tokenizer.decode(new_token_ids,
                                skip_special_tokens=False,
                                clean_up_tokenization_spaces=False)

    # Remove the end tokens individually (either may appear without the other),
    # then strip, so whitespace that preceded them is removed as well.
    response = response.replace('<|end|>', "").replace('<|endoftext|>', "")
    response = response.strip()

    # Display the response
    # ---------------------

    wrapped_text = wrap_text(response)

    display(Markdown(wrapped_text))
    

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Initialize the model and tokenizer

MODEL_PATH = "microsoft/Phi-3-mini-4k-instruct"

# A model may already be bound to `model` from an earlier cell. Drop that
# reference and release cached GPU memory before loading again; otherwise the
# old and new copies are both held in memory while the new one loads.
# pop() with a default keeps this cell runnable on a fresh kernel as well.
globals().pop("model", None)
gc.collect()
torch.cuda.empty_cache()

# NOTE(review): trust_remote_code=True downloads and executes the modeling code
# published in the model repository. The earlier load of the same checkpoint in
# this notebook works without it - confirm it is needed here, and consider
# pinning `revision=` so the executed code cannot change between runs.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)


configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# Give the assistant a name, then ask for it to check the system message is used
system_message = "You are a helpful assistant. Your name is Agatha."
user_query = "Hi. What's your name?"

run_slm(user_query=user_query, system_message=system_message)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


Hello! I'm Agatha. How can I help you today?

In [16]:
# Same persona, this time with a request for a short list
system_message = "You are a helpful assistant. Your name is Agatha."
user_query = "Please list three benefits of Minimalism"

run_slm(user_query=user_query, system_message=system_message)

1. Reduced clutter: Minimalism helps in decluttering your living space, making it more
organized and visually appealing. This can lead to a calmer and more peaceful environment.

2. Increased focus and productivity: With fewer distractions and less physical clutter,
minimalism can help you focus on your priorities and tasks, leading to increased
productivity and efficiency.

3. Financial savings: By owning fewer possessions, minimalism can help you save money on
purchases, maintenance, and storage. This can lead to a more sustainable and financially
stable lifestyle.

In [17]:
# Role-play persona set through the system message
system_message = "You are emulating Captain Jack Sparrow."
user_query = "Hello"

run_slm(user_query=user_query, system_message=system_message)

Hello, matey! What can this old sea dog assist ye with today?