<a href="https://colab.research.google.com/github/vrangayyan6/GenAI/blob/main/DeepSeek_R1_Google_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hugging Face DeepSeek R1 grounded with Google Search results

Using DeepSeek-R1-Distill-Qwen-7B model from Hugging Face and 4-bit quantization

Reference:
- https://huggingface.co/deepseek-ai/DeepSeek-R1#deepseek-r1-distill-models
- https://colab.research.google.com/drive/1r9GOGEmjtZZbcCXjiwNHj0jsMFp0l8Ok?usp=sharing

In [1]:
# ### Install the needed libraries
# Avoiding versions mismatch
!pip install -q accelerate==0.29.3
!pip install -q bitsandbytes==0.43.1
!pip install -q trl==0.8.6
!pip install -q peft==0.10.0
!pip install -q transformers==4.40.0

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from google.colab import userdata
from pprint import pprint

In [3]:
# The model that you want to train from the Hugging Face hub
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"


################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type to use
bnb_4bit_quant_type = "nf4"

# Activate double quantization
use_nested_quant = False

In [4]:
device_map="auto"   # Automatically distribute layers across GPUs


# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [5]:
hf_token = userdata.get('HUGGING_FACE_TOKEN')  # add your huggingface token in Colab Secrets (left menu). Login to huggingface, and get your access token

# Load the model

In [6]:
# Load DeepSeek model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    use_auth_token=hf_token  # Pass your Hugging Face token here
)
model.config.use_cache = False
model.config.pretraining_tp = 1



config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [7]:
# Load Qwen tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                          use_auth_token=hf_token)

# Set them properly: Qwen
#tokenizer.pad_token = tokenizer.eos_token



tokenizer_config.json:   0%|          | 0.00/3.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
#this function returns the outputs from the model received, and inputs.
def get_outputs(model, inputs, max_new_tokens=32768):
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.1,
        early_stopping=True, #Can stop before reach the max_length
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=False
    )
    return outputs

In [9]:
def query_model(system, user_input, max_new_tokens = 32768):
  messages = [{"role": "system",
                "content": system},
                {"role": "user",
                "content": user_input}
                ]

  prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

  #Inference original model
  model_input = tokenizer(prompt, return_tensors="pt").to('cuda')

  foundational_outputs_sentence = get_outputs(model, model_input, max_new_tokens=max_new_tokens)
  output = tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True)
  return output

# Your inputs

Provide the system prompt, your prompt and use Google search

In [10]:
system_prompt = "you are a helpful teacher"
user_prompt = "Explain what is Google transformer to a 16 year old."
use_google_search = True  # Set to False if you don't want to use Google search

# Get Google search results and prompt model

In [11]:
!pip install -q requests googlesearch-python

In [12]:
from googlesearch import search
import requests
from bs4 import BeautifulSoup


In [13]:
def get_search_results(query, num_results=5):
    results = []
    for j in search(query, num_results=num_results):
        results.append(j)
    return results

In [14]:
def get_webpage_content(url):
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()[:10000]  # Get first 10,000 characters
    except:
        return ""

In [15]:
def generate_grounded_content(system_prompt, prompt):
    # Get search results
    search_results = get_search_results(prompt)

    # Fetch content from search results
    search_contents = [f"Source {i+1}: {get_webpage_content(url)}" for i, url in enumerate(search_results)]

    # Combine prompt with search contents
    grounded_prompt = f"""
    Based on the following information, please answer the question or respond to the prompt:
    Question/Prompt: {prompt}

    Information from search:
    {' '.join(search_contents)}

    Please provide a response that incorporates information from these sources, and include citations in the format [Source X] where X is the source number.
    """

    # Query the model
    response = query_model(system_prompt, grounded_prompt)

    return response, search_results

In [16]:
if use_google_search:
  response, sources = generate_grounded_content(system_prompt, user_prompt)
  print("\nResponse:")
  pprint(response)

  print("\nSources:")
  for i, source in enumerate(sources, 1):
    print(f"[Source {i}] {source}")
else:
  response = query_model(system_prompt, user_prompt)
  print("\nResponse:")
  pprint(response)





Response:
['you are a helpful teacher<｜User｜>\n'
 '    Based on the following information, please answer the question or '
 'respond to the prompt:\n'
 '    Question/Prompt: Explain what is Google transformer to a 16 year old.\n'
 '\n'
 '    Information from search:\n'
 '    Source 1: \n'
 '\n'
 '\n'
 '\n'
 'Transformer (deep learning architecture) - Wikipedia\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 'Jump to content\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 'Main menu\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 'Main menu\n'
 'move to sidebar\n'
 'hide\n'
 '\n'
 '\n'
 '\n'
 '\t\tNavigation\n'
 '\t\n'
 '\n'
 '\n'
 'Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\t\tContribute\n'
 '\t\n'
 '\n'
 '\n'
 'HelpLearn to editCommunity portalRecent changesUpload fil