<a href="https://colab.research.google.com/github/tahmidefaz/GPT-browsing-plugin/blob/main/notebooks/GPT_browsing_plugin_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT Browsing Plugin

### A browsing plugin for Hugging Face-based chat LLMs

In [None]:
#@title Install Dependencies

!pip install accelerate>=0.12.0 transformers[torch]==4.25.1
!pip install newspaper3k
!pip install wikipedia

In [2]:
#@title Download Language Model
#@markdown > *Loading the smallest Dolly 2.0 model (3B) by default. I found it very capable with the browsing plugin.
#@markdown I was not able to try out other similar models because of GPU memory restrictions.*

import torch
from transformers import pipeline

language_model = "databricks/dolly-v2-3b" #@param{type: 'string'}
# Build the text-generation pipeline used by the question-answering cell.
# NOTE: trust_remote_code=True lets code shipped with the model repository run
# locally, so only point language_model at repositories you trust.
generate_text = pipeline(
    model=language_model,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)


In [3]:
#@title Setup Search Engine
#@markdown ##### Setting up the search engine and the page scrapers here

import requests
from bs4 import BeautifulSoup

import requests
from newspaper import Article

def _resolve_result_url(href):
    """Turn the href of a DuckDuckGo result link into a directly fetchable URL.

    Protocol-relative links ('//host/path') are given an https scheme, and
    DuckDuckGo redirect links ('.../l/?uddg=<target>') are unwrapped to the
    target they point at. Any other href is returned unchanged.
    """
    from urllib.parse import parse_qs, urlparse

    if href.startswith('//'):
        href = 'https:' + href
    parsed = urlparse(href)
    on_ddg = not parsed.netloc or parsed.netloc.endswith('duckduckgo.com')
    if on_ddg and parsed.path.rstrip('/') == '/l':
        # parse_qs also percent-decodes the wrapped target URL.
        targets = parse_qs(parsed.query).get('uddg')
        if targets:
            return targets[0]
    return href


def search_ddg(query):
    """Search DuckDuckGo's HTML endpoint and return the parsed results.

    Args:
        query: The search terms to submit.

    Returns:
        A list of dicts with 'title', 'url' and 'snippet' keys, in the order
        the results appear on the page. The list is empty when the request
        fails or no result could be parsed.
    """
    endpoint = 'https://duckduckgo.com/html/'
    params = {
        'q': query,
        's': '0',
        'nextParams': '',
        'v': 'l',
        'o': 'json',
        'dc': 'us-en'
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # The timeout keeps an unresponsive server from hanging the notebook cell.
        res = requests.get(endpoint, params=params, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as e:
        # Same best-effort style as read_page: report the problem, return nothing.
        print("exception querying search engine:", e)
        return []

    soup = BeautifulSoup(res.text, 'html.parser')
    results = []
    for result in soup.select('div.result'):
        title_tag = result.select_one('.result__title')
        link_tag = result.select_one('a.result__url')
        snippet_tag = result.select_one('.result__snippet')
        # Some result blocks lack a title or link; skip them instead of
        # crashing on a missing element.
        if title_tag is None or link_tag is None or not link_tag.get('href'):
            continue
        results.append({
            # Strip the whitespace/newlines that surround the text in the markup.
            'title': title_tag.get_text().strip(),
            'url': _resolve_result_url(link_tag['href']),
            'snippet': snippet_tag.get_text().strip() if snippet_tag is not None else '',
        })
    return results

def read_page(url, max_chars=2000):
  """Download a web page and return the beginning of its article text.

  Args:
    url: Address of the page to read.
    max_chars: Maximum number of characters of article text to return.
      Defaults to 2000; pass None to return the whole text.

  Returns:
    At most `max_chars` characters of the extracted article text, or '' when
    the page could not be downloaded or parsed.
  """
  try:
    print('Reading page')
    article = Article(url)
    article.download()
    article.parse()
    return article.text[:max_chars]
  except Exception as e:
    # Scraping is best-effort: any failure is reported and treated as an empty
    # page so the caller can move on to the next search result.
    print("exception reading page:", e)
    return ''

# results = search_ddg('fusion breakthrough')
# for result in results[:3]:
#     print(result['title'])
#     # print(result['url'])
#     print(result['snippet'])
#     print()



### Browsing Parameters Explanation
`search_topic` - The search topic. This is different from the user questions. This can be things like "Trappist-1 planets", "SVB collapse" etc.

`show_search_results` - When checked, prints out the parsed content that is passed to the LLM.

`snippets_to_use` - The number of search snippets to use for content when `open_links` is unchecked. The model seems to find the snippets very useful for answering questions.

`open_links` - Whether or not to click on search engine results and scrape content from the different pages.

`links_to_open` - The number of pages to click through for content when `open_links` is checked.

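`max_page_chars` - Approximate upper limit on the total number of characters of page content passed to the LLM; anything beyond it is truncated. The budget is split evenly across the `links_to_open` pages. Large values may not fit in GPU memory and may make the LLM's responses slower.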

`use_wikipedia` - Only uses Wikipedia for the `search_topic` when checked.

`max_wiki_article_chars` - Similar to `max_page_chars`, but for Wikipedia articles.

In [30]:
#@title Fetch Search Results

#@markdown ###**Parameters**
search_results = '<<Search Results>>\n\n'

search_topic = 'Marty McFly' #@param{type: 'string'}

show_search_results = False #@param{type: 'boolean'}

#@markdown ### Search Engine options
snippets_to_use = 5 #@param{type: 'number'}

open_links = True #@param{type: 'boolean'}
links_to_open = 2 #@param{type: 'number'}
max_page_chars = 1800 #@param{type: 'number'}

#@markdown ### Wikipedia options
use_wikipedia = False #@param{type: 'boolean'}
max_wiki_article_chars = 2000 #@param{type: 'number'}

if use_wikipedia:
  # Imported only on this path, so the search-engine path does not need the
  # wikipedia package to be loaded.
  import wikipedia
  from wikipedia.exceptions import DisambiguationError, PageError

  try:
    # Get a WikipediaPage object for a page
    page = wikipedia.page(search_topic)

    search_results += f'{page.title}\n{page.content}'
    # The limit applies to the whole context string, header included.
    search_results = search_results[:max_wiki_article_chars]
    print(f"Read Wikipedia article titled: {page.title}")
  except DisambiguationError as e:
    # The topic is ambiguous: show the candidate titles so the user can refine it.
    print(e.options)
  except PageError as e:
    # No article matches the topic.
    print(e)
else:
  results = search_ddg(search_topic)
  if not results:
    print("No search results found for:", search_topic)
  elif open_links and links_to_open < 1:
    # Without this guard the per-page budget below would divide by zero.
    print("links_to_open must be at least 1 to open pages; no pages were read.")
  elif open_links:
    # Each opened page gets an equal share of the overall character budget.
    target_content_length = max_page_chars // links_to_open
    opened_pages = 0
    for result in results:
      if opened_pages >= links_to_open:
        break
      page_title = result['title']
      print("Clicked on:", page_title)
      page_content = read_page(result['url'])
      print("Finished reading page")
      # Pages that cannot fill their share (including pages that failed to
      # load) are skipped, and the next search result is tried instead.
      if len(page_content) < target_content_length:
        print("not enough content in page, skipping...\n")
        continue
      print("\n")
      search_results += f'{opened_pages+1}. [TITLE]{page_title} [PAGE CONTENT] {page_content[:target_content_length]}\n'

      opened_pages += 1
  else:
    snippets = results[:snippets_to_use]
    for i, result in enumerate(snippets):
      title = result['title']
      snippet = result['snippet']
      search_results += f'{i+1}. [TITLE]{title}[SNIPPET] {snippet}\n'
    # Report how many snippets were actually read, which can be fewer than requested.
    print(f"Read {len(snippets)} search engine snippets.")

if show_search_results:
  print("Search Result/Article length:", len(search_results))
  print(search_results)

# Fetching new search results starts a fresh conversation.
chat_log = []

Clicked on: 
Marty McFly - Wikipedia

Reading page
Finished reading page


Clicked on: 
Back to the Future (1985) - Michael J. Fox as Marty McFly - IMDb

Reading page
Finished reading page
not enough content in page, skipping...

Clicked on: 
Michael J. Fox - Wikipedia

Reading page
Finished reading page




In [None]:
#@title Ask Questions

system_str = "<<SYSTEM>>\nYou are a search assistant that reads the returned search results and your own knowledge to answer the user queries. If you use the search result, make sure to reference it using a superscript. Like ^2, ^3, ^1 etc."
user_query = "Who is Marty McFly?" #@param{type: 'string'}

# chat_log alternates user, assistant, user, ... so entries at even positions
# are user messages and entries at odd positions are assistant replies.
chat_history = ''.join(
  f'<<Assistant>>\n{message}\n' if turn % 2 else f'<<User>>\n{message}\n'
  for turn, message in enumerate(chat_log)
)

# Prompt layout: system instructions, fetched search results, the conversation
# so far, then the new question.
prompt = f"{system_str}\n\n{search_results}\n\n{chat_history}<<User>>\n{user_query}"
# print(prompt)

res = generate_text(prompt)
assistant_output = res[0]["generated_text"]

# Record this exchange so the next run of the cell includes it as history.
chat_log.extend([user_query, assistant_output])

# Print the whole conversation, newest exchange last.
for turn, message in enumerate(chat_log):
  if turn % 2:
    print(f"[ASSISTANT]\n{message}\n")
  else:
    print(f"[USER]\n{message}\n")
