# LLM with Web Search and Crawl

Code that crawls the top-n pages of a Google search result and passes their content to an LLM so that it can answer with rich context.



In [1]:
import re
import requests
import sys
import os
from openai import AzureOpenAI
import tiktoken
from dotenv import load_dotenv
# Load settings from a .env file; override=True asks python-dotenv to replace
# variables that are already set in the process environment.
load_dotenv(override=True)

# Shared Azure OpenAI client; endpoint and key come from the environment.
client = AzureOpenAI(
    api_version="2024-08-01-preview",
    api_key=os.environ.get("AZURE_OPENAI_KEY"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
)

# Deployment name passed as `model` to the chat-completions calls below.
CHAT_COMPLETIONS_MODEL = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")

bs4 or scrapy?

In [None]:
import requests
import json
import scrapy
from bs4 import BeautifulSoup
import httpx
import asyncio
from urllib.parse import urljoin
from azure.ai.projects.models import MessageRole, BingGroundingTool
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential

# Google Custom Search credentials, read by google_search().
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID")

# Bing Grounding settings, read by bing_grounding_search().
BING_GROUNDING_PROJECT_CONNECTION_STRING = os.environ.get(
    "BING_GROUNDING_PROJECT_CONNECTION_STRING"
)
BING_GROUNDING_AGENT_ID = os.environ.get("BING_GROUNDING_AGENT_ID")
BING_GROUNDING_AGENT_MODEL_DEPLOYMENT_NAME = os.environ.get(
    "BING_GROUNDING_AGENT_MODEL_DEPLOYMENT_NAME"
)
BING_GROUNDING_CONNECTION_NAME = os.environ.get("BING_GROUNDING_CONNECTION_NAME")

# Search backend selector: "bing" routes web_search() to Bing Grounding; any
# other value (including unset) falls through to Google.
WEB_SEARCH_MODE = os.environ.get("WEB_SEARCH_MODE")

def extract_text_and_tables_by_bs4(url, timeout=10):
    """Fetch *url* synchronously and return the text of its ``<p>`` elements.

    Despite the name, only paragraph text is extracted; tables are not parsed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before ``requests`` gives up.
            Without a timeout ``requests.get`` may block indefinitely.

    Returns:
        The non-empty, stripped paragraph texts joined with newlines
        (an empty string when the page has no such paragraphs).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    # Strip each paragraph once, then drop the ones that end up empty.
    stripped = (p.get_text().strip() for p in soup.find_all("p"))
    return "\n".join(text for text in stripped if text)


async def extract_text_and_tables_async(url):
    """Download *url* asynchronously and return the text nodes of its ``<p>`` tags.

    The request uses a browser-like User-Agent, a 3-second timeout and
    automatic redirect following. If the response still carries status 302
    with a ``location`` header, that target is requested once more by hand.

    Returns:
        Stripped, non-empty ``p::text`` fragments joined with newlines, or
        ``""`` when the request (or the manual redirect) fails with an
        ``httpx.HTTPError``.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    async with httpx.AsyncClient(timeout=3, follow_redirects=True) as http:
        try:
            response = await http.get(url, headers=request_headers)
            response.raise_for_status()
        except httpx.HTTPStatusError as status_error:
            failed = status_error.response
            is_redirect = failed.status_code == 302 and "location" in failed.headers
            if not is_redirect:
                print(f"Request failed: {status_error}")
                return ""

            # Follow the 302 by hand; relative targets are resolved against url.
            target = failed.headers["location"]
            if not target.startswith("http"):
                target = urljoin(url, target)
            try:
                response = await http.get(target, headers=request_headers)
                response.raise_for_status()
            except Exception as redirect_error:
                print(f"Redirect request failed: {redirect_error}")
                return ""
        except httpx.HTTPError as request_error:
            print(f"Request failed: {request_error}")
            return ""

        page = scrapy.Selector(text=response.text)
        fragments = (node.strip() for node in page.css('p::text').getall())
        return "\n".join(fragment for fragment in fragments if fragment)

async def add_context_async(top_urls=None):
    """Fetch the paragraph text of every URL in *top_urls* concurrently.

    Args:
        top_urls: Iterable of page URLs. ``None`` or an empty collection
            yields an empty result without scheduling any request.

    Returns:
        A list with one extracted-text string per URL, in the same order as
        *top_urls*.
    """
    # None replaces the former mutable default list.
    if not top_urls:
        return []
    return await asyncio.gather(
        *(extract_text_and_tables_async(url) for url in top_urls)
    )

def google_search(query, num=5, search_type="web"):
    """Query the Google Custom Search JSON API and return its result items.

    Args:
        query: Search terms, sent as ``q``.
        num: Number of results requested from the API.
        search_type: Accepted only so the signature matches
            ``bing_grounding_search``; it is not sent to Google.

    Returns:
        The ``items`` list of the JSON response, or ``[]`` when the response
        has no ``items`` key.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        "q": query,
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CSE_ID,
        "num": num,
        # Intended to make the search Korean.
        # NOTE(review): the documented language parameters appear to be
        # `lr` / `hl` / `gl`; confirm that `locale` has any effect.
        "locale": "ko",
        "siteSearch": "samsung.com",
        # NOTE(review): per the Custom Search API reference "e" EXCLUDES the
        # siteSearch domain and "i" includes it - confirm this is intended.
        "siteSearchFilter": "e",
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=params,
        timeout=10,
    )
    return response.json().get("items", [])

def bing_grounding_search(query, num=5, search_type="web"):
    """Search the web through an Azure AI agent equipped with Bing Grounding.

    When ``BING_GROUNDING_AGENT_ID`` is set, that agent is reused. Otherwise a
    temporary agent is created for this call and deleted again before
    returning, including when the run fails or an exception is raised.

    Args:
        query: Search terms embedded in the message sent to the agent.
        num: Number of results the agent is asked to return.
        search_type: Currently unused.

    Returns:
        A list of ``{"link": ..., "title": ...}`` dicts built from the URL
        citations of the agent's reply (the same keys the Google path is read
        with). ``[]`` is returned on any failure; errors are printed, not
        raised.
    """
    try:
        project_client = AIProjectClient.from_connection_string(
            credential=DefaultAzureCredential(),
            conn_str=BING_GROUNDING_PROJECT_CONNECTION_STRING,
        )

        # Only an agent created by this call is cleaned up afterwards.
        owns_agent = not BING_GROUNDING_AGENT_ID
        if owns_agent:
            print("BING_GROUNDING_AGENT_ID is not set. Create new agent...")
            bing_connection = project_client.connections.get(
                connection_name=BING_GROUNDING_CONNECTION_NAME,
            )
            bing = BingGroundingTool(connection_id=bing_connection.id)
            agent = project_client.agents.create_agent(
                model=BING_GROUNDING_AGENT_MODEL_DEPLOYMENT_NAME,
                name="temporary-bing-agent",
                instructions="You are a helpful assistant that searches the web",
                tools=bing.definitions,
                headers={"x-ms-enable-preview": "true"}
            )
            print(f"New agent created. Agent ID: {agent.id}")
        else:
            print(f"Existing agent ID: {BING_GROUNDING_AGENT_ID}")
            try:
                agent = project_client.agents.get_agent(BING_GROUNDING_AGENT_ID)
            except Exception as agent_error:
                print(f"Failed to retrieve agent: {agent_error}")
                return []

        try:
            thread = project_client.agents.create_thread()

            message = project_client.agents.create_message(
                thread_id=thread.id,
                role="user",
                content=f"Search the web for: {query}. Return only the top {num} most relevant results as a list.",
            )
            print(f"Message created, ID: {message.id}")

            run = project_client.agents.create_and_process_run(
                thread_id=thread.id, agent_id=agent.id
            )
            if run.status == "failed":
                print(f"Execution failed: {run.last_error}")
                return []
            print(f"Run completed successfully. Status: {run.status}")

            results = []
            response_message = project_client.agents.list_messages(
                thread_id=thread.id
            ).get_last_message_by_role(MessageRole.AGENT)

            # A missing agent reply simply yields no results.
            if response_message and response_message.url_citation_annotations:
                # Echo the textual answer for inspection.
                if response_message.content:
                    for content_item in response_message["content"]:
                        if content_item["type"] == "text":
                            text_content = content_item["text"]["value"]
                            print("Extracted Text Content:")
                            print(text_content)
                for annotation in response_message.url_citation_annotations:
                    if annotation["type"] == "url_citation":
                        url_citation = annotation["url_citation"]
                        # Same keys as the Google result items.
                        results.append(
                            {"link": url_citation["url"], "title": url_citation["title"]}
                        )
            return results
        finally:
            # Runs on every exit path so a temporary agent is never leaked.
            if owns_agent:
                try:
                    project_client.agents.delete_agent(agent.id)
                except Exception as delete_error:
                    print(f"Error deleting agent: {delete_error}")
    except Exception as e:
        print(f"Bing Grounding error : {e}")
        return []

def web_search(query, num=5, search_type="web"):
    """Run a web search with the backend selected by ``WEB_SEARCH_MODE``.

    ``"bing"`` uses Bing Grounding; every other value (including unset) uses
    the Google Custom Search API.

    Args:
        query: Search terms.
        num: Number of results to request.
        search_type: Forwarded to the Bing backend only.

    Returns:
        A list of result dicts that each carry a ``link`` key; ``[]`` when the
        Bing backend raises.
    """
    if WEB_SEARCH_MODE == "bing":
        print(f"Bing Grounding 검색 사용: {query}")
        try:
            return bing_grounding_search(query, num, search_type)
        except Exception as e:
            print(f"Bing Grounding 검색 중 오류 발생: {e}")
            # Return an explicit empty list instead of falling through to None.
            return []

    print(f"Google Search API 사용: {query}")
    # Pass only query and num so the call fits a two-parameter google_search.
    return google_search(query, num)

       
# System prompt (Korean) that asks the model to rewrite a user question twice:
# once as a keyword-style web-search query and once as a fuller LLM question,
# returned as JSON with the keys "web_search" and "llm_query".
# The text holds a `{user_query}` placeholder AND literal JSON braces in the
# output-format example, so `str.format` would raise on it; substitute the
# placeholder with `str.replace` instead.
QUERY_REWRITE_PROMPT = """
            <<지시문>>
            너는 구글 검색과 LLM 질의 최적화 전문가야. 사용자가 입력한 질문을 두 가지 목적에 맞게 재작성해.

            1. Web Search용 Query Rewrite:
            - 사용자의 질문을 실제 검색 엔진 검색창에 입력할 수 있도록, 명확하고 간결한 핵심 키워드 중심의 검색어로 재작성해.
            - 불필요한 문장, 맥락 설명은 빼고, 검색에 최적화된 형태로 만들어.
            - 핵심 키워드를 반복적으로 사용해 검색의 정확도를 높여.

            2. LLM Query용 Rewrite:
            - 사용자의 질문을 LLM이 더 잘 이해하고 답변할 수 있도록, 맥락과 의도를 명확히 드러내는 자연스러운 문장으로 재작성해.
            - 필요한 경우 추가 설명이나 세부 조건을 포함해서 질문의 목적이 분명히 드러나도록 만들어.
            - LLM이 답변에 집중할 수 있도록 핵심 단어를 반복 사용해.

            <<예시>>
            * 질문: 삼성전자 제품 중 2구 말고 다른 인덕션 추천해줘
            * 웹 검색용 재작성: 삼성전자 3구 이상 인덕션 추천
            * LLM 답변용 재작성: 삼성전자 인덕션 중 2구 모델이 아닌, 3구 이상 또는 다양한 화구 수를 가진 다른 인덕션 제품을 추천해 주세요. 각 모델의 주요 기능과 장점도 함께 알려주세요.

            <<질문>>
            {user_query}

            <<출력포맷>>
            반드시 아래와 같이 json 형식으로 출력해.
            {"web_search": "웹 검색용 재작성", "llm_query": "LLM 답변용 재작성"}
        """
  
def rewrite_query_for_search_and_llm(query, client: AzureOpenAI):
    """Rewrite *query* into a web-search query and an LLM-oriented question.

    Args:
        query: The user's original question.
        client: Azure OpenAI client used for the chat-completions call.

    Returns:
        A dict with at least the keys ``"web_search"`` and ``"llm_query"``.
        When the model's reply is empty, is not a JSON object (for example
        because it was cut off by ``max_tokens``), or lacks one of the keys,
        the original *query* is used for the missing value so that callers
        can always index both keys.
    """
    # str.replace, not str.format: the prompt also contains literal JSON
    # braces that str.format would try to interpret.
    system_prompt = QUERY_REWRITE_PROMPT.replace("{user_query}", query)

    response = client.chat.completions.create(
        model=CHAT_COMPLETIONS_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        temperature=0.8,
        max_tokens=300,
        response_format={"type": "json_object"},
    )

    content = response.choices[0].message.content
    try:
        rewritten = json.loads(content.strip()) if content else {}
    except json.JSONDecodeError as parse_error:
        print(f"Query rewrite returned invalid JSON: {parse_error}")
        rewritten = {}
    if not isinstance(rewritten, dict):
        rewritten = {}

    # Fall back to the original question for anything the model left out.
    for key in ("web_search", "llm_query"):
        if not rewritten.get(key):
            rewritten[key] = query
    return rewritten







In [1]:
from IPython.display import Markdown, display
from datetime import datetime
import time

# Number of search results requested per question; each result's page is
# crawled for context.
RESULTS_COUNT = 3

# Sample user questions (Korean, about Samsung products) that the driver loop
# below feeds through process_web_search_call.
inputs = [
    "삼성전자 제품 중 2구 말고 다른 인덕션 추천해줘",
    "부모님에게 선물하고 싶은데 삼성전자 TV 추천해줘",
    "삼성전자 25년 제품이 작년 대비 좋아진것은",
    "삼성전자 JBL과 하만카돈 차이점이 뭐야",
    "갤럭시 버즈 이어버드 한쪽을 새로 구매했는데 페어링 어떻게 하나요",
    "삼성전자 S25 무게가 S24와 비교 했을때 얼마나 차이나"
]

# TODO: handle weather, news and other specific information via function calling
# inputs = ["날씨, 뉴스"] ##

async def process_web_search_call(RESULTS_COUNT, input):
    """Answer one user question with web-search context and display the reply.

    Steps: rewrite the question, run the web search, crawl the result pages,
    build a prompt containing today's date and the crawled text, call the chat
    model, render the answer as Markdown and print the elapsed time.

    Args:
        RESULTS_COUNT: Number of search results to request.
        input: The user's original question.
    """
    started_at = time.time()

    print(f"Original Input: {input}")

    rewritten = rewrite_query_for_search_and_llm(input, client)
    search_query = rewritten['web_search']
    print(f"Web Search Query: {search_query}")
    llm_query = rewritten['llm_query']
    print(f"LLM Query: {llm_query}")

    hits = web_search(search_query, RESULTS_COUNT)
    if not (isinstance(hits, list) and hits):
        print("No results found or invalid response from web_search.")
        contexts = []
    else:
        print(f"Web Search Results: {len(hits)}")
        contexts = await add_context_async([hit["link"] for hit in hits])

    # Today's date goes into the prompt so the model favours recent data.
    today = datetime.now()

    system_prompt = "너는 삼성전자 제품 관련 정보를 제공하는 챗봇이야. 답변은 마크다운으로 이모지를 1~2개 포함해서 작성해줘."
    user_prompt = f"""
        너는 아래 제공하는 웹검색색에서 검색한 컨텍스트를 바탕으로 질문에 대한 답변을 제공해야 해. 컨텍스트를 최대한 활용하여 풍부하게 답변을 해야해. 
        현재는 {today.year}년 {today.month}월 {today.day}일이므로 최신의 데이터를 기반으로 답변을 해줘.
        구글에서 제공한 컨텍스트: {contexts}
        질문: {llm_query}
        """

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model=CHAT_COMPLETIONS_MODEL,
        messages=chat_messages,
        top_p=0.9,
        max_tokens=1500,
    )

    answer = response.choices[0].message.content
    display(Markdown(answer))
    print(f"elapsed time: {time.time() - started_at:.2f} seconds")

for input in inputs:
    await process_web_search_call(RESULTS_COUNT, input)

Original Input: 삼성전자 제품 중 2구 말고 다른 인덕션 추천해줘


NameError: name 'rewrite_query_for_search_and_llm' is not defined