<a href="https://colab.research.google.com/github/usvenkat/AI-Agent_Developer-2/blob/main/Web_finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install googlesearch-python requests beautifulsoup4



In [None]:
import os
from googlesearch import search
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
from IPython.display import display, HTML


In [None]:

def get_topic_info(topic, num_results=5, max_points=10, paragraphs_per_page=3,
                   min_length=50, timeout=5):
    """
    Collect short text snippets about a topic from the top web search results.

    Each search result page is downloaded, its leading ``<p>`` elements are
    read, and every paragraph longer than ``min_length`` characters is kept.

    Args:
        topic: Search query string.
        num_results: Number of search results requested from ``search``.
        max_points: Stop visiting further pages once more than this many
            points have been collected (the check runs after each page, so the
            returned list may exceed this value by a few entries).
        paragraphs_per_page: How many leading ``<p>`` elements are inspected
            on each page.
        min_length: A paragraph is kept only if its stripped text is longer
            than this many characters.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A list of paragraph strings; empty if the topic is blank or nothing
        usable was found. Errors are printed, never raised.
    """
    important_points = []

    # A blank query cannot yield meaningful results, so skip the network round trip.
    if not topic or not topic.strip():
        print("No topic provided; skipping web search.")
        return important_points

    time.sleep(2)  # pause for two seconds before issuing the search request
    print(f"Searching the web for: {topic}...")

    try:
        for url in search(topic, num_results=num_results, lang='en'):
            print(f"Checking general info URL: {url}")
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                for p in soup.find_all('p')[:paragraphs_per_page]:
                    text = p.get_text(strip=True)
                    if len(text) > min_length:
                        important_points.append(text)

            except requests.exceptions.RequestException as e:
                # Network/HTTP failure for this page only; continue with the next result.
                print(f"Could not access {url}: {e}")
            except Exception as e:
                print(f"Error parsing {url}: {e}")

            if len(important_points) > max_points:
                break

    except Exception as e:
        # Failures raised while obtaining or iterating the search results themselves.
        print(f"An error occurred during web search for general info: {e}")

    return important_points



In [None]:
def _is_http_url(candidate):
    """Return True when ``candidate`` is a string starting with http:// or https://."""
    return bool(candidate) and candidate.startswith(('http://', 'https://'))


def _extract_image_url(soup):
    """Return the first usable absolute image URL found in a parsed page, or None."""
    # Preferred source: Open Graph image metadata.
    for meta in soup.find_all('meta', property="og:image"):
        candidate = meta.get('content')
        if _is_http_url(candidate):
            print(f"Found potential direct image URL from og:image: {candidate}")
            return candidate

    # Fallback: plain <img> tags that look like real image files.
    for img_tag in soup.find_all('img', src=True):
        src = img_tag.get('src')
        if not _is_http_url(src) or 'q=tbn' in src:  # Filter out thumbnails
            continue
        lowered = src.lower()
        if any(ext in lowered for ext in ('.jpg', '.jpeg', '.png', '.gif')):
            print(f"Found potential direct image URL from img src: {src}")
            return src

    return None


def find_and_display_image(topic):
    """
    Search for an image of ``topic`` and open the first one found in a browser tab.

    Only search results whose URL points at Google Images are inspected. For
    each such page the ``og:image`` metadata is tried first, then ``<img>``
    tags. A candidate is accepted only if it is an absolute http(s) URL, so a
    relative or malformed ``og:image`` value no longer blocks the ``<img>``
    fallback or gets returned as if it were valid.

    Args:
        topic: Search topic; the query sent to ``search`` is ``"<topic> image"``.

    Returns:
        The image URL that was handed to ``webbrowser.open_new_tab``, or
        ``None`` if no image was found or the browser call raised.
    """
    print(f"\nSearching Google Images for: {topic}...")
    image_search_query = f"{topic} image"
    direct_image_url = None

    try:
        for url in search(image_search_query, num_results=5, lang='en', safe='on'):
            # Skip every hit that is not a Google Images page.
            if "images.google.com" not in url and "google.com/images" not in url:
                continue

            print(f"Checking image search result URL: {url}")
            try:
                response = requests.get(url, timeout=5)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                direct_image_url = _extract_image_url(soup)
            except requests.exceptions.RequestException as e:
                print(f"Could not access {url} for image: {e}")
            except Exception as e:
                print(f"Error parsing {url} for image: {e}")

            if direct_image_url:
                break

    except Exception as e:
        # Failures raised while obtaining or iterating the search results themselves.
        print(f"An error occurred during Google Images search: {e}")

    if not direct_image_url:
        print("\nCould not find a direct image URL to display.")
        return None

    print(f"\nOpening image in your default web browser: {direct_image_url}")
    try:
        webbrowser.open_new_tab(direct_image_url)
        return direct_image_url
    except Exception as e:
        print(f"Could not open browser: {e}")
        return None


In [None]:

def create_info_file(topic, points, image_url=None):
    """
    Creates a text file with the extracted important points and an image URL (if found).

    The file is written to the current working directory as
    ``<topic>_info.txt``. The topic is lower-cased and spaces become
    underscores; path separators and other characters that common file systems
    reject are also replaced with underscores, so a topic such as ``AC/DC``
    still yields a single valid file name.

    Args:
        topic: Topic the information belongs to; used for the heading and the file name.
        points: Iterable of text snippets, written as a numbered list. May be empty.
        image_url: URL of the image that was opened, or ``None`` if there was none.

    Returns:
        None. Success and write errors are reported with ``print``.
    """
    # Characters that would split the path or are invalid in file names.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    safe_topic = topic.replace(' ', '_').lower().translate(unsafe_chars)
    filename = f"{safe_topic}_info.txt"
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"--- Information about: {topic.upper()} ---\n\n")
            f.write("Important Points:\n")
            if points:
                for number, point in enumerate(points, start=1):
                    f.write(f"{number}. {point}\n\n")
            else:
                f.write("No significant points found.\n\n")

            if image_url:
                f.write("\n--- Image Displayed (Opened in Browser) ---\n")
                f.write(f"The image was opened in your default web browser. You can also view it at:\n{image_url}\n")
            else:
                f.write("No direct image was found or opened in browser.\n")

        print(f"\nInformation saved to: {filename}")
    except OSError as e:
        print(f"Error writing to file {filename}: {e}")



In [None]:

def get_info_from_web(query, num_results=3):
    """
    Fetch the top search results for ``query`` and extract their leading text.

    For every result URL the first five ``<p>`` elements are read, and those
    whose stripped text is longer than 100 characters are concatenated, each
    followed by a blank line. One second of sleep follows every URL.

    Args:
        query: Search query string.
        num_results: Number of results requested from ``search``.

    Returns:
        A list of ``{"url": ..., "content": ...}`` dicts, one per URL visited.
        ``content`` holds the extracted text, a "no significant text" notice,
        or a description of the access/parsing error for that URL.
    """
    print(f"Searching the web for: {query}")
    collected = []

    try:
        for link in search(query, num_results=num_results, lang='en'):
            print(f"Processing URL: {link}")
            try:
                reply = requests.get(link, timeout=5)
                reply.raise_for_status()
                parsed = BeautifulSoup(reply.text, 'html.parser')

                # Keep only the substantial paragraphs among the first five.
                stripped = (tag.get_text(strip=True) for tag in parsed.find_all('p')[:5])
                body = "".join(chunk + "\n\n" for chunk in stripped if len(chunk) > 100)

                if not body:
                    body = "No significant text found on this page."
                collected.append({"url": link, "content": body})

            except requests.exceptions.RequestException as err:
                print(f"Could not access {link}: {err}")
                collected.append({"url": link, "content": f"Could not access page: {err}"})
            except Exception as err:
                print(f"Error parsing {link}: {err}")
                collected.append({"url": link, "content": f"Error parsing page: {err}"})

            # Wait a second before moving on to the next result.
            time.sleep(1)

    except Exception as err:
        print(f"An error occurred during web search: {err}")

    return collected


In [None]:


def display_google_cse(cx_code="23780fbb727f84965"):
    """
    Render a Google custom search box in the notebook output.

    Builds a snippet that loads ``cse.js`` for the given ``cx_code`` together
    with the ``gcse-search`` container element, and shows it via
    ``display(HTML(...))``.

    Args:
        cx_code: The ``cx`` identifier placed in the script URL.
    """
    script_url = f"https://cse.google.com/cse.js?cx={cx_code}"
    # Leading/trailing entries reproduce the whitespace that surrounds the markup.
    snippet_lines = [
        "",
        f'  <script async src="{script_url}">',
        "  </script>",
        '  <div class="gcse-search"></div>',
        "  ",
    ]
    display(HTML("\n".join(snippet_lines)))


if __name__ == "__main__":
    # Interactive driver: ask for a topic, gather snippets and an image, save
    # them to a text file, print per-page extracts, then render the search widget.
    topic_input = input("Enter the topic you want to search for: ")

    points = get_topic_info(topic_input)

    # The image lookup and the file export must run before the summary below:
    # the summary reads found_image_url and tells the user to open the file.
    found_image_url = find_and_display_image(topic_input)

    print("Saving file .....")
    create_info_file(topic_input, points, found_image_url)

    print()
    print()
    print("Please review the generated text file for information.")
    if found_image_url:
        time.sleep(2)
        print("An image related to your topic should have opened in your browser.")
        print()

    information = get_info_from_web(topic_input)

    print("\n--- Extracted Information ---")
    if information:
        for item in information:
            print(f"From URL: {item['url']}")
            print("Content:")
            print("\n" * 4)  # five blank lines before the content
            print(item['content'])
            print("\n" * 3)  # four blank lines after the content
            print("-" * 30)
    else:
        print("No information could be extracted.")

    print("If u wanna search manually ")
    time.sleep(2)
    print("rendering the search block ........")
    time.sleep(2)
    print()
    print("--------------GOOGLE SEARCH----------------")
    print()
    display_google_cse()
    print()
    print("-------------------------------------------")


Enter the topic you want to search for: nvidia rtx 5090
Searching the web for: nvidia rtx 5090...
Checking general info URL: https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/
Checking general info URL: https://marketplace.nvidia.com/en-us/consumer/graphics-cards/gigabyte-nvidia-geforce-rtx-5090-windforce-overclocked-triple-fan/
Could not access https://marketplace.nvidia.com/en-us/consumer/graphics-cards/gigabyte-nvidia-geforce-rtx-5090-windforce-overclocked-triple-fan/: HTTPSConnectionPool(host='marketplace.nvidia.com', port=443): Read timed out. (read timeout=5)
Checking general info URL: https://www.amazon.com/Nvidia-GeForce-RTX-5090-Founders/dp/B0DYDY8KSC
Checking general info URL: https://www.polygon.com/gaming/545482/rtx-5090-review-nvidia-graphics-card-gpu
Checking general info URL: https://www.google.com/search?num=7

--- Process Complete ---


Please review the generated text file for information.
Searching the web for: nvidia rtx 5090
Processing URL: htt


-------------------------------------------
