In [3]:
!pip install requests beautifulsoup4 transformers selenium
!apt-get update
!apt install chromium-chromedriver


Collecting selenium
  Downloading selenium-4.22.0-py3-none-any.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl (475 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from transformers import pipeline
import sqlite3
import os


# Setup ChromeDriver options for Colab
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Path to ChromeDriver executable compatible with Colab
chrome_driver_path = '/usr/lib/chromium-browser/chromedriver'

# Initialize ChromeDriver correctly
driver = webdriver.Chrome(options=chrome_options)

# Function to fetch web content using requests and BeautifulSoup
def fetch_web_content(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text

# Function to scrape content using BeautifulSoup
def scrape_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    paragraphs = soup.find_all('p')
    content = ' '.join([para.get_text() for para in paragraphs])
    return content

# Function to take screenshot using Selenium
def take_screenshot(url, filename):
    driver.get(url)
    driver.save_screenshot(filename)

# Function to generate content summary using Hugging Face's pipeline
def generate_summary(content):
    summarizer = pipeline("summarization")
    summary = summarizer(content, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
    return summary

# Function to initialize SQLite database
def initialize_database():
    conn = sqlite3.connect('workflow_results.db')
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS workflow_results (
            url TEXT PRIMARY KEY,
            scraped_content TEXT,
            llm_summary TEXT,
            screenshot_path TEXT
        )
    ''')
    conn.commit()
    conn.close()

# Function to store results in SQLite database
def store_results(url, scraped_content, llm_summary, screenshot_path):
    conn = sqlite3.connect('workflow_results.db')
    cursor = conn.cursor()
    cursor.execute('''
        INSERT OR REPLACE INTO workflow_results (url, scraped_content, llm_summary, screenshot_path)
        VALUES (?, ?, ?, ?)
    ''', (url, scraped_content, llm_summary, screenshot_path))
    conn.commit()
    conn.close()

# Function to fetch cached results from SQLite database
def fetch_cached_results(url):
    conn = sqlite3.connect('workflow_results.db')
    cursor = conn.cursor()
    cursor.execute('''
        SELECT scraped_content, llm_summary, screenshot_path FROM workflow_results WHERE url=?
    ''', (url,))
    row = cursor.fetchone()
    conn.close()
    return row

def handle_workflow(url):
    # Check if results are cached
    cached_results = fetch_cached_results(url)
    if cached_results:
        scraped_content, llm_summary, screenshot_path = cached_results
        print("Results fetched from cache:")
    else:
        # Fetch web content and scrape using BeautifulSoup
        html_content = fetch_web_content(url)
        scraped_content = scrape_content(html_content)[:1024]  # Truncate to 1024 tokens

        # Take screenshot
        screenshot_filename = 'screenshot.png'
        take_screenshot(url, screenshot_filename)
        screenshot_path = os.path.abspath(screenshot_filename)

        # Generate LLM summary
        llm_summary = generate_summary(scraped_content)

        # Store results in database
        store_results(url, scraped_content, llm_summary, screenshot_path)
        print("Results generated and stored in database:")

    # Print results or further process as required
    print(f"Scraped Content:\n{scraped_content}\n")
    print(f"LLM Generated Summary:\n{llm_summary}\n")
    print(f"Screenshot Path: {screenshot_path}")

# Example usage
if __name__ == "__main__":
    url = 'https://en.wikipedia.org/wiki/Generative_artificial_intelligence'
    handle_workflow(url)



No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Results generated and stored in database:
Scraped Content:

 Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is artificial intelligence capable of generating text, images, videos, or other data using generative models,[2] often in response to prompts.[3][4] Generative AI models learn the patterns and structure of their input training data and then generate new data that has similar characteristics.[5][6]
 Improvements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These include chatbots such as ChatGPT, Copilot, Gemini and LLaMA, text-to-image artificial intelligence image generation systems such as Stable Diffusion, Midjourney and DALL-E, and text-to-video AI generators such as Sora.[7][8][9][10] Companies such as OpenAI, Anthropic, Microsoft, Google, and Baidu as well as numerous smaller firms have developed generative AI models.[3][11][12]
 Generative AI has us