# In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
def scrape_page(url, timeout=10):
    """Scrape one listing page and return a record for each result row.

    The page is fetched, parsed with the ``lxml`` parser, and searched for
    the first ``<table class="listing_table">``. The first ``<tr>`` of that
    table is treated as the header and skipped. For every remaining row
    with at least four ``<td>`` cells, the first four cells are read as
    match, code, title and score (in that column order), and
    ``scrape_linked_page(code)`` is called to collect the details of the
    linked document.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for the server before giving up. Added so
            that a stalled connection cannot block the scrape forever.

    Returns:
        A list of dicts with the keys ``'Match'``, ``'code'``,
        ``'Document Title'``, ``'Score'`` and ``'Linked Page Info'``.
        The list is empty when the table is not present on the page.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including ``Timeout`` when ``timeout`` elapses).
    """
    # requests has no default timeout; without one a silent server hangs
    # this call indefinitely.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    table = soup.find('table', class_='listing_table')
    if table is None:
        print("Table not found on page:", url)
        return []

    data = []
    for row in table.find_all('tr')[1:]:  # [1:] skips the header row
        columns = row.find_all('td')
        if len(columns) < 4:
            print("Skipping row with insufficient columns:", columns)
            continue

        # Column order on the page is: match, code, title, score.
        match, code, title, score = (cell.text.strip() for cell in columns[:4])

        data.append({
            'Match': match,
            'code': code,
            'Document Title': title,
            'Score': score,
            # Details scraped from the document's own page (one extra
            # HTTP request per row).
            'Linked Page Info': scrape_linked_page(code),
        })

    return data


# In [5]:
# Field labels in the exact order scrape_linked_page returns their values.
_LINKED_PAGE_LABELS = (
    "Title",
    "Abstract",
    "Inventors",
    "Assignee",
    "Foreign References",
    "Publication Date",
    "Filing Date",
)


def _extract_field(soup, label):
    """Return the text that follows the ``"<label>:"`` heading, or a placeholder.

    Looks for a ``<div class="disp_elm_title">`` whose string is exactly
    ``"<label>:"`` and returns the stripped text of the next
    ``<div class="disp_elm_text">``. If either div is missing, returns
    ``"<label> not found"``.
    """
    heading = soup.find('div', class_='disp_elm_title', string='{}:'.format(label))
    value = heading.find_next('div', class_='disp_elm_text') if heading else None
    if value is None:
        # A heading without a value div used to raise AttributeError and
        # turn the whole record into "Error"; degrade only this field.
        return "{} not found".format(label)
    return value.text.strip()


def scrape_linked_page(code, timeout=10):
    """Fetch the document page for ``code`` and extract its descriptive fields.

    The page is requested from
    ``https://www.freepatentsonline.com/<code>.html`` and parsed with the
    ``lxml`` parser.

    Args:
        code: Document code used to build the page URL.
        timeout: Seconds to wait for the server before giving up. Added so
            that a stalled connection cannot block the scrape forever.

    Returns:
        A 7-tuple of strings in this order: title, abstract, inventors,
        assignee, foreign references, publication date, filing date.
        A field that is absent from the page is reported as
        ``"<label> not found"``. If the page cannot be fetched or parsed,
        the error is printed and every element is ``"Error"``; this
        function does not raise.
    """
    base_url = "https://www.freepatentsonline.com/{}.html".format(code)
    try:
        # requests has no default timeout; without one a silent server
        # hangs this call indefinitely.
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        return tuple(_extract_field(soup, label) for label in _LINKED_PAGE_LABELS)
    except Exception as e:
        # Deliberately broad: callers scrape many pages in a loop and rely
        # on one bad page yielding "Error" values instead of an exception.
        print("Error scraping linked page for code {}: {}".format(code, e))
        return ("Error",) * len(_LINKED_PAGE_LABELS)