In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
def scrape_page(url):
    response = requests.get(url)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'lxml')
    table = soup.find('table', class_='listing_table')
    data = []

    # Check if the table exists on the page
    if table:
        # Extract data from each row in the table
        for row in table.find_all('tr')[1:]:  # Skip the header row
            columns = row.find_all('td')
            if len(columns) >= 4:  # Ensure that there are enough columns
                code  = columns[1].text.strip()
                match = columns[0].text.strip()
                title = columns[2].text.strip()
                score = columns[3].text.strip()

                # Scraping additional information from the linked page
                linked_page_info = scrape_linked_page(code)

                data.append({
                    'Match': match,
                    'code': code,
                    'Document Title': title,
                    'Score': score,
                    'Linked Page Info': linked_page_info  # Additional information from linked page
                })
            else:
                print("Skipping row with insufficient columns:", columns)
    else:
        print("Table not found on page:", url)

    return data




In [2]:
def scrape_linked_page(code):
    try:
        if re.match(r'^US\d+$', code):
            base_url = "https://www.freepatentsonline.com/y{}/{}.html".format(code[2:6], code[6:])
        else:
            base_url = "https://www.freepatentsonline.com/{}.html".format(code)

        response = requests.get(base_url)
        response.raise_for_status()
        html_content = response.text
        soup = BeautifulSoup(html_content, 'lxml')

        title_element = soup.find('div', class_='disp_elm_title', string='Title:')
        title = title_element.find_next('div', class_='disp_elm_text').text.strip() if title_element else "Title not found"

        abstract_element = soup.find('div', class_='disp_elm_title', string='Abstract:')
        abstract = abstract_element.find_next('div', class_='disp_elm_text').text.strip() if abstract_element else "Abstract not found"

        inventors_element = soup.find('div', class_='disp_elm_title', string='Inventors:')
        inventors = inventors_element.find_next('div', class_='disp_elm_text').text.strip() if inventors_element else "Inventors not found"

        publication_date_element = soup.find('div', class_='disp_elm_title', string='Publication Date:')
        publication_date = publication_date_element.find_next('div', class_='disp_elm_text').text.strip() if publication_date_element else "Publication Date not found"

        filing_date_element = soup.find('div', class_='disp_elm_title', string='Filing Date:')
        filing_date = filing_date_element.find_next('div', class_='disp_elm_text').text.strip() if filing_date_element else "Filing Date not found"
        
        assignee_element = soup.find('div', class_='disp_elm_title', string='Assignee:')
        assignee = assignee_element.find_next('div', class_='disp_elm_text').text.strip() if assignee_element else "Assignee not found"

        
        foreign_references_element = soup.find('div', class_='disp_elm_title', string='Foreign References:')
        foreign_references = foreign_references_element.find_next('div', class_='disp_elm_text').text.strip() if foreign_references_element else "Foreign References not found"


        return title, abstract, inventors, assignee, foreign_references, publication_date, filing_date
    except Exception as e:
        print("Error scraping linked page for code {}: {}".format(code, e))
        return "Error", "Error", "Error", "Error", "Error", "Error", "Error"

In [3]:
# URL pattern for the first page
base_url = "https://www.freepatentsonline.com/result.html?p={}&sort=relevance&srch=top&query_txt=sustainable+aviation+fuel&patents_us=on"
# Initialize an empty list to store all data
all_data = []

# Scrape data from all 50 pages
for page_num in range(1,50):
    url = base_url.format(page_num)
    page_data = scrape_page(url)
    all_data.extend(page_data)




Error scraping linked page for code 10753235: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


In [4]:
df = pd.DataFrame(all_data)


In [8]:
df['Title'], df['Abstract'], df['Inventors'], df['assignee'], df['foreign_references'],df['publication_date'],df['Filing Date']= zip(*df['Linked Page Info'])
df.drop(columns=['Linked Page Info'], inplace=True)

print(df)

     Match           code                                     Document Title  \
0        1       11585278  Aviation fuel\n         \n                    ...   
1        2       11566193  Aviation fuel composition\n         \n        ...   
2        3  US20190002778  AVIATION FUEL COMPOSITION\n         \n        ...   
3        4  US20230108354  Systems, Methods and Apparatus for Producing S...   
4        5        8628594  High octane unleaded aviation fuel\n         \...   
...    ...            ...                                                ...   
2445  2446        8673603  Fermentation process for controlling butanedio...   
2446  2447  US20190185155  VERTICAL TAKEOFF AND LANDING AIRCRAFT\n       ...   
2447  2448  US20180222580  VERTICAL TAKEOFF AND LANDING AIRCRAFT\n       ...   
2448  2449       11912404  Vertical takeoff and landing aircraft\n       ...   
2449  2450       10994838  Vertical takeoff and landing aircraft\n       ...   

     Score                             

In [11]:
df.head()

Unnamed: 0,Match,code,Document Title,Score,Title,Abstract,Inventors,assignee,foreign_references,publication_date,Filing Date
0,1,11585278,Aviation fuel\n \n ...,1000,Aviation fuel,A method of determining one or more fuel chara...,"Swann, Peter (Derby, GB) \n ...","ROLLS-ROYCE plc (London, GB)",EP08269562005-07-27Detector arrangement for a ...,02/21/2023,06/29/2022
1,2,11566193,Aviation fuel composition\n \n ...,943,Aviation fuel composition,The present invention relates to an aviation f...,"Sandberg, Kati (Järvenpää, FI) ...","Neste Corporation (Espoo, FI)",WO2005026297A12005-03-24PETROLEUM- AND FISCHER...,01/31/2023,12/14/2016
2,3,US20190002778,AVIATION FUEL COMPOSITION\n \n ...,937,AVIATION FUEL COMPOSITION,The present invention relates to an aviation f...,"Sandberg, Kati (Järvenpää, FI) ...","Neste Corporation (Espoo, FI)",WO2013012983A12013-01-24,01/03/2019,12/14/2016
3,4,US20230108354,"Systems, Methods and Apparatus for Producing S...",870,"Systems, Methods and Apparatus for Producing S...","Systems, methods and apparatus are provided th...","Dvorin, Jason (Southlake, TX, US) ...","Kepler GTL LLC (Southlake, TX, US)",Foreign References not found,04/06/2023,02/25/2022
4,5,8628594,High octane unleaded aviation fuel\n \...,855,High octane unleaded aviation fuel,An unleaded aviation fuel blend. The fuel blen...,"Braly, George W. (Ada, OK, US)",BRALY GEORGE W.,AU2006351908B22011-03-10CA2672211A12008-06-19E...,01/14/2014,12/01/2010


In [13]:
# Specify the file path where the CSV file will be saved
file_path = 'C:/Users/hp/id2/bg/PatentsUs.csv'


# Save DataFrame to the specified CSV file path
df.to_csv(file_path, index=False, sep=',', encoding='utf-8', quotechar='"')


In [14]:
# Save DataFrame to JSON file
json_file_path = 'C:/Users/hp/id2/bg/PatentsUs.json'
df.to_json(json_file_path, orient='records')

In [15]:
# Save DataFrame to HDF5 file
hdf_file_path = 'C:/Users/hp/id2/bg/PatentsUs.h5'
df.to_hdf(hdf_file_path, key='data', mode='w')

In [16]:
df.shape

(2450, 11)