In [127]:
import pandas as pd
import json
import os
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium import webdriver

In [128]:
# Load the patent list that drives the scraper; each row supplies the id,
# title, assignee, publication date and result link read by the loop below.
df = pd.read_csv("scraper_list/google_list_v2.csv")

In [129]:
# Number of rows in the patent list, i.e. how many patents are queued.
print(len(df))

19502


In [130]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link
0,US-11604547-B2,Multipoint touchscreen,Apple Inc.,"Steve Hotelling, Joshua A. Strickon, Brian Q. ...",2004-05-06,2021-01-27,2023-03-14,2023-03-14,https://patents.google.com/patent/US11604547B2/en,https://patentimages.storage.googleapis.com/a1...
1,US-11029838-B2,"Touch screen device, method, and graphical use...",Apple Inc.,"Stephen O. Lemay, Richard Williamson",2006-09-06,2019-12-04,2021-06-08,2021-06-08,https://patents.google.com/patent/US11029838B2/en,https://patentimages.storage.googleapis.com/bf...
2,US-11379541-B2,System and method for adapting a control funct...,Autoconnect Holdings Llc,Christopher P. Ricci,2013-04-15,2021-04-16,2022-07-05,2022-07-05,https://patents.google.com/patent/US11379541B2/en,https://patentimages.storage.googleapis.com/f0...
3,US-11390881-B2,Soybean event MON89788 and methods for detecti...,"Monsanto Technology, Llc","Marianne Malven, Jennifer Rinehart, Nancy Tayl...",2005-05-27,2020-07-01,2022-07-19,2022-07-19,https://patents.google.com/patent/US11390881B2/en,https://patentimages.storage.googleapis.com/3f...
4,US-11423886-B2,Task flow identification based on user intent,Apple Inc.,"Thomas Robert Gruber, Adam John Cheyer, Dag Ki...",2010-01-18,2020-05-20,2022-08-23,2022-08-23,https://patents.google.com/patent/US11423886B2/en,https://patentimages.storage.googleapis.com/6a...


In [131]:
# Column names available on the frame; the scraping loop reads 'id', 'title',
# 'assignee', 'publication date' and 'result link'.
df.columns

Index(['id', 'title', 'assignee', 'inventor/author', 'priority date',
       'filing/creation date', 'publication date', 'grant date', 'result link',
       'representative figure link'],
      dtype='object')

In [142]:
# Module-level Chrome session, configured to run without a visible window.
# test_selenium() uses this `driver`; scrape() creates its own Chrome
# instance and does not use this one.
chrome_options = ChromeOptions()
chrome_options.add_argument("--headless")
driver=webdriver.Chrome(options=chrome_options)

In [133]:
def test_selenium():
    """Smoke-test the CSS selectors against one patent page.

    Loads the result link of the row labelled 1 in ``df`` with the
    module-level ``driver``, then prints the publication number, title and
    abstract, a separator line, and every claim text found on the page.
    """
    sample_url = df['result link'][1]
    driver.get(sample_url)
    driver.implicitly_wait(1)

    # Locate every element first, then print, so a failed lookup aborts
    # before anything is written to the output.
    header_selectors = ("h2#pubnum", "h1#title", ".abstract.patent-text")
    header_elements = [
        driver.find_element(By.CSS_SELECTOR, selector)
        for selector in header_selectors
    ]
    claims_container = driver.find_element(By.CSS_SELECTOR, ".claims.patent-text")
    claim_elements = claims_container.find_elements(
        By.CSS_SELECTOR, ".claim-text.patent-text"
    )

    for element in header_elements:
        print(element.text)
    print("-----")
    for element in claim_elements:
        print(element.text)
# test_selenium()

In [134]:
# Root directory for the scraped output; created on first run only.
dataset_folder = "./dataset"
if os.path.exists(dataset_folder):
    print("Dataset folder already exists")
else:
    os.makedirs(dataset_folder)
    print("Dataset folder created")

Dataset folder already exists


In [135]:
def create_patent_folder(patent_id):
    """Make sure ``./dataset/<patent_id>`` exists and return that path.

    Prints whether the folder was created by this call or was already
    present.

    Args:
        patent_id: Identifier used as the folder name under ``./dataset``.

    Returns:
        The folder path as a string, ``./dataset/<patent_id>``.
    """
    patent_folder_path = f"./dataset/{patent_id}"
    already_present = os.path.exists(patent_folder_path)
    if already_present:
        print(f"Folder {patent_id} already exists")
    else:
        os.makedirs(patent_folder_path)
        print(f"Folder {patent_id} created")
    return patent_folder_path

In [136]:
def saveToFile(content):
    """Write one patent record to ``./dataset/<id>/<id>.json``.

    The record is serialised with a 4-space indent before the file is
    opened, so a serialisation error never leaves a truncated file behind.
    The target folder must already exist.

    Args:
        content: JSON-serialisable dict; its ``'patent_id'`` value names
            both the folder and the file.
    """
    patent_id = content['patent_id']
    file_path = f"./dataset/{patent_id}/{patent_id}.json"
    serialized = json.dumps(content, indent=4)
    with open(file_path, 'w') as out_file:
        out_file.write(serialized)
        print(f"File created for {patent_id} at path: {file_path}")

In [145]:
def scrape(patent_link, result=None):
    """Scrape one patent page and record the extracted fields in a dict.

    A dedicated Chrome session is opened for the call and always shut down
    again, even when scraping fails, so repeated calls do not accumulate
    browser processes.

    Args:
        patent_link: URL of the patent page to load.
        result: Optional dict to populate in place. When omitted, a fresh
            dict is created for this call (never shared between calls).

    Returns:
        The populated dict (the same object as ``result`` when one was
        supplied) with the keys ``'patent_link'``, ``'inventors'``,
        ``'abstract'`` and ``'claims'`` set.

    Raises:
        Propagates whatever the ``.important-people`` lookup raises when
        that section is missing; only the abstract and claims lookups are
        best-effort.
    """
    if result is None:
        result = {}

    # NOTE(review): this opens a fresh Chrome instance without the headless
    # options used for the module-level driver - confirm that is intended.
    driver = webdriver.Chrome()
    try:
        driver.get(patent_link)
        driver.implicitly_wait(5)
        result['patent_link'] = patent_link

        # Links inside the .important-people section (original note:
        # "Inventors and assignee"); stored under 'inventors'. Deliberately
        # unguarded: a page without this section aborts the call instead of
        # producing an empty record.
        people_div = driver.find_element(By.CSS_SELECTOR, ".important-people")
        result['inventors'] = [
            link.text
            for link in people_div.find_elements(By.CSS_SELECTOR, "state-modifier a")
        ]

        # Abstract is optional: fall back to an empty string.
        try:
            abstract = driver.find_element(By.CSS_SELECTOR, ".abstract.patent-text")
            result['abstract'] = abstract.text
        except Exception:
            result['abstract'] = ""

        # Claims are optional: fall back to an empty list.
        try:
            claims_div = driver.find_element(By.CSS_SELECTOR, ".claims.patent-text")
            result['claims'] = [
                claim.text
                for claim in claims_div.find_elements(
                    By.CSS_SELECTOR, ".claim-text.patent-text"
                )
            ]
        except Exception:
            result['claims'] = []
    finally:
        # Always release the browser process started for this call.
        driver.quit()
    return result

In [146]:
# Walk the patent list, scraping and saving every patent that has no JSON
# file yet. The loop stops after two rows (trial-run limit).
count = 0
for index, row in df.iterrows():
    patent_id = row['id']
    # Fields taken straight from the CSV; scrape() adds the rest in place.
    content = {
        'patent_id': patent_id,
        'title': row['title'],
        'assignee': row['assignee'],
        'publication_date': row['publication date'],
    }
    patent_link = row['result link']
    print(f"Fetching patent: {patent_id} from {patent_link}")
    patent_folder_path = create_patent_folder(patent_id)
    if os.path.exists(patent_folder_path+f"/{patent_id}.json"):
        # Already scraped on a previous run: skip the network round-trip.
        print(f"File {patent_folder_path}/{patent_id}.json already exists.")
    else:
        scrape(patent_link, content)
        saveToFile(content)
        print(f"File {patent_folder_path}/{patent_id}.json created successfully.")
    count += 1
    if count == 2:
        break
print(count)

Fetching patent: US-11604547-B2 from https://patents.google.com/patent/US11604547B2/en
Folder US-11604547-B2 created
File created for US-11604547-B2 at path: ./dataset/US-11604547-B2/US-11604547-B2.json
File ./dataset/US-11604547-B2/US-11604547-B2.json created successfully.
Fetching patent: US-11029838-B2 from https://patents.google.com/patent/US11029838B2/en
Folder US-11029838-B2 created
File created for US-11029838-B2 at path: ./dataset/US-11029838-B2/US-11029838-B2.json
File ./dataset/US-11029838-B2/US-11029838-B2.json created successfully.
2
