In [1]:
import pandas as pd
import json
import os
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium import webdriver
import csv
import re
from urllib.request import urlretrieve
import requests

In [2]:
# Root directory that receives one sub-folder per scraped patent.
dataset_folder = "./dataset"

# Report whether the folder was already there; create it only when it is missing.
if os.path.exists(dataset_folder):
    print("Dataset folder already exists")
else:
    os.makedirs(dataset_folder)
    print("Dataset folder created")

Dataset folder created


In [44]:
def saveToFile(content):
    """Write one scraped patent record to ./dataset/<patent_id>/<patent_id>.json.

    Args:
        content: dict holding the scraped fields. It must contain a
            'patent_id' key (KeyError otherwise); the whole dict is
            serialised as indented JSON.

    Returns:
        None. A confirmation (or a skip notice) is printed.

    Records with an empty 'patent_id' are skipped: writing them would
    produce "./dataset/.json", which every later failed scrape would
    silently overwrite.
    """
    patent_id = content['patent_id']
    if not patent_id:
        print("Skipping record without a patent_id; no file written")
        return

    file_dir = f"./dataset/{patent_id}"
    # exist_ok avoids the separate exists() check and tolerates re-scraping the same patent.
    os.makedirs(file_dir, exist_ok=True)

    file_path = f"./dataset/{patent_id}/{patent_id}.json"
    # Serialise before opening so a non-serialisable record cannot truncate an existing file.
    json_data = json.dumps(content, indent=4)
    with open(file_path, 'w') as json_file:
        json_file.write(json_data)
    print(f"File created for {patent_id} at path: {file_path}")

In [6]:
# Every distinct classification label seen so far; scrape() rebinds this list.
global_classifications = []
def save_to_csv():
    """Append all collected classification labels to the classification CSV.

    One label is written per row (excel dialect) to
    './scraper_list/classifiction_list.csv'. The file is opened in append
    mode, so calling this twice writes the current list twice.

    The target folder is created when missing; nothing else in the notebook
    creates './scraper_list', so the open() would otherwise fail on a fresh
    checkout.
    """
    # File name kept as-is (including its spelling) so existing files keep being appended to.
    csv_path = './scraper_list/classifiction_list.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    with open(csv_path, 'a', newline = '') as result_file:
        wr = csv.writer(result_file, dialect='excel')
        wr.writerows([item] for item in global_classifications)

In [91]:
def _element_text(root, selector):
    """Return the text of the first element under ``root`` matching ``selector``, or '' if the lookup fails."""
    try:
        return root.find_element(By.CSS_SELECTOR, selector).text
    except Exception:
        return ''


def _split_citation_rows(rows):
    """Split citation-table rows into (own, family_to_family) lists of first tokens.

    Rows are read in order. A row whose first space-separated token is
    'Family' acts as a separator: it is dropped and every following row goes
    to the second list.
    """
    own, family_to_family = [], []
    target = own
    for row in rows:
        patent_id = row.text.split(" ")[0]
        if patent_id == 'Family':
            target = family_to_family
            continue
        target.append(patent_id)
    return own, family_to_family


def _citation_ids(tables, table_idx):
    """Read the citation table at ``table_idx``; return ([], []) when it is absent or unreadable."""
    if table_idx < 0:
        # Heading not found. Indexing with -1 would silently read the last table.
        return [], []
    try:
        rows = tables[table_idx].find_elements(By.CSS_SELECTOR, "div.tr.patent-result")
        # The first row is skipped, as in the original code (presumably the column header — confirm).
        return _split_citation_rows(rows[1:])
    except Exception:
        return [], []


def scrape(patent_link):
    """Open a patent page in Chrome and extract its fields into a dict.

    Args:
        patent_link: URL of the patent page to load.

    Returns:
        dict with the keys 'patent_link', 'patent_id', 'title', 'abstract',
        'inventors', 'assignee', 'classifications', 'claims', 'status',
        'citations_own', 'citations_ftf', 'citedby_own', 'citedby_ftf'.
        Every section is best-effort: if its elements cannot be read the key
        is still present with an empty string or empty list.

    Side effects:
        Rebinds the module-level ``global_classifications`` to the
        de-duplicated union of its previous content and this page's labels.
        Starts a Chrome session and always quits it before returning.

    Raises:
        Whatever ``webdriver.Chrome()`` or ``driver.get()`` raise; only the
        per-section lookups are guarded.
    """
    global global_classifications
    result = {}
    driver = webdriver.Chrome()
    try:
        driver.get(patent_link)
        # Each failed lookup below waits up to 5 seconds before giving up.
        driver.implicitly_wait(5)
        result['patent_link'] = patent_link

        # Title, publication number and abstract are read independently so
        # that one missing element does not blank the other two.
        result['patent_id'] = _element_text(driver, "h2#pubnum")
        result['title'] = _element_text(driver, "h1#title")
        result['abstract'] = _element_text(driver, ".abstract.patent-text")

        # Inventors
        try:
            people_div = driver.find_element(By.CSS_SELECTOR, ".important-people")
            inventor_links = people_div.find_elements(By.CSS_SELECTOR, "state-modifier a")
            result['inventors'] = [link.text for link in inventor_links]
        except Exception:
            result['inventors'] = []

        # Assignee: text of the last <dd> of the people list.
        try:
            dd_elements = driver.find_elements(By.CSS_SELECTOR, "dl.important-people dd")
            result['assignee'] = dd_elements[-1].text
        except Exception:
            result['assignee'] = ""

        # Classifications: click the "more" toggle, then read every non-blank link.
        try:
            view_more_div = driver.find_element(By.CSS_SELECTOR, "classification-viewer div.more.classification-viewer")
            view_more_div.click()
            classification_viewer = driver.find_element(By.CSS_SELECTOR, "classification-viewer.patent-result")
            a_tags = classification_viewer.find_elements(By.CSS_SELECTOR, "classification-tree.classification-viewer a")
            labels = (a_tag.text for a_tag in a_tags)
            classifications = [label for label in labels if label.strip()]
            result['classifications'] = classifications
            global_classifications = list(set(global_classifications + classifications))
        except Exception:
            result['classifications'] = []

        # Claims: keep only the claim-text elements whose text starts with digits.
        try:
            claims_div = driver.find_element(By.CSS_SELECTOR, ".claims.patent-text")
            claim_elements = claims_div.find_elements(By.CSS_SELECTOR, ".claim-text.patent-text")
            claim_texts = [claim.text for claim in claim_elements]
            # NOTE(review): the '.' is unescaped, so any character after the digits matches;
            # pattern kept unchanged so the scraped output stays the same.
            result['claims'] = "".join(
                "\n" + text for text in claim_texts if re.match(r"[0-9]+.", text)
            )
        except Exception:
            result['claims'] = ""

        # Status: default first so the key exists even when no 'Status' row is found.
        result['status'] = ""
        try:
            app_tl_div = driver.find_element(By.CSS_SELECTOR, "application-timeline.patent-result > div.application-timeline")
            app_tl_children = app_tl_div.find_elements(By.CSS_SELECTOR, "div .event.layout.horizontal.application-timeline")
            for child in app_tl_children:
                split_child = child.text.split('\n')
                if split_child[0] == 'Status':
                    result['status'] = split_child[1]
        except Exception:
            result['status'] = ""

        # Citations / cited-by: each table is read on its own, so a page
        # without a "Cited By" section still keeps its patent citations.
        citations_own, citations_ftf = [], []
        citedby_own, citedby_ftf = [], []
        try:
            footer_div = driver.find_element(By.CSS_SELECTOR, "div.footer.patent-result")

            citations_idx = -1
            citedby_idx = -1
            for idx, h3 in enumerate(footer_div.find_elements(By.CSS_SELECTOR, "h3.patent-result")):
                h3_text = h3.text
                if h3_text.startswith("Cited By"):
                    citedby_idx = idx
                if h3_text.startswith("Patent Citations"):
                    citations_idx = idx

            # As in the original, the position of a heading is used as the index of its table.
            responsive_tbl_divs = footer_div.find_elements(By.CSS_SELECTOR, "div.responsive-table.patent-result")
            citations_own, citations_ftf = _citation_ids(responsive_tbl_divs, citations_idx)
            citedby_own, citedby_ftf = _citation_ids(responsive_tbl_divs, citedby_idx)
        except Exception:
            # No readable footer: keep whatever was collected (empty lists by default).
            pass

        result['citations_own'] = citations_own
        result['citations_ftf'] = citations_ftf
        result['citedby_own'] = citedby_own
        result['citedby_ftf'] = citedby_ftf
        return result
    finally:
        # Close the browser on every path; the crawl calls this once per patent.
        driver.quit()

In [40]:
def test_selenium(url):
    """Manual smoke test: load one patent page and print a few scraped fields.

    Prints the publication number, title and abstract, then a dict with the
    'inventors' list and the 'assignee' text.

    Args:
        url: address of the patent page to open.

    Returns:
        None.

    Raises:
        Any lookup error for the publication number, title, abstract or
        claims elements propagates (those lookups are deliberately
        unguarded). The Chrome session is quit on every path.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.implicitly_wait(5)
        pubnum = driver.find_element(By.CSS_SELECTOR, "h2#pubnum")
        title = driver.find_element(By.CSS_SELECTOR, "h1#title")
        abstract = driver.find_element(By.CSS_SELECTOR, ".abstract.patent-text")
        # Not printed; kept so the check still fails when the claims section is missing.
        claims_div = driver.find_element(By.CSS_SELECTOR, ".claims.patent-text")
        claims_div.find_elements(By.CSS_SELECTOR, ".claim-text.patent-text")
        print("Publication Number: ", pubnum.text)
        print("Title: ", title.text)
        print("Abstract: ", abstract.text)
        print("-----")
        #Inventors
        result = {}
        try:
            people_div = driver.find_element(By.CSS_SELECTOR, ".important-people")
            inventors_st_mod = people_div.find_elements(By.CSS_SELECTOR, "state-modifier a")
            result['inventors'] = [inventor.text for inventor in inventors_st_mod]
        except Exception:
            result['inventors'] = []

        # Assignee: text of the last <dd> of the people list.
        try:
            assignee_element = driver.find_elements(By.CSS_SELECTOR, "dl.important-people dd")
            result['assignee'] = assignee_element[-1].text
        except Exception:
            result['assignee'] = ""

        print(result)
        print("--------")
    finally:
        # Without this, every call leaves a Chrome process running.
        driver.quit()
#test_selenium()
#test_selenium("https://patents.google.com/patent/US20190378050A1/en?q=(machine+learning)&oq=machine+learning")

In [57]:
def scraper(patent_link):
    """Scrape one patent, store its record, and return links to its US citations.

    The page is scraped with scrape() and written with saveToFile(). The
    entries of the record's 'citations_own' list are then stripped, filtered
    to those starting with "US", and turned into patent page URLs.

    Args:
        patent_link: URL of the patent page to process.

    Returns:
        list of at most 10 URLs, in the order the citations were listed.
    """
    print(f"Fetching patent from: {patent_link}")
    record = scrape(patent_link)
    saveToFile(record)

    links = []
    for raw_id in record['citations_own']:
        # Stop as soon as ten links have been gathered.
        if len(links) >= 10:
            break
        cited_id = raw_id.strip()
        if cited_id.startswith("US"):
            links.append(f"https://patents.google.com/patent/{cited_id}/en")
    return links
        


In [58]:
# Starting point of the crawl: the page URL of the root patent ...
root_patent = "https://patents.google.com/patent/US20220253443A1/en"
# ... and its publication number (the same id that appears in the URL above).
root_patent_id = "US20220253443A1"


In [72]:
def save_citation_list(citations, filepath, mode):
    """Write one citation per line to ``filepath``.

    Args:
        citations: iterable of values; each is written followed by a newline.
        filepath: destination path, used exactly as given (no extension is added).
        mode: mode string passed to open(), e.g. 'w' to replace or 'a' to append.

    Returns:
        None. A confirmation naming the written path is printed.
    """
    # Create the parent folder if needed; dirname is '' for a bare file name.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, mode) as file:
        file.writelines(f"{citation}\n" for citation in citations)

    # Report the real path: the old message appended ".txt", which the file never has.
    print(f"{filepath} file saved successfully")

In [79]:
def read_from_file(filepath):
    """Return the lines of a text file with surrounding whitespace removed.

    Args:
        filepath: path of the file to read.

    Returns:
        list of str, one entry per line in file order; blank lines are kept
        as empty strings.
    """
    with open(filepath, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

In [73]:
# Crawl state: the links of the level being processed and that level's number.
level_n_citations, n = [], 1

In [74]:
# Scrape the root patent (scraper() also writes its JSON record) and keep the
# returned links to its US citations as the first level of the crawl.
level_n_citations = scraper(root_patent)

Fetching patent from: https://patents.google.com/patent/US20220253443A1/en
File created for US20220253443A1 at path: ./dataset/US20220253443A1/US20220253443A1.json


In [76]:
# Persist the links of level n so a later session can resume from this file.
filepath = f"./scraper_list/level_{n}"
# The notebook only creates ./dataset; make sure the list folder exists as well,
# otherwise the write fails on a fresh checkout.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
save_citation_list(level_n_citations, filepath, 'w')

./scraper_list/level_1.txt file saved successfully


In [95]:
# NOTE(review): n is set by hand here instead of being carried over from the
# cells above — presumably to resume the crawl from an already saved level;
# confirm the value before a fresh Restart & Run All.
n = 4
filepath = f"./scraper_list/level_{n}"
# Load the links stored for level n (one per line).
level_n_citations = read_from_file(filepath)
print(f"Total citations to be parsed: {len(level_n_citations)}")

Total citations to be parsed: 720


In [96]:
# Crawl every patent of the current level and collect the links for the next one.
citations = []
# Links already queued for the next level. De-duplicating across the whole
# level (not just within one patent) keeps the same page from being listed,
# and later scraped, several times.
seen_links = set()
count = 1
for citation in level_n_citations:
    print(f"Fetching document: {count}")
    try:
        citation_list = scraper(citation)
    except Exception as exc:
        # One unreachable page must not abort the level and lose what was collected.
        print(f"Failed to scrape {citation}: {exc}")
        citation_list = []
    for link in citation_list:
        if link not in seen_links:
            seen_links.add(link)
            citations.append(link)
    count += 1

# NOTE: re-running this cell advances n again.
n += 1
filepath = f"./scraper_list/level_{n}"
print(f"Saving citations for level {n} to {filepath}")
save_citation_list(citations, filepath, 'w')

Fetching document: 1
Fetching patent from: https://patents.google.com/patent/US6085193A/en
File created for US6085193A at path: ./dataset/US6085193A/US6085193A.json
Fetching document: 2
Fetching patent from: https://patents.google.com/patent/US6098096A/en
File created for US6098096A at path: ./dataset/US6098096A/US6098096A.json
Fetching document: 3
Fetching patent from: https://patents.google.com/patent/US5924116A/en
File created for US5924116A at path: ./dataset/US5924116A/US5924116A.json
Fetching document: 4
Fetching patent from: https://patents.google.com/patent/US5790935A/en
File created for US5790935A at path: ./dataset/US5790935A/US5790935A.json
Fetching document: 5
Fetching patent from: https://patents.google.com/patent/US4996642A/en
File created for US4996642A at path: ./dataset/US4996642A/US4996642A.json
Fetching document: 6
Fetching patent from: https://patents.google.com/patent/US5956039A/en
File created for US5956039A at path: ./dataset/US5956039A/US5956039A.json
Fetching d