In [1]:
!pip install fake_useragent bs4 pandas



You should consider upgrading via the 'c:\users\cktto\anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from fake_useragent import UserAgent

In [3]:
import threading
from multiprocessing.pool import ThreadPool

# Number of worker threads used to download pages concurrently.
pool_size = 16

# Serialises access to the shared progress counter and console output in extract().
lock = threading.Lock()

# Thread-based pool; its map() is used to fan the page requests out to the workers.
pool = ThreadPool(processes=pool_size)

In [4]:
# Folder that is scanned for the input CSV files.
dir_path = 'data'
# Folder that receives the CSV files with the scraped column added.
out_dir = 'out'
# When True, extract() also dumps the fetched HTML to result.html for inspection.
debug = False
# Name of the DataFrame column that stores the scraped text.
col_name = 'abstract'

# exist_ok=True removes the check-then-create race and is a no-op when the folder already exists.
os.makedirs(out_dir, exist_ok=True)

In [5]:
# Shared fake_useragent generator; request_page() reads ua.random to fill the User-Agent header.
ua = UserAgent()

def request_page(url, timeout=30):
    """Download ``url`` with browser-like request headers.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server (applied to both
            connecting and reading) before raising a timeout exception.
            Without a timeout a stalled connection would block its worker
            thread indefinitely.

    Returns:
        The ``requests.Response`` for the GET request. The HTTP status code
        is not checked here.

    Raises:
        requests.exceptions.RequestException: on connection failures or when
            ``timeout`` is exceeded.
    """
    headers = {
        # A different User-Agent string is drawn for every request.
        'User-Agent': ua.random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }

    return requests.get(url, headers=headers, timeout=timeout)

def extract_page(text):
    """Pull the abstract out of a page's HTML.

    Args:
        text: HTML source of the page.

    Returns:
        The whitespace-stripped text of the first ``<abstract>`` element,
        or ``None`` when the document contains no such element.
    """
    abstract_tag = BeautifulSoup(text, 'lxml').find('abstract', {})
    return None if abstract_tag is None else abstract_tag.text.strip()

In [6]:
def extract(url, retry=False, max_retries=5):
    """Fetch ``url`` and return its abstract, retrying a bounded number of times.

    Relies on the module-level names ``lock``, ``count``, ``df``, ``debug``,
    ``request_page`` and ``extract_page``.

    Args:
        url: Page to download.
        retry: When True the call is treated as a repeat attempt, so the
            shared progress counter is not incremented.
        max_retries: How many additional attempts are made when a page
            yields no abstract before giving up.

    Returns:
        The abstract text, or ``None`` if no attempt produced one.
    """
    global count

    attempts_left = max_retries
    while True:
        # The counter and the progress line are shared by all worker threads.
        with lock:
            if not retry:
                count = count + 1
            print(f'{count} / {len(df)}: requesting {url}')

        response = request_page(url)

        if debug:
            # Dump only after the response exists; every worker writes the same
            # file, so the write is serialised with the lock.
            with lock:
                with open('result.html', 'w', encoding='utf-8') as file:
                    file.write(response.text)

        text = extract_page(response.text)
        if text is not None:
            return text

        if attempts_left <= 0:
            # Bounded retries: a page that never contains an abstract must not
            # recurse/loop forever and take the whole pool.map down with it.
            print(f'giving up on {url} after {max_retries} retries')
            return None

        attempts_left -= 1
        retry = True
        print('retrying')

In [7]:
# Only regular *.csv files are processed; sub-folders and other entries in
# dir_path would otherwise make pd.read_csv fail. Sorting gives a stable order.
csv_names = sorted(
    entry for entry in os.listdir(dir_path)
    if entry.lower().endswith('.csv') and os.path.isfile(os.path.join(dir_path, entry))
)

for name in csv_names:
    path = os.path.join(dir_path, name)
    print(f'reading csv file: {name}')

    # skiprows=1 drops the first line of the file so the following line is used
    # as the header (presumably a metadata line in the export - confirm).
    # `df` and `count` stay module-level because extract() reads them as globals.
    df = pd.read_csv(path, skiprows=1)

    # Progress counter, reset for every file.
    count = 0

    # One extract() call per link, spread across the worker threads; results
    # come back in the same order as the links.
    results = pool.map(extract, list(df['result link']))

    df[col_name] = results

    save_path = os.path.join(out_dir, name)
    print(f'saving to: {save_path}')
    # utf_8_sig writes a BOM in front of the UTF-8 data.
    df.to_csv(save_path, encoding='utf_8_sig')

reading csv file: gp-search-20220411-212156.csv
1 / 58: requesting https://patents.google.com/patent/US8980816B2/en
2 / 58: requesting https://patents.google.com/patent/US10993642B2/en
3 / 58: requesting https://patents.google.com/patent/US10494767B2/en
4 / 58: requesting https://patents.google.com/patent/US10076820B2/en
5 / 58: requesting https://patents.google.com/patent/US10888896B2/en
6 / 58: requesting https://patents.google.com/patent/US9504640B2/en
7 / 58: requesting https://patents.google.com/patent/US20180134717A1/en
8 / 58: requesting https://patents.google.com/patent/US9248140B2/en
9 / 58: requesting https://patents.google.com/patent/US10624963B2/en
10 / 58: requesting https://patents.google.com/patent/US8299056B2/en
11 / 58: requesting https://patents.google.com/patent/US20170197973A1/en
12 / 58: requesting https://patents.google.com/patent/US8765170B2/en
13 / 58: requesting https://patents.google.com/patent/USRE44695E1/en
14 / 58: requesting https://patents.google.com/pate

In [8]:
# Preview the first rows of the last processed file instead of dumping the whole frame.
df.head(10)

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link,abstract
0,US-8980816-B2,Fibrous structures comprising particles and me...,The Procter & Gamble Company,"Andreas Josef Dreher, Mark Robert Sivik, Alyss...",2012-01-04,2013-01-03,2015-03-17,2015-03-17,https://patents.google.com/patent/US8980816B2/en,https://patentimages.storage.googleapis.com/92...,Fibrous structures containing one or more part...
1,US-10993642-B2,Analyte sensor,"Dexcom, Inc.","Peter C. Simpson, James H. Brauker, Mark C. Br...",2004-07-13,2020-11-03,2021-05-04,2021-05-04,https://patents.google.com/patent/US10993642B2/en,https://patentimages.storage.googleapis.com/c9...,The present invention relates generally to sys...
2,US-10494767-B2,Fibrous structures including an active agent a...,The Procter & Gamble Company,"Paul Thomas Weisman, Hui Yang, Alrick Vincent ...",2013-12-09,2014-12-03,2019-12-03,2019-12-03,https://patents.google.com/patent/US10494767B2/en,https://patentimages.storage.googleapis.com/99...,The present disclosure relates to fibrous stru...
3,US-10076820-B2,Abrasive article having a non-uniform distribu...,"Saint-Gobain Abrasives, Inc., Saint-Gobain Abr...","Anuj Seth, Julie M. Dinh-Ngoc, Vivek CHERUVARI...",2011-12-31,2017-04-07,2018-09-18,2018-09-18,https://patents.google.com/patent/US10076820B2/en,https://patentimages.storage.googleapis.com/ed...,An abrasive article having a plurality of aper...
4,US-10888896-B2,Activation method using modifying agent,The Boeing Company,"Stuart Arthur Bateman, Ranya Simons, Dong Yang...",2005-01-21,2018-01-30,2021-01-12,2021-01-12,https://patents.google.com/patent/US10888896B2/en,https://patentimages.storage.googleapis.com/52...,The invention further relates to an activation...
5,US-9504640-B2,Modified release formulations of a bupropion s...,Valeant Pharmaceuticals Luxembourg S.Á.R.L.,"Werner Oberegger, Fang Zhou, Paul Maes, Stefan...",2005-06-27,2014-09-02,2016-11-29,2016-11-29,https://patents.google.com/patent/US9504640B2/en,https://patentimages.storage.googleapis.com/c0...,The present invention relates to pharmaceutica...
6,US-2018134717-A1,Pyrrolobenzodiazepines and conjugates thereof,Medimmune Limited,"Philip Wilson Howard, Luke Masterson, Arnaud T...",2010-04-15,2017-10-04,2018-05-17,,https://patents.google.com/patent/US2018013471...,https://patentimages.storage.googleapis.com/0c...,Conjugates and compounds for making conjugates...
7,US-9248140-B2,Chemical compounds,Astrazeneca Ab,"Robert Bruce DIEBOLD, Thomas Woodrow Gero, Pau...",2010-08-06,2015-02-18,2016-02-02,2016-02-02,https://patents.google.com/patent/US9248140B2/en,,The present invention relates to compounds of ...
8,US-10624963-B2,Multivalent PCV2 immunogenic compositions and ...,"Boehringer Ingelheim Vetmedica, Inc.","Michael B. Roof, Phillip Wayne Hayes, Marc All...",2005-12-29,2018-01-31,2020-04-21,2020-04-21,https://patents.google.com/patent/US10624963B2/en,https://patentimages.storage.googleapis.com/9a...,An improved method for recovering the protein ...
9,US-8299056-B2,"Aminotriazolopyridines, compositions thereof, ...","Signal Pharmaceuticals, Llc","Sogole Bahmanyar, R.J. Bates, Kate Blease, And...",2008-09-08,2009-09-08,2012-10-30,2012-10-30,https://patents.google.com/patent/US8299056B2/en,,Provided herein are Heteroaryl Compounds of fo...
