In [21]:
import time
from bs4 import BeautifulSoup
import requests, re, random, math, datetime
import pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

PAGE_SIZE = 10
USER_AGENT_LIST = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]
URL = 'https://scholar.google.com/scholar'
REQUEST_INTERVAL = 5 #how many saconds you'll wait in between requests to avoid Google's bot detector
CURRENT_YEAR = datetime.date.today().year

In [22]:
def get_user_agent():
    for _ in USER_AGENT_LIST:
        #Pick a random user agent
        user_agent = random.choice(USER_AGENT_LIST)
        
    return user_agent

def try_or(func, default=None, expected_exc=(Exception,)):
    try:
        return func()
    except expected_exc:
        return default
    
def extract_publication_info(publication_info):
    splits = publication_info.split("- ")
    authors = splits[0].strip()
    venue, year, publisher = None, None, None
    if len(splits) > 1:
        if "," in splits[1]:
            venue = ", ".join(splits[1].split(", ")[:-1])
            year = int(splits[1].split(", ")[-1].strip())
        publisher = splits[-1].strip()
    return authors, publisher, venue, year

def get_number_pages(query:str, start_date:int=None, end_date:int=None, is_review:bool=False) -> int:
    #Set the headers 
    headers = { 'User-Agent': get_user_agent() }
    
    params = {
        'q': query,             # search query
        'hl': 'en',             # Language
        'as_ylo': start_date,   # Custom range... start date
        'as_yhi': end_date,     # Custom range... end date
        'as_rr': int(is_review) # Whether or not to filter reviews/surveys
    }
    html = requests.get(URL, headers=headers, params=params).text
        
    n_results = try_or(
        lambda: int(re.search(r'About (.*?) results', html).group(1)),
        default=-1, expected_exc=(Exception,)
    )

    n_pages = math.ceil(n_results/PAGE_SIZE) if n_results > 0 else n_results

    return n_pages

In [23]:
def scraper_google_scholar(query:str, date_start:int=None, date_end:int=None, is_review:bool=False) -> pd.DataFrame:

    n_pages = get_number_pages(query=query, start_date=date_start, end_date=date_end, is_review=is_review)
    
    if n_pages == -1:
        print("You got caught! Too many request...")
        raise pd.errors.EmptyDataError
    
    data = []
    update_rate = 100/n_pages
    with tqdm(total=100) as pbar:
        for page in range(n_pages): 
            headers = { 'User-agent': get_user_agent() }
            params = {
                'start': page * PAGE_SIZE,
                'q': query,  # search query
                'hl': 'en',  # language of the search
                'as_ylo': date_start, 
                'as_yhi': date_end,  
                'as_rr': int(is_review)
            }
            html = requests.get(URL, headers=headers, params=params).text
            soup = BeautifulSoup(html, 'lxml')

            # Container where all needed data is located
            for result in soup.select('.gs_r.gs_or.gs_scl'):
                title = result.select_one('.gs_rt').text
                publication_info = result.select_one('.gs_a').text
                authors, publisher, venue, year = extract_publication_info(publication_info)
                snippet = result.select_one('.gs_rs').text
                cited_by = result.select_one('#gs_res_ccl_mid .gs_nph+ a')['href']
                pdf_link = try_or(
                    lambda: result.select_one('.gs_or_ggsm a:nth-child(1)')['href'],
                    default=None, expected_exc=(Exception,)
                )
                title_link = try_or(
                    lambda: result.select_one('.gs_rt a')['href'],
                    default=None, expected_exc=(Exception,)
                )
                n_citations = try_or(
                    lambda: int(result.find(lambda tag:tag.name=="a" and "Cited by" in tag.text).text.replace("Cited by ", "")),
                    default=0, expected_exc=(Exception,)
                )
                cit_per_year = try_or(
                    lambda: math.floor(n_citations/(1 + (CURRENT_YEAR - year))),
                    default=n_citations, expected_exc=(Exception,)
                )
                data.append({
                    'title': title,
                    'title_link': title_link,
                    'publisher': publisher,
                    'venue': venue,
                    'authors': authors,
                    'publication_info': publication_info,
                    'snippet': snippet,
                    'cited_by': f'https://scholar.google.com{cited_by}',
                    "pdf_link": pdf_link,
                    'n_citations': n_citations,
                    'year': year,
                    'cit/year': cit_per_year
                })
            time.sleep(REQUEST_INTERVAL)  # Avoiding to many queries in a small time window.
            pbar.set_description("Finding papers (page %s)" % (page + 1))
            pbar.update(update_rate)

    df = pd.DataFrame.from_dict(data)
    df = df.sort_values(by="n_citations", ascending=False, ignore_index=True)
    df["year"] = df["year"].astype("Int64")
    return df

In [24]:
df_papers = scraper_google_scholar(
    query='~chest ~caption "report generation" "x ray" -intitle:segmentation -intitle:classification -"case study" -"case report"',
    date_start=2018, date_end=2023, is_review=False
)
df_papers

  0%|          | 0/39 [00:00<?, ?it/s]

Unnamed: 0,title,title_link,publisher,vanue,authors,publication_info,snippet,cited_by,pdf_link,n_citations,year,cit/year
0,Artificial intelligence in radiology,https://www.nature.com/articles/s41568-018-0016-5,nature.com,Nature Reviews …,"A Hosny, C Parmar, J Quackenbush…","A Hosny, C Parmar, J Quackenbush… - Nature Rev...",… As the report generation task falls towards ...,https://scholar.google.com/scholar?cites=49447...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2025,2018,337
1,Canadian Association of Radiologists white pap...,https://journals.sagepub.com/doi/abs/10.1016/j...,journals.sagepub.com,Canadian …,"A Tang, R Tam, A Cadrin-Chênevert…","A Tang, R Tam, A Cadrin-Chênevert… - Canadian ...","… in image recognition, caption generation, an...",https://scholar.google.com/scholar?cites=13474...,https://journals.sagepub.com/doi/full/10.1016/...,399,2018,66
2,Contrastive learning of medical visual represe...,https://proceedings.mlr.press/v182/zhang22a.html,proceedings.mlr.press,Machine Learning …,"Y Zhang, H Jiang, Y Miura…","Y Zhang, H Jiang, Y Miura… - Machine Learning ...",… is similar to that of the Caption-Transforme...,https://scholar.google.com/scholar?cites=88442...,https://proceedings.mlr.press/v182/zhang22a/zh...,302,2022,151
3,Hybrid retrieval-generation reinforced agent f...,https://proceedings.neurips.cc/paper/2018/hash...,proceedings.neurips.cc,Advances in neural …,"Y Li, X Liang, Z Hu, EP Xing","Y Li, X Liang, Z Hu, EP Xing - Advances in neu...",… report generation. The middle column is a re...,https://scholar.google.com/scholar?cites=17238...,https://proceedings.neurips.cc/paper/2018/file...,231,2018,38
4,Clinically accurate chest x-ray report generation,http://proceedings.mlr.press/v106/liu19a.html,proceedings.mlr.press,Machine Learning …,"G Liu, TMH Hsu, M McDermott…","G Liu, TMH Hsu, M McDermott… - Machine Learnin...","… In this work, we present a domain-aware auto...",https://scholar.google.com/scholar?cites=12820...,http://proceedings.mlr.press/v106/liu19a/liu19...,179,2019,35
...,...,...,...,...,...,...,...,...,...,...,...,...
325,Towards Automated Healthcare: Deep Vision and ...,https://dash.harvard.edu/handle/1/37376413,dash.harvard.edu,,K Tian,K Tian - 2023 - dash.harvard.edu,… with one or more chest X-ray images taken fr...,https://scholar.google.com/scholar?cluster=361...,https://dash.harvard.edu/bitstream/handle/1/37...,0,,0
326,[HTML][HTML] AI-based radiodiagnosis using che...,https://www.frontiersin.org/articles/10.3389/f...,frontiersin.org,Frontiers in Big Data,"Y Akhter, R Singh, M Vatsa","Y Akhter, R Singh, M Vatsa - Frontiers in Big ...","… , where an X-ray beam passes the patient che...",https://scholar.google.com/scholar?q=related:M...,https://www.frontiersin.org/articles/10.3389/f...,0,2023,0
327,[PDF][PDF] A multimodal approach to automated ...,https://ceur-ws.org/Vol-3307/paper2.pdf,ceur-ws.org,,"G Leonardi, L Portinale, A Santomauro","G Leonardi, L Portinale, A Santomauro - 2022 -...","… text generation from images (eg, image capti...",https://scholar.google.com/scholar?q=related:D...,https://ceur-ws.org/Vol-3307/paper2.pdf,0,,0
328,Evaluating GPT4 on Impressions Generation in R...,https://pubs.rsna.org/doi/full/10.1148/radiol....,pubs.rsna.org,Radiology,"Z Sun, H Ong, P Kennedy, L Tang, S Chen, J Elias…","Z Sun, H Ong, P Kennedy, L Tang, S Chen, J Eli...",… residents using the chest X-ray randomly pic...,https://scholar.google.com/scholar?cluster=153...,,0,2023,0


In [26]:
df_papers.to_json("./papers.json")