In [3]:
from bs4 import BeautifulSoup
import requests, re, random, json, math

# Step used when computing the `start` offset of each results page.
PAGE_SIZE = 10

# Pool of desktop-browser User-Agent headers; one is drawn at random per request.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
]

In [4]:
def get_user_agent(user_agents=None):
    """Return one User-Agent string picked at random.

    Args:
        user_agents: Optional non-empty sequence of User-Agent strings to
            choose from. When omitted, the module-level ``USER_AGENT_LIST``
            is used.

    Returns:
        A single element of the chosen sequence.

    Raises:
        IndexError: If the sequence to choose from is empty.
    """
    pool = USER_AGENT_LIST if user_agents is None else user_agents
    # A single draw is all that is needed; looping over the pool would only
    # burn extra random draws and leave nothing to return for an empty pool.
    return random.choice(pool)

In [5]:
def _parse_result_count(html: str) -> int:
    """Extract the total hit count from the HTML of a results page.

    Raises:
        ValueError: If no result counter can be located in ``html``.
    """
    # Prefer the "About N results" banner. The bare "N results" fallback is
    # for pages that omit the "About" prefix -- NOTE(review): presumably the
    # case for small result sets; confirm against a live page.
    match = (
        re.search(r'About\s+(\d[\d.,]*)\s+results?', html)
        or re.search(r'\b(\d[\d.,]*)\s+results?\b', html)
    )
    if match is None:
        raise ValueError(
            'Could not find a result count in the response '
            '(no results, or the request was blocked)'
        )
    # The count may carry thousands separators (e.g. "1,230"), which int()
    # rejects, so keep the digits only.
    return int(re.sub(r'\D', '', match.group(1)))


def scrape_number_pages(query:str, since:int=None, is_review:bool=False, timeout:float=30.0) -> int:
    """Return how many result pages a Google Scholar query spans.

    Args:
        query: Search query string, sent as the ``q`` parameter.
        since: Value sent as ``as_ylo``; ``None`` leaves it out of the request.
        is_review: Sent as ``as_rr`` (``1`` when true, ``0`` otherwise).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The reported result count divided by ``PAGE_SIZE``, rounded up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response contains no recognisable result count.
    """
    headers = { 'User-Agent': get_user_agent() }

    params = {
        'q': query,       # search query
        'hl': 'en',       # language of the search
        'as_ylo': since,
        'as_rr': int(is_review)
    }
    response = requests.get(
        'https://scholar.google.com/scholar',
        headers=headers,
        params=params,
        timeout=timeout,  # without a timeout a stalled connection hangs forever
    )
    # Fail with the HTTP status rather than a confusing parse error further down.
    response.raise_for_status()

    n_results = _parse_result_count(response.text)

    # Same page size the scraper uses for its `start` offsets.
    return math.ceil(n_results / PAGE_SIZE)

In [7]:
def _first_text(node, selector):
    """Return the text of the first element matching ``selector``, or None."""
    found = node.select_one(selector)
    return found.text if found is not None else None


def _first_href(node, selector):
    """Return the ``href`` of the first element matching ``selector``, or None."""
    found = node.select_one(selector)
    return found.get('href') if found is not None else None


def scraper_google_scholar(query:str, since:int=None, is_review:bool=False) -> list:
    """Scrape every result page of a Google Scholar query.

    The number of pages is obtained from ``scrape_number_pages``; each page is
    then fetched with a randomly chosen User-Agent and parsed with
    BeautifulSoup.

    Args:
        query: Search query string, sent as the ``q`` parameter.
        since: Value sent as ``as_ylo``; ``None`` leaves it out of the request.
        is_review: Sent as ``as_rr`` (``1`` when true, ``0`` otherwise).

    Returns:
        A list with one dict per result, holding the keys ``title``,
        ``title_link``, ``publication_info``, ``snippet``, ``cited_by`` and
        ``pdf_link``. Any field whose element is missing from the result is
        ``None``.
    """
    n_pages = scrape_number_pages(query=query, since=since, is_review=is_review)

    data = []

    for page in range(n_pages):
        headers = { 'User-Agent': get_user_agent() }
        params = {
            'start': page * PAGE_SIZE,
            'q': query,       # search query
            'hl': 'en',       # language of the search
            'as_ylo': since,
            'as_rr': int(is_review)
        }
        html = requests.get(
            'https://scholar.google.com/scholar',
            headers=headers,
            params=params,
            timeout=30,  # seconds; avoids hanging forever on a stalled connection
        ).text
        soup = BeautifulSoup(html, 'lxml')

        # Container where all needed data is located
        for result in soup.select('.gs_r.gs_or.gs_scl'):
            # Not every result carries every element, so each lookup is
            # guarded individually: one incomplete entry must not abort the
            # whole scrape.
            cited_by = _first_href(result, '#gs_res_ccl_mid .gs_nph+ a')

            data.append({
                'title': _first_text(result, '.gs_rt'),
                'title_link': _first_href(result, '.gs_rt a'),
                'publication_info': _first_text(result, '.gs_a'),
                'snippet': _first_text(result, '.gs_rs'),
                # The href is site-relative, so prefix the host when present.
                'cited_by': f'https://scholar.google.com{cited_by}' if cited_by else None,
                "pdf_link": _first_href(result, '.gs_or_ggsm a:nth-child(1)')
            })
    return data

# Run the search: non-review results from 2022 onwards for the query below.
papers = scraper_google_scholar(
    since=2022,
    is_review=False,
    query='~chest ~caption "report generation" "x ray" -intitle:segmentation -intitle:classification -"case study" -"case report"',
)

In [11]:
# Persist the scraped records as pretty-printed UTF-8 JSON.
# - The context manager closes the file deterministically.
# - `ensure_ascii` is a boolean flag: False keeps non-ASCII characters
#   readable instead of \uXXXX-escaping them, which in turn requires an
#   explicit UTF-8 file encoding.
with open("./papers.json", "w", encoding="utf-8") as papers_file:
    chars_written = papers_file.write(json.dumps(papers, indent=4, ensure_ascii=False))

# Show the number of characters written as the cell output.
chars_written

148672