# Search for papers

Here I record the steps I took to fetch publications from Google Scholar search results. The process consists largely of two steps:

1. Obtain HTML files of search result pages
2. Parse the HTML files and form a tabular dataset

In [1]:
from selenium.webdriver.support.ui import WebDriverWait
import undetected_chromedriver as uc
import re, bs4, random, time
import pandas as pd

### 1-A. Crawl pages to get HTMLs

> `WARNING`: It is recommended to visit the Google Scholar search pages and download the HTML files manually, as described in `1-B`. Automated methods will likely fail.

In [2]:
def get_rendered_html(url: str, timeout: int = 10) -> str:
    '''
    visit a page in a headless Chrome session, wait until it is fully loaded,
    and return the HTML source

    url: address of the page to open
    timeout: maximum number of seconds to wait for the document to finish loading

    returns the page source reported by the driver; if the document is still
    loading after `timeout` seconds, whatever has been rendered so far is returned

    the browser is always shut down, even when loading the page raises
    '''
    # Local import: the file-level imports do not bring this name into scope.
    from selenium.common.exceptions import TimeoutException

    driver = uc.Chrome(headless=True, use_subprocess=False)

    try:
        driver.get(url)
        try:
            # Constructing WebDriverWait does not wait by itself; `.until()`
            # polls the condition until it is truthy or `timeout` elapses.
            WebDriverWait(driver, timeout).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
        except TimeoutException:
            # Best effort: previously no wait happened at all, so a slow page
            # still yields its current source instead of raising.
            pass
        html = driver.page_source
    finally:
        driver.quit()

    return html

def escape_bold(text: str) -> str:
    '''
    strip every bare `<b>` and `</b>` tag from the text, keeping the enclosed content
    '''
    bold_tag = re.compile(r'</?b>')
    return bold_tag.sub('', text)

def build_url(page):
    '''
    returns the url of a google scholar search page, given a 1-based page number

    each page requests 20 results (`num=20`), so page `n` starts at result `(n-1)*20`
    search term is `(mamba OR "state space" OR "state-space" OR "state spaces" OR "state-spaces") AND (recommend OR recommender OR recommendation)`
    the url also carries `hl=en`, `as_sdt=0,5` and `as_ylo=2024`
    '''
    offset = (page - 1) * 20
    return f"https://scholar.google.com/scholar?start={offset}&q=(mamba+OR+%22state+space%22+OR+%22state-space%22+OR+%22state+spaces%22+OR+%22state-spaces%22)+AND+(recommend+OR+recommender+OR+recommendation)&hl=en&as_sdt=0,5&as_ylo=2024&num=20"

def crawl_page(idx):
    '''
    pause for a random interval of up to 10 seconds, then fetch search result
    page `idx` and return its HTML source
    '''
    pause = 10 * random.uniform(0, 1)
    print(f"Waiting for {pause} seconds before crawling page {idx}... ")
    time.sleep(pause)

    target = build_url(idx)
    return get_rendered_html(target, timeout=10)

In [None]:
# Automated crawl of result pages 1 to 10. This will likely fail; use the
# manual method instead.
htmls = list(map(crawl_page, range(1, 11)))

### 1-B. Manually load HTMLs into memory

This is a much more reliable way to fetch search results.

In [3]:
def load_page(idx:int, dirname:str) -> str:
    '''
    Reads `{dirname}/page{idx}.html` as UTF-8 text and returns its full contents
    '''
    html_path = f'{dirname}/page{idx}.html'
    with open(html_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Read saved pages 1 to 20 for each of the two searches into memory.
pages_crawl = [
    load_page(page_no, 'google-scholar-crawl') for page_no in range(1, 21)
]
pages_citations = [
    load_page(page_no, 'google-scholar-citations') for page_no in range(1, 21)
]

### 2. Parse HTML to tabular data and save as file

In [4]:
def _find_citation_label(paper):
    '''
    return the text of the "Cited by N" link of a result entry, or `None` if the entry has none
    '''
    # Look through every footer-style div rather than only the first one, and
    # identify the link by what it is instead of by its position: the third
    # link is some other action when an entry has no "Cited by" link.
    for footer in paper.find_all('div', class_="gs_fl"):
        for anchor in footer.find_all('a'):
            label = anchor.get_text(strip=True)
            href = anchor.get('href') or ''
            # NOTE(review): assumes the cited-by link points at a `cites=` query
            # or is labelled "Cited by ..." (English UI); confirm on saved pages.
            if 'cites=' in href or label.startswith('Cited by'):
                return label
    return None


def get_paper_info(paper):
    '''
    extract the fields of a single search result entry

    paper: parsed element of one result (must support `find` / `find_all`)

    returns a tuple `(title, authors, link, abstract, citations)`; a field whose
    element is missing from the entry is `None` instead of raising
    '''
    # Look each element up once and reuse it.
    heading = paper.find('h3', class_="gs_rt")
    byline = paper.find('div', class_="gs_a")
    snippet = paper.find('div', class_="gs_rs")

    title = heading.get_text(strip=True) if heading is not None else None
    authors = byline.get_text(strip=True) if byline is not None else None
    abstract = snippet.get_text(strip=True) if snippet is not None else None

    # Entries without a linked title carry no anchor inside the heading.
    title_anchor = heading.find('a') if heading is not None else None
    link = title_anchor.get('href') if title_anchor is not None else None

    citations = _find_citation_label(paper)

    return (
        title,
        authors,
        link,
        abstract,
        citations
    )
    
def parse_page(html_raw: str) -> list:
    '''
    turn the raw HTML of one search result page into a list of paper tuples

    bold tags are stripped first; every result entry inside the first
    `gs_res_ccl_mid` container is then passed to `get_paper_info`
    '''
    cleaned = escape_bold(html_raw)
    soup = bs4.BeautifulSoup(cleaned, 'html.parser')

    container = soup.find_all('div', id="gs_res_ccl_mid")[0]
    entries = container.find_all('div', class_="gs_r gs_or gs_scl")
    print(f"done. found {len(entries)} papers.")

    return list(map(get_paper_info, entries))

##############################################################################
# Search A
##############################################################################

# Parse every loaded page of Search A and collect the rows in one flat list.
papers_crawl = []
for html in pages_crawl:
    papers_crawl.extend(parse_page(html))

# One row per paper, written out without the index column.
df_crawl = pd.DataFrame(
    papers_crawl,
    columns=['Title', 'Authors', 'Link', 'Abstract', 'Citations'],
)
df_crawl.to_csv('processed-data/scholar-data-crawl.csv', index=False)
print('Tabularized data saved to "processed-data/scholar-data-crawl.csv".')

##############################################################################
# Search B
##############################################################################

# Parse every loaded page of Search B and collect the rows in one flat list.
papers_citations = []
for html in pages_citations:
    papers_citations.extend(parse_page(html))

# One row per paper, written out without the index column.
df_citations = pd.DataFrame(
    papers_citations,
    columns=['Title', 'Authors', 'Link', 'Abstract', 'Citations'],
)
df_citations.to_csv('processed-data/scholar-data-citations.csv', index=False)
print('Tabularized data saved to "processed-data/scholar-data-citations.csv".')

done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
Tabularized data saved to "processed-data/scholar-data-crawl.csv".
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
Tabularized d