### Import and define functions to crawl pages

In [1]:
from selenium.webdriver.support.ui import WebDriverWait
import undetected_chromedriver as uc
import re, bs4, random, time
import pandas as pd

def get_rendered_html(url: str, timeout: int = 10) -> str:
    """Load *url* in a headless Chrome session and return the rendered HTML.

    Args:
        url: Address of the page to open.
        timeout: Maximum number of seconds to wait for the document to
            report that it has finished loading.

    Returns:
        The browser's current page source as a string.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import TimeoutException

    driver = uc.Chrome(headless=True, use_subprocess=False)

    try:
        driver.get(url)
        try:
            # Constructing a WebDriverWait does nothing by itself; the wait
            # only happens once .until() is called with a condition.
            WebDriverWait(driver, timeout).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except TimeoutException:
            # Best effort: still hand back whatever has been rendered so far
            # instead of failing the whole crawl.
            print(f"Timed out after {timeout} seconds waiting for {url} to finish loading.")
        html = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed.
        driver.quit()

    return html

def escape_bold(text: str) -> str:
    """Return *text* with every ``<b>`` and ``</b>`` tag removed.

    Only the tags themselves are dropped; the text between them is kept.
    """
    bold_tag = re.compile(r'</?b>')
    return bold_tag.sub('', text)

def build_url(page: int, per_page: int = 20) -> str:
    """Build the Google Scholar search URL for one page of results.

    The query, language (``hl=en``) and lower year bound (``as_ylo=2024``)
    are fixed; only the result offset and page size vary.

    Args:
        page: 1-based page number.
        per_page: Number of results requested per page. It drives both the
            ``start`` offset and the ``num`` parameter so the two cannot
            drift apart.

    Returns:
        The complete search URL as a string.
    """
    # Earlier, narrower query kept for reference:
    # return f"https://scholar.google.com/scholar?start={(page-1)*10}&q=mamba%7C%22state+space%22+recommend%7Crecommender%7Crecommendation&hl=en&as_sdt=0,5"
    start = (page - 1) * per_page
    return (
        f"https://scholar.google.com/scholar?start={start}"
        "&q=(mamba+OR+%22state+space%22+OR+%22state-space%22+OR+%22state+spaces%22+OR+%22state-spaces%22)"
        "+AND+(recommend+OR+recommender+OR+recommendation)"
        f"&hl=en&as_sdt=0,5&as_ylo=2024&num={per_page}"
    )

### Functions to parse paper info

In [None]:
def get_paper_info(paper):
    """Extract the fields of one search result.

    Args:
        paper: A parsed result element that supports ``find`` and
            ``find_all`` (a BeautifulSoup tag).

    Returns:
        A tuple ``(title, authors, link, abstract, citations)``. Any field
        whose element is missing from the result is ``None``.
    """
    def text_of(node):
        # Stripped text of an element, or None when the element is absent.
        return node.get_text(strip=True) if node is not None else None

    # Look each element up once and reuse it.
    heading = paper.find('h3', class_="gs_rt")
    title = text_of(heading)
    authors = text_of(paper.find('div', class_="gs_a"))

    anchor = heading.find('a') if heading is not None else None
    link = anchor.get('href') if anchor is not None else None

    abstract = text_of(paper.find('div', class_="gs_rs"))

    # Pick the citation link by its target rather than by position: a result
    # without a citation link has some other link in third place.
    # NOTE(review): assumes the "Cited by N" link's href contains "cites=" --
    # confirm against the saved pages.
    citations = None
    for footer in paper.find_all('div', class_="gs_fl"):
        for footer_link in footer.find_all('a', href=True):
            if 'cites=' in footer_link['href']:
                citations = footer_link.get_text(strip=True)
                break
        if citations is not None:
            break

    return (
        title,
        authors,
        link,
        abstract,
        citations
    )
    
def crawl_page(idx):
    """Pause for a random delay, then fetch the rendered HTML of page *idx*.

    The delay is drawn uniformly and scaled to at most ten seconds; it is
    announced on stdout before sleeping.
    """
    pause = 10 * random.uniform(0, 1)
    print(f"Waiting for {pause} seconds before crawling page {idx}... ")
    time.sleep(pause)
    url = build_url(idx)
    return get_rendered_html(url, timeout=10)

def parse_page(html_raw: str) -> list:
    """Parse one results page into a list of paper tuples.

    Args:
        html_raw: Raw HTML of a results page.

    Returns:
        One ``get_paper_info`` tuple per result found. The list is empty
        when the page has no results container.
    """
    # Bold tags are stripped before parsing.
    soup = bs4.BeautifulSoup(escape_bold(html_raw), 'html.parser')

    container = soup.find('div', id="gs_res_ccl_mid")
    if container is None:
        # Indexing an empty find_all() result would raise an opaque
        # IndexError here; report the problem and yield no papers instead.
        print("warning: no results container (id='gs_res_ccl_mid') found; page may be blocked or empty.")
        return []

    papers = container.find_all('div', class_="gs_r gs_or gs_scl")
    print(f"done. found {len(papers)} papers.")
    return [get_paper_info(paper) for paper in papers]

### Crawl pages

In [3]:
# Fetch result pages 1 through 10 in order, keeping each page's raw HTML.
data = list(map(crawl_page, range(1, 11)))

Waiting for 5.826511522821813 seconds before crawling page 1... 

IndexError: list index out of range

### Optional: manually visit the pages and download their HTML, then load it into memory

In [3]:
def load_page(idx:int) -> str:
    """Read the saved HTML for page *idx* from ``crawl-mamba-rec/`` as UTF-8 text."""
    path = f'crawl-mamba-rec/page{idx}.html'
    with open(path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Load the 20 saved pages, then parse them in order into one flat list.
htmls = list(map(load_page, range(1, 21)))
papers = []
for html in htmls:
    papers.extend(parse_page(html))

done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.
done. found 20 papers.


In [8]:
# Write every parsed paper to a CSV file, one row per paper, without the index column.
pd.DataFrame(
    data=papers,
    columns=['Title', 'Authors', 'Link', 'Abstract', 'Citations'],
).to_csv('scholar_data.csv', index=False)