In [None]:
from google_scholar_py import CustomGoogleScholarProfiles
import json

# Scrape Google Scholar profiles matching the query. Pagination and the
# CSV/JSON export options are switched off; the result is only printed.
parser = CustomGoogleScholarProfiles()
data = parser.scrape_google_scholar_profiles(
    query='blizzard',
    pagination=False,
    save_to_csv=False,
    save_to_json=False
)
# ensure_ascii=False keeps non-ASCII text (e.g. author names) readable instead
# of printing it as \uXXXX escape sequences.
print(json.dumps(data, indent=2, ensure_ascii=False))
# Presumably the pip packages the scraper above depends on (verify): selenium-stealth webdriver-manager selectolax parsel pandas serpapi

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException, JavascriptException
from selenium_stealth import stealth

# Browser-like request headers: a desktop Chrome 120 on Windows 10 User-Agent
# and an Accept-Language that prefers Korean, then English.
# NOTE(review): no cell visible here reads HEADERS; build_driver() only applies
# a user agent when one is passed explicitly. Confirm whether this is still needed.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "ko,en;q=0.9",
}


# ---------- Selenium ----------
def build_driver(headless=True, user_agent=None, page_load_timeout=2):
    """Create a Chrome WebDriver set up for lightweight, low-profile scraping.

    Args:
        headless: When true, Chrome runs without a visible window.
        user_agent: Optional User-Agent string passed to Chrome; Chrome's own
            default is kept when this is falsy.
        page_load_timeout: Seconds a page load may take before Selenium raises
            ``TimeoutException`` from ``driver.get``.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.

    Raises:
        Exception: Anything raised while starting Chrome, or while applying the
            timeout / stealth patches. In the latter case the browser is shut
            down before the error propagates.
    """
    opts = Options()

    # Honour the flag: --headless used to be added unconditionally, which made
    # headless=False a no-op.
    if headless:
        opts.add_argument('--headless')
    opts.add_argument('--no-sandbox')
    opts.add_argument('--disable-dev-shm-usage')

    # Drop the switches/extension that advertise browser automation.
    opts.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
    opts.add_experimental_option('useAutomationExtension', False)

    # "eager": navigation returns once the DOM is ready, without waiting for
    # every sub-resource.
    opts.page_load_strategy = "eager"
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1200,2400")
    opts.add_argument("--lang=ko-KR")

    # Minimize resource usage (block images/CSS). In these content settings
    # 2 means "block" and 1 means "allow".
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.stylesheets": 2,
        "profile.managed_default_content_settings.cookies": 1,
        "profile.managed_default_content_settings.javascript": 1,
    }
    opts.add_experimental_option("prefs", prefs)
    if user_agent:
        opts.add_argument(f"--user-agent={user_agent}")

    driver = webdriver.Chrome(options=opts)
    try:
        try:
            # Hide navigator.webdriver on every new document.
            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
            })
        except Exception:
            # Deliberately best-effort: the driver is still usable without
            # this patch, so a failure here must not abort the setup.
            pass
        driver.set_page_load_timeout(page_load_timeout)

        stealth(driver,
            languages=['en-US', 'en'],
            vendor='Google Inc.',
            platform='Win32',
            webgl_vendor='Intel Inc.',
            renderer='Intel Iris OpenGL Engine',
            fix_hairline=True
        )
    except Exception:
        # Do not leave an orphaned Chrome process behind when setup fails.
        driver.quit()
        raise

    return driver

In [2]:
# Start one browser session with build_driver()'s defaults (headless Chrome);
# the later cells reuse `drv` for every page fetch.
# NOTE(review): no cell visible here calls drv.quit(), so the browser process
# stays alive until the kernel is shut down.
drv = build_driver()

In [7]:
import time
from bs4 import BeautifulSoup
import re

# Load a Google Scholar search-results page for "model kim" (start=40 offset),
# pause for a fixed two seconds, then snapshot the DOM and parse it.
drv.get(
    "https://scholar.google.com/scholar?q=model+kim&hl=en&as_sdt=0,5&start=40"
)
time.sleep(2.0)
html = drv.page_source
soup = BeautifulSoup(html, features="html.parser")

In [18]:
# Show the first anchor inside the last <td> of the parsed page. For the search
# results page this is the "Next" pagination link (see the cell output).
# Raises IndexError if the page contains no <td> at all.
soup.find_all("td")[-1].find("a")

<a href="/scholar?start=50&amp;q=model+kim&amp;hl=en&amp;as_sdt=0,5"><span class="gs_ico gs_ico_nav_next"></span><b style="display:block;margin-left:53px">Next</b></a>

In [None]:
# Pull the header fields out of `soup`.
# NOTE(review): the gsc_prf_* / gsc_rsb_* selectors presumably target a Scholar
# *profile* page; confirm that `soup` holds such a page before running this cell.
name = soup.find("div", id="gsc_prf_in").text
topics = [
    anchor.text
    for anchor in soup.find("div", id="gsc_prf_int").find_all("a")
]

# Statistic cells, read positionally in document order.
cits = soup.find_all("td", class_=re.compile('gsc_rsb_std'))
total_citations = cits[0].text
total_citations_after_2020 = cits[1].text
h_index = cits[2].text

# First matching info line and the links listed in the first matching block.
profile_text = soup.find_all("div", class_=re.compile("gsc_prf_il"))[0].text
personal_links = [
    dict(name=anchor.text, link=anchor['href'])
    for anchor in soup.find_all("div", id=re.compile("gsc_prf_ivh"))[0].find_all("a")
]

profile_image_src = soup.find("div", id='gsc_prf_pua').find('img')['src']

In [None]:
# Turn every publication row in `soup` into a dict of title, absolute link,
# citation count and year.
pub_lists = soup.find_all("tr", class_=re.compile("gsc_a_tr"))

pubs = []
for pub in pub_lists:
    atag = pub.find("a")
    if atag is None or not atag.get("href"):
        # Row without a title link: nothing to record.
        continue
    pub_title = atag.text
    pub_link = atag['href']

    # The year <span> and the "cited by" <a> are looked up on the whole row.
    # They were previously searched inside the title link itself, which never
    # matched, so a bare `except:` silently turned both values into None.
    # NOTE(review): assumes the first match in the row is the year / citation
    # element; confirm against the live markup.
    year_tag = pub.find("span", class_=re.compile("gsc_a_h"))
    cited_tag = pub.find("a", class_=re.compile("gsc_a_ac"))
    pub_year = (year_tag.text.strip() or None) if year_tag is not None else None
    pub_citation_count = (cited_tag.text.strip() or None) if cited_tag is not None else None

    pubs.append({
        "title": pub_title,
        "link": "https://scholar.google.com" + pub_link,
        "citation_count": pub_citation_count,
        # The year was computed before but never stored.
        "year": pub_year,
    })

In [None]:
# Collect every <div> whose class matches "gsc_rsb_aa".
# NOTE(review): presumably the co-author entries of a Scholar profile sidebar; verify.
cos = soup.find_all("div", class_=re.compile("gsc_rsb_aa"))

In [None]:
# Display the text of the first match; raises IndexError when `cos` is empty.
cos[0].text

In [None]:
import time
from bs4 import BeautifulSoup
import re

# Load the Scholar author search for the label "deep_learning", pause for a
# fixed six seconds, then snapshot the DOM and parse it.
drv.get(
    "https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:deep_learning"
)
time.sleep(6.0)
html = drv.page_source
soup = BeautifulSoup(html, features="html.parser")

In [None]:
# Display every <li> whose class matches "qKvP1b".
# NOTE(review): "qKvP1b" looks like an auto-generated class name; confirm it
# exists on the page currently held in `soup`, otherwise this shows an empty list.
soup.find_all("li", class_=re.compile("qKvP1b"))

In [None]:
import time
from bs4 import BeautifulSoup
import re

# Load the repositories tab of the GitHub user "thxxx", pause for a fixed two
# seconds, then snapshot the DOM and parse it.
drv.get(
    "https://github.com/thxxx?tab=repositories"
)
time.sleep(2.0)
html = drv.page_source
soup = BeautifulSoup(html, features="html.parser")

import re

# Print the heading of every repository entry in the first container marked
# with data-filterable-for="your-repos-filter".
# Raises IndexError when the page has no such container.
repos = soup.find_all(attrs={"data-filterable-for": "your-repos-filter"})[0].find_all("li")
for r in repos:
    heading = r.find("h3")
    if heading is None:
        # find_all("li") is recursive, so nested list items without a
        # heading can show up here; they used to raise AttributeError.
        continue
    # Collapse the template's newlines/indentation into single spaces.
    print(" ".join(heading.get_text().split()))