In [None]:
import time
import re
import random

keywords = [
    "model",
    "generative",
    "llm",
    "multimodal",
    "inference",
    "benchmark",
    "image",
    "audio",
    "tts",
    "diffusion",
    "flow matching",
    "adversarial",
    "network",
    "representation",
    "training",
    "loss",
    "sampling",
    "agent",
    "foundation",
    "recommendation",
    "robust",
    "cnn",
    "learning",
    "rl",
    "detection",
    "retrieval",
    "denoising",
    "language",
    "video",
    "speech",
    "reasoning",
    "policy",
    "attention",
    "supervised",
    "autoregressive",
    "speech recognition",
    "synthesis",
    "probability",
]

surnames = [
  { "surname": "Kim",   "ratio_percent": 13.5 },
  { "surname": "Lee",   "ratio_percent": 8.5 },
  { "surname": "Park",  "ratio_percent": 5.5 },
  { "surname": "Jung",  "ratio_percent": 2.5 },
  { "surname": "Jeong",  "ratio_percent": 1.5 },
  { "surname": "Choi",  "ratio_percent": 3.5 },
  { "surname": "Cho",   "ratio_percent": 2.5 },
  { "surname": "Kang",  "ratio_percent": 2.5 },
  { "surname": "Yoon",  "ratio_percent": 2.5 },
  { "surname": "Lim",   "ratio_percent": 1.5 },
  { "surname": "Jang",  "ratio_percent": 1.5 },
  { "surname": "Han",   "ratio_percent": 1.5 },
  { "surname": "Oh",    "ratio_percent": 1.5 },
  { "surname": "Seo",   "ratio_percent": 1.5 },
  { "surname": "Shin",  "ratio_percent": 1.5 },
  { "surname": "Kwon",  "ratio_percent": 1.5 },
  { "surname": "Hwang", "ratio_percent": 1.5 },
  { "surname": "Ahn",   "ratio_percent": 1.5 },
  { "surname": "Song",  "ratio_percent": 1.5 },
  { "surname": "Ryu",   "ratio_percent": 1.5 },
  { "surname": "Hong",  "ratio_percent": 1.5 },
  { "surname": "Bae",  "ratio_percent": 1.0 },
  { "surname": "Woo",  "ratio_percent": 1.0 },
  { "surname": "Yun",  "ratio_percent": 1.0 },
  { "surname": "Son",  "ratio_percent": 1.0 },
]

def sample_hap():
    names = [item["surname"] for item in surnames]
    weights = [item["ratio_percent"] for item in surnames]
    return [random.choices(keywords, k=1)[0].lower(), random.choices(names, weights=weights, k=1)[0].lower()]

In [23]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException, JavascriptException
from selenium_stealth import stealth
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "ko,en;q=0.9",
}


# ---------- Selenium ----------
def build_driver(headless=True, user_agent=None):
    opts = Options()
    
    opts.add_argument('--headless')
    opts.add_argument('--no-sandbox')
    opts.add_argument('--disable-dev-shm-usage')
    
    opts.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
    opts.add_experimental_option('useAutomationExtension', False)
    
    opts.page_load_strategy = "eager"
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1200,2400")
    opts.add_argument("--lang=ko-KR")

    # 리소스 최소화(이미지/CSS 차단)
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.stylesheets": 2,
        "profile.managed_default_content_settings.cookies": 1,
        "profile.managed_default_content_settings.javascript": 1,
    }
    opts.add_experimental_option("prefs", prefs)
    if user_agent:
        opts.add_argument(f"--user-agent={user_agent}")
    driver = webdriver.Chrome(options=opts)
    try:
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
        })
    except Exception:
        pass
    driver.set_page_load_timeout(2)
    
    stealth(driver,
        languages=['en-US', 'en'],
        vendor='Google Inc.',
        platform='Win32',
        webgl_vendor='Intel Inc.',
        renderer='Intel Iris OpenGL Engine',
        fix_hairline=True
    )
    
    return driver

ModuleNotFoundError: No module named 'selenium'

In [None]:
drv = build_driver()

In [None]:
from collections import defaultdict
import pandas as pd
import csv

author_list = defaultdict(list)
paper_list = defaultdict(list)
searched_combs = []

pds = pd.read_csv("output.csv").to_dict(orient="records")
datas = pds

def get_titles_and_author_ids(bsoup):
    papers = bsoup.find_all("div", class_=re.compile("gs_scl"))
    for paper in papers:
        try:
            title = paper.find("h3", class_=re.compile("gs_rt")).text
            author_ids = [re.sub('&hl=en&oi=sra', '', a['href']) for a in paper.find("div", class_=re.compile("gs_a")).find_all("a")]
            author_names = [a.text for a in paper.find("div", class_=re.compile("gs_a")).find_all("a")]
            matches = re.findall(r"20\d{2}", paper.find("div", class_=re.compile("gs_a")).text)
            paper_year = matches[-1]
    
            ctns = paper.find("div", class_=re.compile("gs_flb")).find_all("a")
            if len(ctns)>2:
                citation_nums = ctns[2].text
                if "Related" not in citation_nums:
                    citation_nums = citation_nums[len("Cited by "):]
                else:
                    citation_nums = "0"
            else:
                citation_nums = "0"
    
            for ai in range(len(author_ids)):
                author_list[author_ids[ai]].append(title)
                datas.append({
                    "title": title,
                    "author_id": author_ids[ai],
                    "author_names": author_names[ai],
                    "paper_year": int(paper_year) if paper_year else None,
                    "citation_nums": int(citation_nums)
                })
        except Exception as e:
            print("In paper error ", e)
            return False
    return True

for i in range(100):
    rdns = sample_hap()
    k = rdns[0]
    n = rdns[1]

    indexes = random.sample([b for b in range(50)], k=15)
    for idx in indexes:
        searched_combs.append(f"{k}_{n}_{idx}")
        pn = idx*10
        try:
            drv.get(f"https://scholar.google.com/scholar?start={pn}?hl=en&as_sdt=0%2C5&as_ylo=2020&q={k}+{n}&btnG=")
            time.sleep(5.0)
            
            html = drv.page_source
            soup = BeautifulSoup(html, "html.parser")

            assert "solving the above CAPTCHA" not in soup.text
        except Exception as e:
            print("error ", e)
            time.sleep(300.0)
            continue
        is_ok = get_titles_and_author_ids(soup)
        print("Data lens : ", len(datas))
        df = pd.DataFrame(datas)
        df.to_csv("output.csv", index=False)
        sdf = pd.DataFrame(searched_combs)
        sdf.to_csv("searched_combs.csv", index=False)
        if not is_ok:
            time.sleep(300.0)
        time.sleep(15.0)

In [None]:
papers = soup.find_all("div", class_=re.compile("gs_scl"))
print(len(papers))
for paper in papers:
    title = paper.find("h3", class_=re.compile("gs_rt")).text
    author_ids = [re.sub('&hl=en&oi=sra', '', a['href']) for a in paper.find("div", class_=re.compile("gs_a")).find_all("a")]
    print(title, author_ids)

In [None]:
import pandas as pd

pds = pd.read_csv("output.csv")
pds

In [None]:
from serpapi import GoogleSearch

params = {
  "engine": "google_scholar_author",
  "author_id": "P2oSOR0AAAAJ",
  "api_key": "",
    "sort": "pub_date"
}

search = GoogleSearch(params)
results = search.get_dict()
author = results["author"]

In [None]:
import os
import requests

API_KEY = ""
author_ids = ['FDG3_JMAAAAJ', 'tEyBgFQAAAAJ', '-ZJaGikAAAAJ', 'N2S3jFcAAAAJ', 'eGj3ay4AAAAJ']

params = {
    "engine": "google_scholar_author",
    "author_id": 'tEyBgFQAAAAJ',
    "hl": "en",
    "sort": "pubdate",  # pub_date 대신 pubdate (docs 기준)
    "start": 0,
    "api_key": API_KEY,
}

resp = requests.get("https://serpapi.com/search.json", params=params)
data = resp.json()



In [None]:
!python3 -m pip install pandas

In [None]:
import requests
import re
import pandas as pd
import random
import os
import time

api_key = ""
url = "https://api.scrapingdog.com/google_scholar"

datas = []

In [None]:
for count in range(400):
    rdns = sample_hap()
    k = rdns[0]
    n = rdns[1]
    indexes = random.sample([b for b in range(100)], k=5)

    for page in indexes:
        try:
            params = {
                "api_key": api_key,
                "query": f"{k} {n}",
                "results": "20",
                "page": page,
                "language": "en",
                "lr": "lang_en",
                "as_ylo": "2019",
                "as_yhi": "2026",
                "scisbd": False
            }

            response = requests.get(url, params=params)

            if response.status_code == 200:
                data = response.json()
            else:
                print(f"Request failed with status code: {response.status_code}")

            for sc in data['scholar_results']:
                try:
                    title = sc['title']
                    author_ids = [a['author_id'] for a in sc['authors']]
                    author_names = [a['name'] for a in sc['authors']]
                    matches = re.findall(r"20\d{2}", sc['displayed_link'])
                    paper_year = matches[-1]

                    citation_nums = sc['inline_links']['cited_by']['total']
                    if "Related" not in citation_nums:
                        citation_nums = citation_nums[len("Cited by "):]
                    else:
                        citation_nums = "0"

                    for ai in range(len(author_ids)):
                        datas.append({
                            "title": title,
                            "author_id": author_ids[ai],
                            "author_names": author_names[ai],
                            "paper_year": int(paper_year) if paper_year else None,
                            "citation_nums": int(citation_nums)
                        })
                except Exception as e:
                    print(f"[{k} {n} {page}] In paper error ", e)
                    continue
            
            print(f"[{k} {n} {page}] Data lens : ", len(datas))
            df = pd.DataFrame(datas)
            df.to_csv("output_onlydog.csv", index=False)
        except Exception as e:
            print(f"[{k} {n} {page}] In Out error ", e)
            continue
        

In [None]:
import requests
import requests
import re
import pandas as pd
import random
import os
import time

api_key = ""
url = "https://api.scrapingdog.com/google_scholar/profiles"
# url = "https://api.scrapingdog.com/google_scholar"

params = {
    "api_key": api_key,
    "mauthors": "snu",
    "results": "20",
    "page": 1,
    "language": "en",
    "lr": "lang_en",
}

response = requests.get(url, params=params)
# params = {
#     "api_key": api_key,
#     "mauthors": "snu"
# }

# response = requests.get(url, params=params)

if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print(f"Request failed with status code: {response.status_code}")


In [None]:
surnames = [
    "Kang", "Ko", "Gwak", "Gu", "Guk", "Kwon", "Keum", "Ki",
    "Na", "Nam", "Namgung", "Noh", "Non", "Dan", "Dam", "Dang",
    "Do", "Dokgo", "Dongbang", "Dong", "Du", "Ra", "Ryeong",
    "Ryu", "Ryuk", "Ri",
    "Ma", "Man", "Maeng", "Myeong", "Mo", "Mok", "Muk", "Moon",
    "Min", "Park", "Ban", "Bang", "Bae", "Baek", "Beom", "Byun",
    "Bok", "Bong", "Boo", "Bi", "Bin",
    "Sa", "Sam", "Sang", "Seo", "Seomun", "Seon", "Seonwoo", "Sung",
    "So", "Son", "Song", "Su", "Seung", "Si", "Shin", "Sim",
    "A", "Ahn", "Ae", "Yang", "Eo", "Eom", "Yeo", "Yeon", "Yeom",
    "Young", "Ye", "Oh", "Ok", "On", "Ong", "Wang", "Yo", "Yong",
    "Woo", "Won", "Wi", "Yu", "Yuk", "Yoon", "Eun", "Eum",
    "Lee", 
    "In", "Lim", "Jang", "Jeon", "Jeol", "Jung", "Je", "Jegal",
    "Jo", "Jwa", "Joo", "Juk", "Jun", "Ji", "Jin",
    "Cha", "Chae", "Cheo", "Cheon", "Cho", "Choi",
    "Chu", "Tak", "Tan", "Tang", "Tae",
    "Ha", "Hak", "Han", "Ham", "Heo", "Hyun", "Hyeong", "Ho",
    "Hong", "Hwa", "Hwang", "Hwangbo", "Hu", "Heung"
]

import pandas as pd

df = pd.read_csv("output_onlydog.csv")
print(len(df))

pattern = "|".join(surnames)

duplicated_title = df[df.duplicated(subset=['title'])]
print(len(duplicated_title))

duplicated = df[df.duplicated(subset=['author_id'])]
print(len(duplicated))

print(len(df))
filtered = duplicated[duplicated["author_names"].str.contains(pattern, case=False, na=False)]
print(len(filtered))

In [None]:
import requests

api_key = ""
url = "https://api.scrapingdog.com/google_scholar/author"

detail_profiles = []
extracted_authors = []

for aidx, author_id in tqdm(enumerate(author_ids)):
    params = {
        "api_key": api_key,
        "author_id": author_id,
        "page": 0,
    }

    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = response.json()
    else:
        print(f"Request failed with status code: {response.status_code}")
    
    author = data['author']

    input_data = {
        "author_id": "author_id",
        "name": author['name'],
        "affiliations": author['affiliations'],
        "email": author['email'],
        "interests": [a['title'] for a in author['interests']],
        "image_thumbnail": author['thumbnail'],
        "articles": [{
            "title": d['title'],
            "citation_id": d['citation_id'],
            "publication": d['publication'],
            "citation_count": d['cited_by']['value'],
            "year": int(d['year']),
        } for d in data['articles']],
        "total_citation_count": data['cited_by']['table'][0]['citations']['all'],
        "since_2020_citation_count": data['cited_by']['table'][0]['citations']['since_2020'],
        "h_index": data['cited_by']['table'][1]['h_index']['all'],
    }
    detail_profiles.append(input_data)

    if len(data['co_authors']) > 0:
        for co_author in data['co_authors']:
            author_input_data = {
                "author_id": co_author['author_id'],
                "author_names": co_author['name'],
                "affiliations": co_author['affiliations'],
            }
            extracted_authors.append(author_input_data)
    
    if aidx % 50 == 49:
        pd.DataFrame(detail_profiles).to_csv(f"detail_profiles_{aidx}.csv", index=False)
        pd.DataFrame(extracted_authors).to_csv(f"extracted_authors_{aidx}.csv", index=False)

In [None]:

params = {
    "api_key": api_key,
    "author_id": "Gz8lsIwAAAAJ",
    "page": 20,
}

response = requests.get(url, params=params)

if response.status_code == 200:
    data = response.json()
else:
    print(f"Request failed with status code: {response.status_code}")



In [None]:
surnames = [
    "Kang", "Ko", "Gwak", "Gu", "Guk", "Kwon", "Keum", "Ki",
    "Na", "Nam", "Namgung", "Noh", "Non", "Dan", "Dam", "Dang",
    "Do", "Dokgo", "Dongbang", "Dong", "Du", "Ra", "Ryeong",
    "Ryu", "Ryuk", "Ri",
    "Ma", "Man", "Maeng", "Myeong", "Mo", "Mok", "Muk", "Moon",
    "Min", "Park", "Ban", "Bang", "Bae", "Baek", "Beom", "Byun",
    "Bok", "Bong", "Boo", "Bi", "Bin",
    "Sa", "Sam", "Sang", "Seo", "Seomun", "Seon", "Seonwoo", "Sung",
    "So", "Son", "Song", "Su", "Seung", "Si", "Shin", "Sim",
    "A", "Ahn", "Ae", "Yang", "Eo", "Eom", "Yeo", "Yeon", "Yeom",
    "Young", "Ye", "Oh", "Ok", "On", "Ong", "Wang", "Yo", "Yong",
    "Woo", "Won", "Wi", "Yu", "Yuk", "Yoon", "Eun", "Eum",
    "Lee", 
    "In", "Lim", "Jang", "Jeon", "Jeol", "Jung", "Je", "Jegal",
    "Jo", "Jwa", "Joo", "Juk", "Jun", "Ji", "Jin",
    "Cha", "Chae", "Cheo", "Cheon", "Cho", "Choi",
    "Chu", "Tak", "Tan", "Tang", "Tae",
    "Ha", "Hak", "Han", "Ham", "Heo", "Hyun", "Hyeong", "Ho",
    "Hong", "Hwa", "Hwang", "Hwangbo", "Hu", "Heung"
]

import pandas as pd

df = pd.read_csv("output_onlydog.csv")
print(len(df))

pattern = "|".join(surnames)

duplicated_title = df[df.duplicated(subset=['title'])]
print(len(duplicated_title))

duplicated = df[df.duplicated(subset=['author_id'])]
print(len(duplicated))

filtered = duplicated[duplicated["author_names"].str.contains(pattern, case=False, na=False)]
print(len(filtered))

author_ids = filtered['author_id'].tolist()

def save_data(aidx, detail_profiles, extracted_authors):
    pd.DataFrame(detail_profiles).to_csv(f"detail_profiles.csv", index=False)
    pd.DataFrame(extracted_authors).to_csv(f"extracted_authors.csv", index=False)

    with open(f"detail_profiles.json", "w") as f:
        json.dump(detail_profiles, f, indent=4)

    with open(f"extracted_authors.json", "w") as f:
        json.dump(extracted_authors, f, indent=4)
    
    print(f"Saved {len(detail_profiles)} detail_profiles and {len(extracted_authors)} extracted_authors")

In [None]:
import requests
from tqdm import tqdm
import pandas as pd
import json

api_key = ""
url = "https://api.scrapingdog.com/google_scholar/author"

detail_profiles = []
extracted_authors = []
used_author_ids = []

for aidx, author_id in tqdm(enumerate(author_ids[:500])):
    # -------- 1) 첫 페이지 요청 (page = 0) --------
    params = {
        "api_key": api_key,
        "author_id": author_id,
        "page": 0,
    }

    response = requests.get(url, params=params)

    if response.status_code != 200:
        print(f"[author {author_id}] Request failed with status code: {response.status_code}")
        continue

    data = response.json()
    first_data = data  # co_authors, cited_by, author 정보는 첫 페이지 기준으로 사용
    all_articles = data.get("articles", []).copy()

    # -------- 2) articles 길이가 20이면 page를 20씩 늘려가며 추가 수집 --------
    # page: 20, 40, 60, 80, 100 까지 시도
    if len(data.get("articles", [])) == 20:
        for page in range(20, 101, 20):
            params = {
                "api_key": api_key,
                "author_id": author_id,
                "page": page,
            }

            response = requests.get(url, params=params)
            if response.status_code != 200:
                print(f"[author {author_id}] (page {page}) Request failed with status code: {response.status_code}")
                break

            page_data = response.json()
            page_articles = page_data.get("articles", [])

            # articles 말고는 아무것도 안 쓰고, 기존 all_articles에만 이어붙임
            if not page_articles:
                # 더 이상 가져올 게 없으면 중단
                break

            all_articles.extend(page_articles)

            # 이 페이지에서 20개 미만이면 더 이상 다음 page 안 감
            if len(page_articles) < 20:
                break

    # -------- 3) 첫 페이지의 author / citation / co_author + 모든 articles로 최종 구조 생성 --------
    author = first_data["author"]

    input_data = {
        "author_id": author_id,  # 원래 코드에 "author_id" 문자열이 들어가 있던 부분 수정
        "name": author.get("name"),
        "affiliations": author.get("affiliations"),
        "email": author.get("email"),
        "interests": [a["title"] for a in author.get("interests", [])],
        "image_thumbnail": author.get("thumbnail"),
        "articles": [
            {
                "title": d.get("title"),
                "citation_id": d.get("citation_id"),
                "publication": d.get("publication"),
                "citation_count": d.get("cited_by", {}).get("value"),
                # year가 없을 수도 있으니 방어코드 (원래처럼 int()만 쓰고 싶으면 아래 두 줄을 d["year"]로 바꿔도 됨)
                "year": int(d["year"]) if d.get("year") not in (None, "") else None,
            }
            for d in all_articles if d.get("title") != ""
        ],
        "total_citation_count": first_data["cited_by"]["table"][0]["citations"]["all"],
        "since_2020_citation_count": first_data["cited_by"]["table"][0]["citations"]["since_2020"],
        "h_index": first_data["cited_by"]["table"][1]["h_index"]["all"],
    }
    detail_profiles.append(input_data)

    # co_authors도 첫 페이지 기준으로만 읽음
    co_authors = first_data.get("co_authors", [])
    if co_authors:
        for co_author in co_authors:
            author_input_data = {
                "author_id": co_author.get("author_id"),
                "author_names": co_author.get("name"),
                "affiliations": co_author.get("affiliations"),
            }
            extracted_authors.append(author_input_data)

    # -------- 4) 주기적으로 csv 저장 --------
    if aidx % 50 == 49:
        save_data(aidx, detail_profiles, extracted_authors)

save_data(aidx, detail_profiles, extracted_authors)


## Google 검색으로 Google scholar profile 찾기

In [None]:
import requests
import pandas as pd
from tqdm import tqdm

api_key = ""
url = "https://api.scrapingdog.com/google"

unis = {
    'korea.ac.kr': 1, 'yonsei': 1, 'snu': 2, 'kaist': 2, 'postech': 1
}
keywords = ["deep", "machine", "artificial intelligence", "computer"] # 각 3000명씩

datas = []
for uni in unis:
    for keyword in keywords:
        for idx in tqdm(range(1, int(300 * unis[uni]))):
            params = {
                "api_key": api_key,
                "query": f"{uni} {keyword} site:https://scholar.google.com/citations",
                "country": "kr",
                "page": f"{idx}",
                "advance_search": "true",
                "domain": "google.com"
            }

            response = requests.get(url, params=params)

            if response.status_code == 200:
                data = response.json()
            else:
                print(f"[{idx}] - Request failed with status code: {response.status_code}")

            try:
                for d in data['organic_results']:
                    name = d['title']
                    link = d['link']
                    author_id = link.split('/citations?user=')[1].split('&')[0]
                    snippet = d['snippet']

                    input_data = {
                        'name': name,
                        'author_id': author_id,
                        'snippet': snippet,
                        'link': link,
                        'source': d['source']
                    }
                    datas.append(input_data)
                
                if idx == 1:
                    print(f"\nStart for total_results: {data['search_information']['total_results']}\n\n")
                if data['search_information']['total_results'] < idx*10:
                    print(f"\n\nEnd for total_results: {data['search_information']['total_results']}\n\n")
                    break
            except Exception as e:
                print(f"Error: {e}")
                continue

        pd.DataFrame(datas).to_csv('scholar_search_results.csv', index=False)

In [21]:
import requests
import pandas as pd
from tqdm import tqdm

api_key = ""
url = "https://api.scrapingdog.com/google"

idx = 1
uni = "MIT"
keyword = "deep"

params = {
    "api_key": api_key,
    "query": f"{uni} {keyword} site:https://scholar.google.com/citations",
    "country": "kr",
    "page": f"{idx}",
    "advance_search": "true",
    "domain": "google.com"
}

response = requests.get(url, params=params)

if response.status_code == 200:
    data = response.json()
else:
    print(f"[{idx}] - Request failed with status code: {response.status_code}")

try:
    for d in data['organic_results']:
        name = d['title']
        link = d['link']
        author_id = link.split('/citations?user=')[1].split('&')[0]
        snippet = d['snippet']

        input_data = {
            'name': name,
            'author_id': author_id,
            'snippet': snippet,
            'link': link,
            'source': d['source']
        }
        datas.append(input_data)
    
    if idx == 1:
        print(f"\nStart for total_results: {data['search_information']['total_results']}\n\n")
    if data['search_information']['total_results'] < idx*10:
        print(f"\n\nEnd for total_results: {data['search_information']['total_results']}\n\n")
        break
except Exception as e:
    print(f"Error: {e}")
    continue

pd.DataFrame(datas).to_csv('scholar_search_results.csv', index=False)

SyntaxError: 'break' outside loop (1060610328.py, line 48)