In [10]:
import pandas as pd

df1 = pd.read_csv("../files/output (1).csv")
df2 = pd.read_csv("../files/output_backup.csv")
df3 = pd.read_csv("../files/output_onlydog.csv")
merged = pd.concat([df1, df2, df3])
print(len(merged))

merged['author_id'] = merged['author_id'].apply(lambda x: x.split('/citations?user=')[1] if '/citations?user=' in x else x)

merged = merged.drop_duplicates(subset=['title', 'author_id'], keep='first')
print(len(merged))

555798
217248


In [18]:
import pandas as pd
from collections import defaultdict
import json

coauthors = defaultdict(set)

# Step 1: group by title
for title, group in merged.groupby("title"):
    authors = group["author_id"].tolist()
    
    # 단독 저자라도 키를 먼저 만들어준다
    for a in authors:
        coauthors[a]  # touch key
    
    # Step 2: add coauthors
    n = len(authors)
    for i in range(n):
        for j in range(i + 1, n):
            a, b = authors[i], authors[j]
            coauthors[a].add(b)
            coauthors[b].add(a)

# Step 3: convert sets to lists
coauthors = {k: sorted(list(v)) for k, v in coauthors.items()}

with open("coauthors.json", "w") as f:
    json.dump(coauthors, f)


### Get One Scholar Profile

In [None]:
import requests
import pandas as pd
import json
import argparse
from tqdm import tqdm

api_key = ""
url = "https://api.scrapingdog.com/google_scholar/author"

df = pd.read_csv("scholar_from_google_deep_learning_korean.csv")
author_ids = df['author_id'].tolist()

detail_profiles = []
extracted_authors = []

for aidx, author_id in tqdm(enumerate(author_ids[5:8])):
    params = {
        "api_key": api_key,
        "author_id": author_id,
        "page": 0,
        "sort": "pubdate"
    }

    response = requests.get(url, params=params)

    if response.status_code != 200:
        print(f"[author {author_id}] Request failed with status code: {response.status_code}")
        continue

    data = response.json()
    first_data = data  # co_authors, cited_by, author 정보는 첫 페이지 기준으로 사용
    all_articles = data.get("articles", []).copy()

    if len(data.get("articles", [])) == 20:
        for page in range(20, 41, 20):
            params = {
                "api_key": api_key,
                "author_id": author_id,
                "page": page,
                "sort": "pubdate"
            }

            response = requests.get(url, params=params)
            if response.status_code != 200:
                print(f"[author {author_id}] (page {page}) Request failed with status code: {response.status_code}")
                break

            page_data = response.json()
            page_articles = page_data.get("articles", [])

            if not page_articles:
                break

            all_articles.extend(page_articles)

            if len(page_articles) < 20:
                break

    author = first_data["author"]

    input_data = {
        "author_id": author_id,  # 원래 코드에 "author_id" 문자열이 들어가 있던 부분 수정
        "name": author.get("name"),
        "affiliations": author.get("affiliations"),
        "email": author.get("email"),
        "interests": [a["title"] for a in author.get("interests", [])],
        "image_thumbnail": author.get("thumbnail"),
        "articles": [
            {
                "title": d.get("title"),
                "citation_id": d.get("citation_id"),
                "publication": d.get("publication"),
                "citation_count": d.get("cited_by", {}).get("value"),
                "year": int(d["year"]) if d.get("year") not in (None, "") else None,
            }
            for d in all_articles if d.get("title") != ""
        ],
        "total_citation_count": first_data["cited_by"]["table"][0]["citations"]["all"],
        "since_2020_citation_count": first_data["cited_by"]["table"][0]["citations"]["since_2020"],
        "h_index": first_data["cited_by"]["table"][1]["h_index"]["all"],
    }
    detail_profiles.append(input_data)

    # co_authors도 첫 페이지 기준으로만 읽음
    co_authors = first_data.get("co_authors", [])
    if co_authors:
        for co_author in co_authors:
            author_input_data = {
                "author_id": co_author.get("author_id"),
                "author_names": co_author.get("name"),
                "affiliations": co_author.get("affiliations"),
            }
            extracted_authors.append(author_input_data)
    print(input_data)

1it [00:06,  6.96s/it]

{'author_id': 'Tj996iwAAAAJ', 'name': 'Moon Il Kim', 'affiliations': 'Gachon University', 'email': 'Verified email at gachon.ac.kr', 'interests': ['Nanobiotechnology', 'Biosensor', 'Nanozyme'], 'image_thumbnail': 'https://scholar.google.comhttps://scholar.googleusercontent.com/citations?view_op=view_photo&user=Tj996iwAAAAJ&citpid=1', 'articles': [{'title': 'A colorimetric strategy for quantifying amino acids using E. coli auxotrophs displaying gold-binding proteins', 'citation_id': 'Tj996iwAAAAJ:t7zJ5fGR-2UC', 'publication': 'Biosensors and Bioelectronics, 118182, 2025', 'citation_count': '', 'year': 2025}, {'title': 'Hybrid nanoflower-incorporated foldable paper biosensor for colorimetric detection of sodium benzoate based on inhibition of D-amino acid oxidase', 'citation_id': 'Tj996iwAAAAJ:z_wVstp3MssC', 'publication': 'Microchemical Journal, 115979, 2025', 'citation_count': '', 'year': 2025}, {'title': 'Bridging the barrier: insights into blood biomarkers and therapeutic strategies 

2it [00:12,  6.21s/it]

{'author_id': 'tYy-bzgAAAAJ', 'name': 'Qiao Jin, MD', 'affiliations': 'National Institutes of Health', 'email': 'Verified email at nih.gov - Homepage', 'interests': ['Information Retrieval', 'Language Modeling', 'Biomedical Informatics', 'Medical AI'], 'image_thumbnail': 'https://scholar.google.comhttps://scholar.googleusercontent.com/citations?view_op=view_photo&user=tYy-bzgAAAAJ&citpid=10', 'articles': [{'title': 'AgentMD: Empowering language agents for risk prediction with large-scale clinical tool learning', 'citation_id': 'tYy-bzgAAAAJ:j3f4tGmQtD8C', 'publication': 'Nature Communications, 2025', 'citation_count': '32', 'year': 2025}, {'title': 'Adversarial prompt and fine-tuning attacks threaten medical large language models', 'citation_id': 'tYy-bzgAAAAJ:evX43VCCuoAC', 'publication': 'Nature Communications, 2025', 'citation_count': '13*', 'year': 2025}, {'title': 'A foundation model for human-ai collaboration in medical literature mining', 'citation_id': 'tYy-bzgAAAAJ:2KloaMYe4IU

3it [00:20,  6.69s/it]

{'author_id': 'JI12R3IAAAAJ', 'name': 'Eun-Song Lee', 'affiliations': 'Yonsei University College of Dentistry', 'email': 'Verified email at yuhs.ac', 'interests': ['Dentistry'], 'image_thumbnail': 'https://scholar.google.comhttps://scholar.googleusercontent.com/citations?view_op=view_photo&user=JI12R3IAAAAJ&citpid=3', 'articles': [{'title': 'Diagnostic accuracy of quantitative light-induced fluorescence in detecting caries of various types and locations: a systematic review and meta-analysis', 'citation_id': 'JI12R3IAAAAJ:g5m5HwL7SMYC', 'publication': 'Scientific Reports 15 (1), 39905, 2025', 'citation_count': '', 'year': 2025}, {'title': 'Biofluorescence imaging as a valid alternative for dental calculus detection Biofluorescence imaging as a valid alternative for dental calculus detection', 'citation_id': 'JI12R3IAAAAJ:2P1L_qKh6hAC', 'publication': 'Photodiagnosis and Photodynamic Therapy, 104738, 2025', 'citation_count': '', 'year': 2025}, {'title': 'Impact of denture use on mastica


