In [None]:
import pandas as pd
import os
import glob

# Merge the baseline author table with every search-result shard and keep a
# single row per author_id.
prefix = "scholar_search_results"
pattern = f"{prefix}*.csv"
# glob returns paths in filesystem-dependent order; sort them so that
# drop_duplicates(keep='first') below selects the same row on every run.
csv_files = sorted(glob.glob(pattern))

# The baseline table goes first, so its rows win over shard rows on duplicates.
df4 = pd.read_csv('scholar_total.csv')
dfs = [df4]
for path in csv_files:
    try:
        df = pd.read_csv(path)
        dfs.append(df)
    except Exception as e:
        # Best effort: report the unreadable shard and continue with the rest.
        print(e, path)

merged = pd.concat(dfs, ignore_index=True)
print(len(merged))
clean = merged.drop_duplicates(subset=["author_id"], keep='first')
print(len(clean))

clean.to_csv("scholar_from_google.csv", index=False)

In [None]:
# Keep only papers whose title contains a deep-learning topic.
# These keywords are later joined with "|" into one regex and matched against
# lower-cased titles, so every entry is a lower-case substring pattern.
# Entries with a trailing space ("training ", "vlm ") are kept exactly as
# written. Exact duplicate entries were removed; they added nothing to the
# alternation.
deep_learning_keywords = [
    'llm',
    'agent',
    'large model',
    'multimodal',
    'adversarial network',
    'generation',

    # Core DL fields
    "deep learning",
    'machine learning',
    "neural network",
    "transformer",
    "attention",
    "autoregressive",
    "self-supervised",
    "contrastive learning",
    "representation learning",
    "data-driven",
    "latent space",
    "resnet",
    "convnext",
    "encoder",
    "decoder",
    "autoencoder",
    "codec",
    "nerf",
    "gaussian",
    "splatting",
    "neural",
    "super resolution",

    # Architectures
    "cnn",
    "lstm",
    "unet",
    "vit",
    "bert",
    "gpt",
    "vae",
    "gan",
    "vq-vae",

    # Generative models
    "diffusion",
    "diffusion model",
    "score-based",
    "flow matching",
    "normalizing flow",
    "generative model",

    # Multimodal ("multimodal" itself is already listed at the top)
    "vision-language",
    "audio-language",
    "clip model",
    "large language model",

    # Audio / speech
    "tts",
    "speech recognition",
    "audio generation",
    "music generation",

    # Training / optimisation vocabulary
    "backpropagation",
    "gradient descent",
    "finetuning",
    "fine-tuning",
    "pretraining",
    "pre-training",
    "zero-shot",
    "zeroshot",
    "few-shot",
    "fewshot",
    "kl divergence",
    "cross entropy",

    # Reinforcement learning and miscellaneous
    "reinforcement learning",
    "policy gradient",
    "ppo",
    "grpo",
    "rlhf",
    "human feedback",
    "reward model",
    "graph",
    "feedforward",
    "feed-forward",
    "backward",
    "back-ward",
    "representation",
    "training ",
    "hyperparameter",
    "hyper-parameter",
    "supervised learning",

    "retrieval",
    "quantization",
    "vlm ",
    "sllm",

    # Very broad terms
    "computer",
    "algorithm",

    "system",
    "efficient",
]

# Exhaustive romanised Korean surname candidates, kept for reference only.
# The original cell assigned this list to `surnames` and immediately
# overwrote it with the curated list below, so it never took effect; it now
# has its own name to make that explicit.
surnames_full = [
    "Kang", "Ko", "Gwak", "Gu", "Guk", "Kwon", "Keum", "Ki",
    "Na", "Nam", "Namgung", "Noh", "Non", "Dan", "Dam", "Dang",
    "Do", "Dokgo", "Dongbang", "Dong", "Du", "Ra", "Ryeong",
    "Ryu", "Ryuk", "Ri",
    "Ma", "Man", "Maeng", "Myeong", "Mo", "Mok", "Muk", "Moon",
    "Min", "Park", "Ban", "Bang", "Bae", "Baek", "Beom", "Byun",
    "Bok", "Bong", "Boo", "Bi", "Bin",
    "Sa", "Sam", "Sang", "Seo", "Seomun", "Seon", "Seonwoo", "Sung",
    "So", "Son", "Song", "Su", "Seung", "Si", "Shin", "Sim",
    "A", "Ahn", "Ae", "Yang", "Eo", "Eom", "Yeo", "Yeon", "Yeom",
    "Young", "Ye", "Oh", "Ok", "On", "Ong", "Wang", "Yo", "Yong",
    "Woo", "Won", "Wi", "Yu", "Yuk", "Yoon", "Eun", "Eum",
    "Lee",
    "In", "Lim", "Jang", "Jeon", "Jeol", "Jung", "Je", "Jegal",
    "Jo", "Jwa", "Joo", "Juk", "Jun", "Ji", "Jin",
    "Cha", "Chae", "Cheo", "Cheon", "Cho", "Choi",
    "Chu", "Tak", "Tan", "Tang", "Tae",
    "Ha", "Hak", "Han", "Ham", "Heo", "Hyun", "Hyeong", "Ho",
    "Hong", "Hwa", "Hwang", "Hwangbo", "Hu", "Heung"
]

# Curated surname list actually used for filtering.
# Fix: "Ko" was missing its trailing comma, so Python's implicit string
# concatenation turned "Ko" "Park" into the single entry "KoPark". The
# duplicate "Park" further down was dropped.
surnames = [
    "Kang", "Kwon", "Ko",
    "Park", "Bang", "Bae", "Baek",
    "Yong",
    "Woo", "Won", "Yoon", "Eun", "Eum",
    "Lee",
    "In", "Lim", "Jang", "Jeon", "Jung",
    "Jo", "Jin",
    "Cha", "Cheon", "Cho", "Choi",
    "Ha", "Hak", "Han", "Ham", "Heo",
    "Kim",
    "Hong", "Hwang",
    "Shin", "Song", "Seo",
    "Kwak", "Ryu", "Soh", "Roh", "Im", "Ahn", "Koh",
    "Nam", "Oh", "Huh", "Son"
]

import pandas as pd

# -----------------------------------------------------------
# 0) Load data
# -----------------------------------------------------------
from_scholar = pd.read_csv("scholar_from_scholar_search_total.csv")
total_rows = len(from_scholar)

print("\n=== BASIC INFO ===")
print(f"Total rows in from_scholar: {total_rows:,}")

# -----------------------------------------------------------
# 1) Unique authors (first row seen per author_id)
# -----------------------------------------------------------
author_unique = from_scholar.drop_duplicates(subset=["author_id"], keep="first")
print(f"Unique authors: {len(author_unique):,}")

# -----------------------------------------------------------
# 2) Filter deep-learning related papers
#    - deep_learning_keywords must already be defined
# -----------------------------------------------------------
pattern_dl = "|".join(deep_learning_keywords)

# Rows whose lower-cased title matches any DL keyword; missing titles never match.
title_lower = from_scholar["title"].fillna("").str.lower()
has_dl_title = title_lower.str.contains(pattern_dl.lower(), regex=True, na=False)
df_dl = from_scholar[has_dl_title]

# One row per author_id among the DL-titled papers
df_dl_unique = df_dl.drop_duplicates(subset=["author_id"], keep="first")
print("\n=== DEEP LEARNING FILTER ===")
print(f"Authors with DL-related papers : {len(df_dl_unique):,}")

# -----------------------------------------------------------
# 3) Authors with exactly one paper (DL or not)
# -----------------------------------------------------------
paper_counts = from_scholar.groupby("author_id").size()
one_paper_ids = set(paper_counts.loc[paper_counts.eq(1)].index)
print(f"Authors with exactly 1 paper   : {len(one_paper_ids):,}")

# -----------------------------------------------------------
# 4) Target set: has a DL paper OR has exactly one paper
# -----------------------------------------------------------
dl_ids = set(df_dl_unique["author_id"])
target_ids = dl_ids | one_paper_ids

print("\n=== TARGET AUTHORS (DL OR 1 PAPER) ===")
print(f"DL authors (>=1 DL paper)      : {len(dl_ids):,}")
print(f"Union(DL authors, 1-paper)     : {len(target_ids):,}")

# Keep a single representative row for each target author
is_target = from_scholar["author_id"].isin(target_ids)
df_target_authors = (
    from_scholar.loc[is_target]
    .drop_duplicates(subset=["author_id"], keep="first")
    .copy()
)
print(f"Target author rows (unique)    : {len(df_target_authors):,}")

# -----------------------------------------------------------
# 5) Korean surname filter
#    - surnames: list of romanised Korean surnames
#    - each alternative is the lower-cased surname with a leading space
#      (" kim", " park", ...), matched as a substring of the name field
# -----------------------------------------------------------
pattern_kr = "|".join(" " + s.lower() for s in surnames)

df_target_authors["author_lower"] = (
    df_target_authors["author_names"].fillna("").str.lower()
)

is_korean = df_target_authors["author_lower"].str.contains(
    pattern_kr, regex=True, na=False
)
df_korean_final = df_target_authors[is_korean].copy()

print("\n=== KOREAN AUTHORS (DL OR 1 PAPER) ===")
print(f"Korean DL/1-paper authors      : {len(df_korean_final):,}")

# -----------------------------------------------------------
# 6) SUMMARY
# -----------------------------------------------------------
print("\n=== SUMMARY ===")
print(f"Total rows in original CSV     : {len(from_scholar):,}")
print(f"Total unique authors           : {len(author_unique):,}")
print(f"DL authors (>=1 DL paper)      : {len(df_dl_unique):,}")
print(f"Authors with exactly 1 paper   : {len(one_paper_ids):,}")
print(f"Target authors (DL or 1 paper) : {len(df_target_authors):,}")
print(f"Korean target authors          : {len(df_korean_final):,}\n")



In [None]:
# Number of DL-titled papers per author (rows of df_dl grouped by author_id).
# NOTE: this rebinds `paper_counts`, which the previous cell used for the
# per-author count over ALL papers in from_scholar.
paper_counts = df_dl.groupby('author_id').size()

def bucket_paper_count(n):
    """Return the histogram bucket label for a paper count.

    Counts equal to 1, 2 or 3 each have their own bucket, values from 4
    through 5 share "4-5 papers", and anything else is labelled "6+ papers".
    """
    exact_labels = {1: "1 paper", 2: "2 papers", 3: "3 papers"}
    if n in exact_labels:
        return exact_labels[n]
    if 4 <= n <= 5:
        return "4-5 papers"
    return "6+ papers"
import pandas as pd

# Label each author's paper count with its bucket.
buckets = paper_counts.map(bucket_paper_count)

# Tally the buckets and pin the display order; buckets with no authors show 0.
bucket_order = ["1 paper", "2 papers", "3 papers", "4-5 papers", "6+ papers"]
bucket_counts = buckets.value_counts().reindex(bucket_order, fill_value=0)

total_authors = len(paper_counts)

# Absolute count per bucket plus its share of all authors (4 decimals).
summary = pd.DataFrame({
    "count": bucket_counts,
    "ratio": bucket_counts.div(total_authors).round(4),
})

print(summary)


In [None]:
from tqdm import tqdm
# Combine the Scholar-search authors and the Google-search authors into one
# (name, author_id) table and keep the first row seen per author_id.
from_google = pd.read_csv('scholar_from_google.csv')

# NOTE(review): `from_scholar_deep_learning_korean` is not defined in any cell
# of this notebook as shown; it presumably corresponds to `df_korean_final` --
# confirm, otherwise this cell fails on a fresh kernel.
scholar_part = from_scholar_deep_learning_korean[['author_names', 'author_id']].rename(
    columns={'author_names': 'name'}
)
google_part = from_google[['name', 'author_id']]

# Column-wise selection + concat replaces the two row-by-row iloc loops.
# Scholar rows stay first so they win on duplicates.
combined = pd.concat([scholar_part, google_part], ignore_index=True)

# Kept as a list of {'name', 'author_id'} records, as the loops produced.
datas = combined.to_dict('records')
print(len(datas))

newdf = combined.drop_duplicates(subset=['author_id'], keep='first')
print(len(newdf))

In [None]:
# Write the Korean target-author table (df_korean_final) to CSV without the index column.
df_korean_final.to_csv("deep_or_one_korean.csv", index=False)

### make coauthor list

In [None]:
import pandas as pd
import json

# Load the existing coauthor adjacency map (author_id -> list of coauthor ids).
# A context manager closes the file handle deterministically.
with open('coauthors.json') as f:
    current_coauthor_map = json.load(f)
print(len(current_coauthor_map))
extracted_authors = pd.read_csv("most_extracted_authors.csv")

# A row can only form an edge when both endpoints are present. Previously only
# from_author_id was checked, so a missing author_id became the node "nan".
edges = extracted_authors.dropna(subset=['author_id', 'from_author_id'])

for author_id, from_author_id in zip(edges['author_id'], edges['from_author_id']):
    a = str(author_id)
    b = str(from_author_id)

    # Create the adjacency lists on first sight of either endpoint.
    a_neighbors = current_coauthor_map.setdefault(a, [])
    b_neighbors = current_coauthor_map.setdefault(b, [])

    # Add the edge in both directions, skipping entries that already exist.
    if b not in a_neighbors:
        a_neighbors.append(b)
    if a not in b_neighbors:
        b_neighbors.append(a)

# Save (the context manager flushes and closes the file before the cell ends)
print(len(current_coauthor_map))
with open('coauthors.json', 'w') as f:
    json.dump(current_coauthor_map, f)


### After fetching detail profiles

In [None]:
import pandas as pd
import os
import glob

# Concatenate every detail-profile shard under ./middle_files into one CSV.
prefix = "./middle_files/detail_profiles"
pattern = f"{prefix}*.csv"
# glob order is filesystem-dependent; sort so the merged file has the same row
# order on every run (a later cell de-duplicates with keep='first').
csv_files = sorted(glob.glob(pattern))

dfs = []
for path in csv_files:
    try:
        df = pd.read_csv(path)
        dfs.append(df)
    except Exception as e:
        # Best effort: report the unreadable shard and continue with the rest.
        print(e, path)

merged = pd.concat(dfs, ignore_index=True)
print(len(merged))

merged.to_csv("1212_scholar_detail_profiles.csv", index=False)

In [None]:
import pandas as pd

# Fold the freshly merged profile dump into the running master file.
df1 = pd.read_csv("1212_scholar_detail_profiles.csv")
df2 = pd.read_csv("most_recent_total_detail_profiles.csv")

# df1 comes first, so on duplicate author_id its row is the one kept.
combined = pd.concat([df1, df2], ignore_index=True)
print(len(combined))

df = combined.drop_duplicates(subset=['author_id'], keep='first')
print(len(df))

# Overwrites the master file that was read above.
df.to_csv("most_recent_total_detail_profiles.csv", index=False)



In [None]:
import pandas as pd
import os
import glob

# Concatenate every extracted-author shard under ./middle_files into one CSV.
prefix = "./middle_files/extracted_authors"
pattern = f"{prefix}*.csv"
# glob order is filesystem-dependent; sort so the merged file has the same row
# order on every run.
csv_files = sorted(glob.glob(pattern))

dfs = []
for path in csv_files:
    try:
        df = pd.read_csv(path)
        dfs.append(df)
    except Exception as e:
        # Best effort: report the unreadable shard and continue with the rest.
        print(e, path)

merged = pd.concat(dfs, ignore_index=True)
print(len(merged))

merged.to_csv("1212_extracted_authors.csv", index=False)

In [None]:
from collections import Counter

# NOTE(review): `df_filtered_not` is not defined in any cell shown, and
# `dl_keywords` / df['interests_list'] are only created in a LATER cell, so
# this cell depends on out-of-order execution -- confirm the intended order.

# Every distinct interest string appearing in df_filtered_not.
interests = set()
for interests_list in df_filtered_not['interests_list']:
    interests.update(interests_list)
# (The original bare `interests` expression sat mid-cell, where it displays
# nothing; inspect the set in its own cell if needed.)

# For each DL keyword, count the profiles whose joined interests contain it.
# The loop variable used to be named `interests` too, which overwrote the set
# built above with the last profile's list.
counter = Counter()
for profile_interests in df['interests_list']:
    text = " ".join(profile_interests).lower()
    for kw in dl_keywords:
        if kw in text:
            counter[kw] += 1

# Keyword -> count, most frequent first.
keyword_counts = dict(sorted(counter.items(), key=lambda x: x[1], reverse=True))


In [None]:
# Interest phrases treated as deep-learning / AI related. They are lower-cased
# into `dl_keywords` and matched as substrings of a profile's joined interests.
# Misspelled entries are kept as written and annotated where the intended
# spelling is known.
#
# Fix: the list used to begin with a stray "" literal. It was harmless only
# because it had no trailing comma (implicit concatenation with the next
# string); with a comma, the empty string would have matched every profile.
# Exact duplicate entries were also removed.
dl_interests = [
    # Core AI / ML
    "artificial intelligence", "machine learning", "deep learning", "neural networks",
    "representation learning", "computational intelligence", "statistical learning",
    "pattern recognition", "data mining", "optimization", "reinforcement learning",
    "probabilistic modeling", "generative models", "foundation models",

    # Deep Learning Subfields
    "computer vision", "image processing", "image recognition", "object detection",
    "semantic segmentation", "instance segmentation", "image classification",
    "video understanding", "video analysis", "3d vision", "scene understanding",
    "medical image analysis", "computational photography", "robot perception",

    # NLP / Speech
    "natural language processing", "nlp", "text mining", "question answering",
    "sentiment analysis", "language modeling", "speech recognition",
    "speech processing", "speech synthesis", "tts", "asr", "dialog systems",
    "multimodal learning",

    # Audio / Signal
    "audio processing", "audio signal processing", "music information retrieval",
    "sound event detection",

    # Model Architecture / Optimization
    "transformers", "attention", "graph neural networks", "gnn",
    "convolutional neural networks", "cnn", "rnn", "lstm", "vae", "gan",
    "diffusion models", "flow matching", "autoregressive models",

    # Robotics / Control
    "robotics", "robot control", "robot learning", "autonomous systems",

    # Recommendation / Info Retrieval
    "information retrieval", "recommender systems", "recommendation systems",
    "knowledge graphs",

    # Other ML Areas
    "meta-learning", "federated learning", "domain adaptation",
    "transfer learning", "self-supervised learning", "semi-supervised learning",
    "unsupervised learning", "online learning", "active learning",

    # Applied ML
    "medical ai", "bioinformatics", "healthcare ai", "financial ai",
    "computational biology", "autonomous driving", "human-computer interaction",
    "llm", "llms", "large language models", "large language model", 'diffusion', 'generative model',

    # Robustness / Safety / Explainability
    "robustness", "interpretability", "explainable ai", "adversarial machine learning",
    "fairness", "ai safety",

    # Training / Optimization Topics
    "loss functions", "sampling", "regularization", "bayesian optimization",
    "stochastic optimization", "large scale learning",

    # Agents
    "agent", "multi-agent systems", "autonomous agents", "AI ", " AI", "reinforcement", "supervised", "rlhf", "ppo", 'finetuning', '3d', 'nerf', 'video', "image",

    'ML',
    'Multimodal Understanding',
    'Adversarial attacks',
    'Anomaly detection',
    'Tensor Mining',
    'AI-Embedded Software-on-Chip Lab',
    'Inference Serving',
    'neural network',
    'AI/ML/DL',
    'Automated Reasoning',
    'speech signal processing',
    'XAI',
    'Neural computation',
    'learning model',
    'Sim2Real',
    'Multimodal Discourse Analysis',
    'Action Recognition',
    'Neural Engineeing',        # == Neural Engineering
    'User Interface and Interaction',  # borderline, but UI+AI research is possible
    'computational methods',    # broad, but usually ML-heavy
    'data-driven analysis',
    'Neural coding',
    'Network Neuroscience',     # computational modeling domain
    'Neural interfaces',        # sometimes ML-driven
    'systems and computational neuroscience',
    'Causal inference',         # ML/AI research area
    'Computer security',        # ML subfield when tied to adversarial ML
    'Time Series Forcasting',   # (typo: Forecasting) common ML area
    'Gradient boosting',        # ML technique
    'Deep learining',           # typo -> deep learning
    'Vision Language Models',
    'Multiphysics modeling',    # borderline
    'artificial neural network',
    'artificial general intelligence',
    'computation',
    'computer'
]
# Lower-cased copies, compared against lower-cased interest text.
dl_keywords = [kw.lower() for kw in dl_interests]

def has_dl_interest_partial(interests):
    """Return True if any entry of `dl_keywords` occurs in the joined interests.

    The interest strings are joined with single spaces and lower-cased, and
    each keyword is then tested as a plain substring of that text. Matching is
    therefore partial: a short keyword such as "ppo" also matches inside a
    longer word such as "support".
    """
    haystack = " ".join(interests).lower()
    for keyword in dl_keywords:
        if keyword in haystack:
            return True
    return False

import ast

df = pd.read_csv("most_recent_total_detail_profiles.csv")
print(len(df))


def _parse_interests(raw):
    """Parse one `interests` cell (a Python-literal list stored as text).

    A missing cell (NaN) yields an empty list instead of making
    ast.literal_eval raise, matching how the `articles` column is guarded
    with pd.isna elsewhere in this notebook.
    """
    if pd.isna(raw):
        return []
    return ast.literal_eval(raw)


df['interests_list'] = df['interests'].apply(_parse_interests)
# Keep the profiles whose interests mention at least one DL keyword.
df_deeplearning_related = df[df['interests_list'].apply(has_dl_interest_partial)]
print(len(df_deeplearning_related))

In [None]:
# Curated romanised Korean surnames used to spot Korean coauthors.
# Fix: "Ko" was missing its trailing comma, so Python's implicit string
# concatenation turned "Ko" "Park" into the single entry "KoPark". The
# duplicate "Park" further down was dropped.
surnames = [
    "Kang", "Kwon", "Ko",
    "Park", "Bang", "Bae", "Baek",
    "Yong",
    "Woo", "Won", "Yoon", "Eun", "Eum",
    "Lee",
    "In", "Lim", "Jang", "Jeon", "Jung",
    "Jo", "Jin",
    "Cha", "Cheon", "Cho", "Choi",
    "Ha", "Hak", "Han", "Ham", "Heo",
    "Kim",
    "Hong", "Hwang",
    "Shin", "Song", "Seo",
    "Kwak", "Ryu", "Soh", "Roh", "Im", "Ahn", "Koh",
    "Nam", "Oh", "Huh", "Son"
]
# Regex alternation of lower-cased surnames, each preceded by a space.
pattern = "|".join(' ' + s.lower() for s in surnames)

# NOTE(review): this reads "1211_extracted_authors.csv" while the merge cell
# above writes "1212_extracted_authors.csv" -- confirm which file is intended.
coauthors = pd.read_csv("1211_extracted_authors.csv")

# na=False: rows with a missing author_names count as non-matches. Without it
# the mask contains NaN and boolean indexing raises.
korean = coauthors[
    coauthors["author_names"].str.lower().str.contains(pattern, regex=True, na=False)
]
print(len(korean))

# author_ids already present in the profile table `df`.
deeps = set(df['author_id'].tolist())

# Korean coauthors reached FROM an already-known author who are not known
# themselves (vectorised replacement of the row-by-row iloc loop).
reached_from_known = korean['from_author_id'].isin(deeps)
already_known = korean['author_id'].isin(deeps)
from_deep_coauthor_korean_ids = set(
    korean.loc[reached_from_known & ~already_known, 'author_id']
)

print(len(from_deep_coauthor_korean_ids))

In [None]:
# Also queue every Korean target author that is not in `deeps` yet
# (vectorised replacement of the row-by-row iloc loop).
not_yet_known = ~df_korean_final['author_id'].isin(deeps)
from_deep_coauthor_korean_ids.update(df_korean_final.loc[not_yet_known, 'author_id'])
print(len(from_deep_coauthor_korean_ids))

# Set iteration order is arbitrary, so sort the ids to give the CSV the same
# row order on every run. key=str keeps the sort valid even if a non-string
# value such as NaN is present.
datas = [
    {'author_id': sf}
    for sf in sorted(from_deep_coauthor_korean_ids, key=str)
]

newdf = pd.DataFrame(datas)
newdf.to_csv("from_coauthor_1211.csv", index=False)

### Scraping Citations

In [None]:
import pandas as pd

# Reload the master profile table and report its row count.
df = pd.read_csv("most_recent_total_detail_profiles.csv")
print(df.shape[0])

# Preview the first rows as the cell output.
df.head()

In [None]:
import ast
from tqdm import tqdm
from collections import defaultdict

# Distinct "<lower-cased title>_<year>" keys over all articles of the
# DL-related profiles.
citations_ids = set()
# Declared but not filled here; the line that populated it is disabled.
papers_citations = defaultdict(list)

for raw_articles in tqdm(df_deeplearning_related['articles']):
    # Profiles without an article list are skipped.
    if pd.isna(raw_articles):
        continue

    # `articles` holds a Python-literal list of dicts with 'title' and 'year'.
    citations_ids.update(
        f"{article['title'].lower()}_{article['year']}"
        for article in ast.literal_eval(raw_articles)
    )

print(len(citations_ids))

In [None]:
import requests

import os

# Read the key from the environment so it is never stored in the notebook.
# It falls back to the empty string the cell previously hardcoded.
api_key = os.environ.get("SCRAPINGDOG_API_KEY", "")
url = "https://api.scrapingdog.com/google_scholar/author"
author_id = "E0YZuSgAAAAJ"
citation_id = "E0YZuSgAAAAJ:4OULZ7Gr8RgC"

params = {
    "api_key": api_key,
    "author_id": author_id,
    "citation_id": citation_id,
    "view_op": "view_citation"
}

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, params=params, timeout=30)

if response.status_code == 200:
    payload = response.json()
    print(payload)
    # Build the flattened record only from a successful response. Previously
    # this ran unconditionally, so a failed request hit an undefined (or
    # stale, from an earlier run) `data`.
    data = {
        "author_id" : author_id,
        "citation_id" : citation_id,
        "title": payload['title'],
        "link": payload['link'],
        "description": payload['description'],
        "publication_date": payload['publication_date']
    }
else:
    print(f"Request failed with status code: {response.status_code}")
    data = None

In [None]:
!python3 -m pip install apify-client

In [None]:
from apify_client import ApifyClient

# Create the Apify client; the API token is left blank in this notebook.
client = ApifyClient("")

# Actor input: the profile URLs to process (blank placeholders here).
run_input = {"profileUrls": ["", ""]}

# Start the Actor and block until the run has finished.
run = client.actor("2SyF0bVxmgGr8IVCZ").call(run_input=run_input)

# Print and collect every item in the run's default dataset.
dataset = client.dataset(run["defaultDatasetId"])
items = []
for item in dataset.iterate_items():
    print(item)
    items.append(item)


In [116]:
import pandas as pd

# Master profile table.
df = pd.read_csv("most_recent_total_detail_profiles.csv")
print(len(df))

# Homepage links, reduced to one row per author_id.
hdf = pd.read_csv("only_home_links.csv").drop_duplicates(subset=['author_id'])
print(len(hdf))

# Left-join the home_link column onto the profiles by author_id; profiles
# without a link keep NaN in that column.
home_links = hdf[["author_id", "home_link"]]
merged = df.merge(home_links, on="author_id", how="left")

print(len(merged))

# The merged table is written to CSV in a later cell.

40662
40591
40662


In [117]:
# Number of profiles with no home_link after the merge (int() keeps the plain-integer display).
int(merged['home_link'].isna().sum())

23674

In [119]:

# Persist the profiles-with-homepage table; "utf-8-sig" writes a UTF-8 file with a BOM.
output_path = "most_recent_total_detail_profiles_with_homepage.csv"
merged.to_csv(output_path, index=False, encoding="utf-8-sig")