In [None]:
import requests, time, json
import pandas as pd

BASE = "https://api.openalex.org/works"

def fetch_works_by_concept(concept_id="C2522767166", per_page=200, max_works=500, delay=0.5, timeout=30):
    """Download works tagged with an OpenAlex concept using page-number paging.

    Args:
        concept_id: Concept identifier placed in the ``concepts.id`` filter.
        per_page: Number of results requested per page.
        max_works: Upper bound on the number of works returned.
        delay: Seconds to sleep between two consecutive requests.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of at most ``max_works`` work records, taken from the
        ``results`` array of each response. If a request returns a non-200
        status, the error is printed and the works collected so far are
        returned.

    Raises:
        requests.exceptions.RequestException: on connection failure or when a
            response does not arrive within ``timeout`` seconds.
    """
    works = []
    page = 1
    while len(works) < max_works:
        params = {
            "filter": f"concepts.id:{concept_id}",
            "per_page": per_page,
            "page": page
        }
        # Without a timeout a stalled connection would block the notebook forever.
        resp = requests.get(BASE, params=params, timeout=timeout)
        if resp.status_code != 200:
            print("HTTP error", resp.status_code, resp.text)
            break
        items = resp.json().get("results", [])
        if not items:
            break
        works.extend(items)
        # A short page means the result set is exhausted; also stop as soon as
        # the quota is reached so no extra request (or sleep) is spent.
        if len(items) < per_page or len(works) >= max_works:
            break
        page += 1
        time.sleep(delay)
    # Whole pages are appended, so the last page may overshoot the limit.
    return works[:max_works]

def works_to_excel(works, filename="works.xlsx"):
    """Flatten work records into a table and write it to an Excel file.

    Each work becomes one row with the columns ``ID``, ``Title``, ``Year``,
    ``Cited_by``, ``Authors`` and ``Concepts``. Author names, and the names of
    the first three concepts, are joined with ``"; "``.

    Args:
        works: Iterable of work dicts (as returned in the API ``results`` list).
        filename: Path of the Excel file to write.

    Returns:
        None. Writes ``filename`` and prints a one-line summary.
    """
    columns = ["ID", "Title", "Year", "Cited_by", "Authors", "Concepts"]
    rows = []
    for w in works:
        # Authors: collect display names, tolerating null/missing fields so a
        # single incomplete record cannot abort the whole export.
        authors = []
        for authorship in w.get("authorships") or []:
            author = (authorship or {}).get("author") or {}
            name = author.get("display_name")
            if name:
                authors.append(name)

        # Concepts: only the first three entries are considered.
        concepts = w.get("concepts") or []
        top_concepts = [
            c["display_name"]
            for c in concepts[:3]
            if c and c.get("display_name")
        ]

        rows.append({
            "ID": w.get("id"),
            "Title": w.get("title"),
            "Year": w.get("publication_year"),
            "Cited_by": w.get("cited_by_count"),
            "Authors": "; ".join(authors),
            "Concepts": "; ".join(top_concepts)
        })

    # Passing the columns explicitly keeps the header row even when `works`
    # is empty, so downstream readers can still look up the "Authors" column.
    df = pd.DataFrame(rows, columns=columns)
    df.to_excel(filename, index=False)
    print(f"Saved {len(df)} records to {filename}")

if __name__ == "__main__":
    # Download up to 2400 works for concept C2522767166, then export them
    # to the spreadsheet used by the rest of the project.
    works = fetch_works_by_concept(concept_id="C2522767166", max_works=2400)
    works_to_excel(works, filename="Project_1_dataset.xlsx")

Saved 2400 records to Project_1_dataset.xlsx


In [7]:
import pandas as pd
import networkx as nx
from itertools import combinations

# Load the dataset from the Excel file
df = pd.read_excel("Project_1_dataset.xlsx")

# Build an undirected co-authorship graph; edge weight = number of shared papers.
# NOTE(review): nodes are keyed by display name, so distinct people sharing a
# name are merged — confirm this is acceptable for the analysis.
G = nx.Graph()

# Empty author cells are read back from Excel as NaN; dropping them prevents
# str(NaN) from creating a bogus author node called "nan".
for cell in df["Authors"].dropna():
    # Split on ";", strip surrounding whitespace, and de-duplicate within the
    # paper (order preserved) so a repeated name cannot yield a self-loop or
    # count the same pair twice.
    authors = list(dict.fromkeys(
        name.strip() for name in str(cell).split(";") if name.strip()
    ))

    # Add a node for every author (keeps single-author papers in the graph)
    G.add_nodes_from(authors)

    # Add/strengthen a co-authorship edge for every pair on this paper
    for a, b in combinations(authors, 2):
        if G.has_edge(a, b):
            G[a][b]["weight"] += 1
        else:
            G.add_edge(a, b, weight=1)

# Results
print("Số nodes (tác giả):", G.number_of_nodes())
print("Số edges (cộng tác):", G.number_of_edges())


Số nodes (tác giả): 9648
Số edges (cộng tác): 136197


In [5]:
# GEXF keeps node/edge attributes and can be opened directly in Gephi.
nx.write_gexf(G, "Project_1_coauthor_network.gexf")
# data=["weight"] writes the plain weight as the third column ("a,b,3");
# the default data=True would write the attribute dict repr ({'weight': 3}).
# NOTE(review): names are not quoted, so an author name containing "," would
# still shift columns — verify before parsing this file as strict CSV.
nx.write_edgelist(G, "Project_1_coauthor_edges.csv", delimiter=",", data=["weight"])
