#### Step 1 导入相关包

In [1]:
import pandas as pd 
import requests
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import openpyxl
import string
import pickle

#### Step 2 加载数据

In [2]:
# Load the pickled object from 'rest_paper_id.pickle' into `rest_paper_id`
# (the cell output below shows it is a list of ACL Anthology ID strings).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('rest_paper_id.pickle','rb') as file:
    rest_paper_id = pickle.load(file)

In [3]:
# Bare expression: the notebook displays the loaded ID list as the cell output.
rest_paper_id

['2020.acl-demos.1',
 '2020.acl-demos.10',
 '2020.acl-demos.11',
 '2020.acl-demos.12',
 '2020.acl-demos.13',
 '2020.acl-demos.14',
 '2020.acl-demos.15',
 '2020.acl-demos.16',
 '2020.acl-demos.17',
 '2020.acl-demos.18',
 '2020.acl-demos.19',
 '2020.acl-demos.2',
 '2020.acl-demos.20',
 '2020.acl-demos.21',
 '2020.acl-demos.22',
 '2020.acl-demos.23',
 '2020.acl-demos.24',
 '2020.acl-demos.25',
 '2020.acl-demos.26',
 '2020.acl-demos.27',
 '2020.acl-demos.28',
 '2020.acl-demos.29',
 '2020.acl-demos.3',
 '2020.acl-demos.30',
 '2020.acl-demos.31',
 '2020.acl-demos.32',
 '2020.acl-demos.33',
 '2020.acl-demos.34',
 '2020.acl-demos.35',
 '2020.acl-demos.36',
 '2020.acl-demos.37',
 '2020.acl-demos.38',
 '2020.acl-demos.39',
 '2020.acl-demos.4',
 '2020.acl-demos.40',
 '2020.acl-demos.41',
 '2020.acl-demos.42',
 '2020.acl-demos.43',
 '2020.acl-demos.5',
 '2020.acl-demos.6',
 '2020.acl-demos.7',
 '2020.acl-demos.8',
 '2020.acl-demos.9',
 '2020.acl-main.563',
 '2020.acl-main.69',
 '2020.acl-srw.1',
 

#### Step 3 初步检索

In [7]:
def get_reference_paper_id(paper_id, timeout=30):
    """Return the OpenAlex IDs of the works referenced by an ACL paper.

    The paper is looked up on OpenAlex through its ACL DOI
    (``10.18653/v1/<paper_id>``).

    Args:
        paper_id: ACL Anthology identifier, e.g. ``'2020.acl-demos.1'``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``referenced_works`` list from the OpenAlex response, or an
        empty list when the lookup fails for any reason (the error is
        printed, not raised, so a long crawl keeps going).
    """
    reference_lists = []
    base_url = f"https://api.openalex.org/works/https://doi.org/10.18653/v1/{paper_id}"

    try:
        # The request and the JSON decoding are inside the try block so that
        # a network failure or a non-JSON error page is reported like any
        # other lookup failure instead of aborting the whole run.
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()
        response_json = response.json()
        # `or []` guards against an explicit null so callers can always iterate.
        reference_lists = response_json['referenced_works'] or []
    except Exception as e:
        # Best-effort boundary: report and fall through to the empty list.
        print(f'reference出错了{e}')

    return reference_lists

In [14]:
def _affiliation_for_year(author_id, publication_year, mailto, timeout):
    """Fetch an author's affiliation from the OpenAlex ``people`` endpoint.

    Returns a ``(display_name, type)`` tuple for the first affiliation whose
    ``years`` list contains ``publication_year``. When no year matches, the
    last entry of the author's ``affiliations`` list is used instead
    (NOTE(review): the original comment calls this the *earliest*
    institution; that relies on the API's ordering -- confirm). Returns
    ``None`` when the author record lists no affiliations.
    """
    response = requests.get(
        f"https://api.openalex.org/people/{author_id}",
        params={"mailto": mailto},
        timeout=timeout,
    )
    response.raise_for_status()
    # Parse the body once and reuse it for both the year match and the fallback.
    affiliations = response.json().get("affiliations") or []
    if not affiliations:
        return None

    for affiliation in affiliations:
        if publication_year in affiliation['years']:
            chosen = affiliation
            break
    else:
        chosen = affiliations[-1]

    institution = chosen['institution']
    # Read both fields before returning so the caller never appends a name
    # without its matching type.
    return institution['display_name'], institution['type']


def get_paper_instution_author(reference_url_id, mailto="1146904101@qq.com", timeout=30):
    """Collect title, authors and institutions for one OpenAlex work.

    Args:
        reference_url_id: Work identifier as found in ``referenced_works``;
            it is appended to ``https://api.openalex.org/``.
        mailto: Contact e-mail sent as the ``mailto`` query parameter.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A tuple ``(title, author_lists, institution_lists, type_lists)``.
        ``title`` is ``None`` when the work could not be fetched or has no
        title. For each authorship the author's display name is recorded;
        the institution comes from the authorship's first listed institution
        or, when none is listed, from a per-author affiliation lookup. If
        that lookup yields nothing, no institution is recorded for the
        author, so the institution/type lists may be shorter than the
        author list. Errors are printed rather than raised.
    """
    title = None
    author_lists = []
    institution_lists = []
    type_lists = []
    base_url = f"https://api.openalex.org/{reference_url_id}"

    try:
        # `mailto` is passed via `params` so it is sent as a real query
        # string (`?mailto=...`) instead of being glued onto the path.
        response = requests.get(base_url, params={"mailto": mailto}, timeout=timeout)
        response.raise_for_status()
        response_json = response.json()
        title = response_json.get('title')
        publication_year = response_json.get('publication_year')

        for item in response_json['authorships']:
            author = item['author']
            author_lists.append(author['display_name'])

            institutions = item.get('institutions')
            if institutions:
                first = institutions[0]
                name, kind = first['display_name'], first['type']
                institution_lists.append(name)
                type_lists.append(kind)
                continue

            # No institution on the authorship: fall back to the author's
            # own affiliation history.
            author_id = author['id'].split('/')[-1]
            try:
                found = _affiliation_for_year(author_id, publication_year, mailto, timeout)
            except (requests.RequestException, ValueError, KeyError, TypeError, IndexError):
                # A failed lookup skips this author's institution only; the
                # remaining authors are still processed.
                continue
            if found is not None:
                institution_lists.append(found[0])
                type_lists.append(found[1])
    except Exception as e:
        # Best-effort boundary: report and return whatever was collected.
        print(f'institution出错了{e}')

    return title, author_lists, institution_lists, type_lists

#### Step 4 写入Excel

In [15]:
import openpyxl
from tqdm import tqdm

# Output workbook path and how many rows to write between checkpoint saves.
OUTPUT_PATH = '12345.xlsx'
SAVE_EVERY = 50

# Create a new Excel workbook and take its active sheet.
output_wb = openpyxl.Workbook()
output_ws = output_wb.active

# Write the header row.
output_ws.append(['Paper_ID', 'Title', 'Author_Name', 'Institution_Name', 'Institution_Type'])

# Number of data rows written so far.
counter = 0

try:
    # For every paper, fetch its references and write one row per reference.
    for paper_id in tqdm(rest_paper_id):

        reference_lists = get_reference_paper_id(paper_id)
        for reference_id in reference_lists:
            title, author_lists, institution_lists, type_lists = get_paper_instution_author(reference_id)

            # Lists are stored as their string representation in a single cell.
            output_ws.append([paper_id, title, str(author_lists), str(institution_lists), str(type_lists)])

            counter += 1

            # Checkpoint: save the workbook every SAVE_EVERY rows.
            if counter % SAVE_EVERY == 0:
                output_wb.save(OUTPUT_PATH)
                print('保存了一个')
finally:
    # Always save, even when the run is interrupted (e.g. KeyboardInterrupt)
    # or fails, so rows written since the last checkpoint are not lost.
    output_wb.save(OUTPUT_PATH)

  0%|          | 6/3161 [01:50<17:18:16, 19.75s/it]

保存了一个


  0%|          | 6/3161 [02:37<23:01:00, 26.26s/it]


KeyboardInterrupt: 

In [7]:
# Request the ACL Anthology search page, passing a paper URL as the `q` query value.
url = 'https://aclanthology.org/search/?q=https://aclanthology.org/A00-3001'
response = requests.get(url)

In [8]:
# Decode the response body as UTF-8, then display the page text as the cell output.
response.encoding = 'utf-8'
response.text

'<!doctype html><html lang=en-us><head><meta charset=utf-8><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><!--[if IEMobile]><meta http-equiv=cleartype content="on"><![endif]--><title>Search Results - ACL Anthology</title><meta name=generator content="Hugo 0.118.2"><link href=/aclicon.ico rel="shortcut icon" type=image/x-icon><link rel=stylesheet href=/css/main.min.b1d14a9a8f6bb9c608ca4de9aad72a6e06945119f97951f2908522dc220e6277.css media=screen><link rel=stylesheet href=https://use.fontawesome.com/releases/v5.7.2/css/all.css integrity=sha384-fnmOCqbTlWIlj8LyTjo7mOUStjsKC4pOpQbqyi7RrhN7udi9RwhKkMHpvLbHG9Sr crossorigin=anonymous><link rel=stylesheet href=/css/academicons.min.css><meta name=robots content="noindex, nofollow"></head><body><nav class="navbar navbar-expand-sm navbar-light bg-light bg-gradient-light shadow-sm py-0 mb-3 mb-md-4 mb-xl-5"><div id=navbar-container class=container><a class=navbar-brand href=/><img src=/images/