#### Step 1 导入相关包

In [1]:
import pandas as pd
import requests
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import openpyxl
import string
import pickle
import json
import os
import time

#### Step 2 加载数据

处理第四个文件

In [6]:
# 打开文件并读取数据
file_index = 4
with open(f'../output/dblp_parsed_{file_index}.json', 'r',encoding='utf-8', errors='ignore') as file:
    pre_data = json.load(file)

In [7]:
len(pre_data)

1000000

In [3]:
if not os.path.exists(f'../openalex_connect_result/dblp_parsed_{file_index}'):
    os.makedirs(f'../openalex_connect_result/dblp_parsed_{file_index}')

#### Step 3 过滤数据

In [4]:
def filter_data(data):
    filtered_data = []
    for item in data:
        if 'ee' in item:
            if isinstance(item['ee'], list):
                for ee in item['ee']:
                    if 'doi' in ee:
                        filtered_data.append(item)
                        break
            elif 'doi' in item['ee']:
                filtered_data.append(item)
        else:
            pass
    return filtered_data

In [5]:
filtered_data = filter_data(pre_data)
print(f'原数据长度： {len(pre_data)}')
print('数据保留率：',len(filtered_data)/len(pre_data))

原数据长度： 1000000
数据保留率： 0.768572


#### Step 4 解析数据

In [6]:
def get_paper_message(url):
    base_url = f"https://api.openalex.org/works/{url}?mailto=1146904171@qq.com"
    # 假设代理服务器地址是 127.0.0.1，端口是 7890
    proxies = {
                "http": "socks5://127.0.0.1:7890",
                "https": "socks5://127.0.0.1:7890",
                }
    try:
        response = requests.get(base_url, timeout=5, proxies=proxies)
        # 检查响应状态码
        if response.status_code == 200:
            try:
                response_json = response.json()
                return response_json
            except Exception as e:
                return None
        elif response.status_code == 404:
            return -1
        else:
            print(f"请求失败，状态码: {response.status_code}")
            return None
    except Exception as e:
        print(f"连接错误: {e}")
        return None

In [7]:
sub_file_index = 1
# 检查已有的文件，找出最大的 sub_file_index
result_dir = f'../openalex_connect_result/dblp_parsed_{file_index}'
existing_files = os.listdir(result_dir)
existing_indexes = []
for file in existing_files:
    if file.startswith('output_') and file.endswith('.json'):
        try:
            index = int(file.split('_')[1].split('.')[0])
            existing_indexes.append(index)
        except ValueError:
            continue
if existing_indexes:
    sub_file_index = max(existing_indexes) + 1

print(f'当前已经处理的文件有： {existing_files}')
print(f'当前的sub_file_index: {sub_file_index}')

当前已经处理的文件有： []
当前的sub_file_index: 1


In [10]:
import random

# 生成 16 个随机 QQ 号列表
mailto_list = []
for _ in range(16):
    qq_number = random.randint(100000, 2999999999)
    mailto_list.append(f"{qq_number}@qq.com")


mailto_index = 0

print(mailto_list)

['519021793@qq.com', '1727255455@qq.com', '1542112286@qq.com', '253223137@qq.com', '2375786039@qq.com', '1875544845@qq.com', '1145630659@qq.com', '2138464596@qq.com', '1024543545@qq.com', '765614139@qq.com', '90828880@qq.com', '2300679545@qq.com', '1652896108@qq.com', '2772069501@qq.com', '613666958@qq.com', '2665194398@qq.com']


#### 批量抓

In [11]:
def batch_get_paper_messages(dois,mailto):
    
    pipe_separated_dois = "|".join(dois)
    base_url = f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}&per-page=50&mailto={mailto}"
    proxies = {
                "http": "socks5://127.0.0.1:7890",
                "https": "socks5://127.0.0.1:7890",
                }
    try:
        response = requests.get(base_url, timeout=5, proxies=proxies)
        # 检查响应状态码
        if response.status_code == 200:
            try:
                response_json = response.json().get("results", [])
                return response_json
            except Exception as e:
                return None
        else:
            print(f"请求失败，状态码: {response.status_code}")
            return []
    except Exception as e:
        print(f"连接错误: {e}")
        return []

In [12]:
filtered_data[0]

{'type': 'inproceedings',
 'mdate': '2022-04-09',
 'key': 'conf/ecai/0001S20',
 'authors': ['Shakil M. Khan 0001', 'Mikhail Soutchanski'],
 'titles': ['Necessary and Sufficient Conditions for Actual Root Causes.'],
 'pages': ['800-808'],
 'year': ['2020'],
 'booktitle': ['ECAI'],
 'ee': ['https://doi.org/10.3233/FAIA200169'],
 'crossref': ['conf/ecai/2020'],
 'url': ['db/conf/ecai/ecai2020.html#0001S20']}

#### Step 5 开抓数据

In [14]:
mailto_list[mailto_index]

'519021793@qq.com'

In [17]:
start_index = 0
count = 0
linked_data = []

# 从 start_index 开始处理数据
batch_size = 50  # 每批次处理的数量
doi_batch = []
for index, item in enumerate(tqdm(filtered_data)):
    if index < start_index:
        continue
    # 检查 item 中是否包含 'ee' 字段且 'ee' 列表不为空
    if 'ee' in item and len(item['ee']) > 0:
        doi = str(item['ee'][0])
        doi_batch.append(doi)
        if len(doi_batch) >= batch_size:
            mailto = mailto_list[mailto_index]
            paper_infos = batch_get_paper_messages(doi_batch,mailto)
            if paper_infos == []:
                print(f"解析出错")
            else:
                linked_data.extend(paper_infos)
                if len(paper_infos) < 5:
                    print('这次返回地有点少,可能出错了',len(paper_infos))
            doi_batch = []  # 清空 doi_batch 列表
            time.sleep(0.1)
    # 每处理一个元素，计数器加 1
    count += 1
    if count >= 50000:  # 当计数器达到 50000 时, 就保存，设置太大不行，这个colab还是不够稳
        file_name = f'../openalex_connect_result/dblp_parsed_{file_index}/output_{sub_file_index}.json'  # 生成新的文件名
        with open(file_name, 'w') as f:  # 将 linked_data 列表的内容输出到本地文件
            json.dump(linked_data, f)
            print(f'output_{sub_file_index}已成功保存了')
        linked_data = []  # 清空 linked_data 列表
        count = 0  # 计数器重置为 0
        sub_file_index += 1  # 文件索引加 1
        
        # 换个邮箱
        mailto_index = (mailto_index + 1) % len(mailto_list)

  1%|          | 4350/768572 [02:15<7:25:21, 28.60it/s]

解析出错


  1%|          | 4400/768572 [02:16<6:50:52, 31.00it/s]

解析出错


  1%|          | 4450/768572 [02:18<6:30:29, 32.61it/s]

解析出错


  1%|          | 4500/768572 [02:19<6:20:12, 33.49it/s]

解析出错


  1%|          | 4550/768572 [02:20<6:05:55, 34.80it/s]

解析出错


  1%|          | 4600/768572 [02:22<6:01:51, 35.19it/s]

解析出错


  1%|          | 4650/768572 [02:23<5:53:58, 35.97it/s]

解析出错


  1%|          | 4700/768572 [02:24<5:50:47, 36.29it/s]

解析出错


  1%|          | 9400/768572 [05:35<8:00:03, 26.36it/s] 

解析出错


  1%|          | 9450/768572 [05:36<7:17:36, 28.91it/s]

解析出错


  4%|▎         | 27700/768572 [18:11<7:17:22, 28.23it/s] 

这次返回地有点少,可能出错了 3


  4%|▎         | 27750/768572 [18:12<7:08:32, 28.81it/s]

这次返回地有点少,可能出错了 4


  4%|▎         | 27800/768572 [18:14<6:40:55, 30.79it/s]

这次返回地有点少,可能出错了 3


  4%|▎         | 27900/768572 [18:17<6:32:20, 31.46it/s]

这次返回地有点少,可能出错了 4


  4%|▎         | 28450/768572 [18:33<5:42:02, 36.06it/s]

这次返回地有点少,可能出错了 4


  4%|▍         | 28900/768572 [18:47<6:17:07, 32.69it/s]

这次返回地有点少,可能出错了 4


  4%|▍         | 28950/768572 [18:48<6:04:40, 33.80it/s]

这次返回地有点少,可能出错了 3


  4%|▍         | 29000/768572 [18:49<5:54:42, 34.75it/s]

这次返回地有点少,可能出错了 4


  4%|▍         | 29050/768572 [18:51<5:46:55, 35.53it/s]

这次返回地有点少,可能出错了 3


  4%|▍         | 29250/768572 [18:57<5:57:16, 34.49it/s]

这次返回地有点少,可能出错了 4


  4%|▍         | 29300/768572 [18:58<5:42:54, 35.93it/s]

这次返回地有点少,可能出错了 3


  4%|▍         | 29450/768572 [19:02<6:01:13, 34.10it/s]

这次返回地有点少,可能出错了 4


  4%|▍         | 32400/768572 [20:58<6:09:53, 33.17it/s]

这次返回地有点少,可能出错了 3


  6%|▋         | 49950/768572 [33:27<8:39:12, 23.07it/s] 

output_1已成功保存了


  9%|▉         | 72300/768572 [49:39<7:32:26, 25.65it/s] 

解析出错


 10%|█         | 79500/768572 [54:23<6:39:26, 28.75it/s]

解析出错


 10%|█         | 79550/768572 [54:24<6:01:12, 31.79it/s]

解析出错


 10%|█         | 79600/768572 [54:25<5:43:32, 33.42it/s]

解析出错


 10%|█         | 79650/768572 [54:27<5:38:43, 33.90it/s]

解析出错


 10%|█         | 79700/768572 [54:28<5:38:26, 33.92it/s]

解析出错


 10%|█         | 79750/768572 [54:29<5:31:42, 34.61it/s]

解析出错


 10%|█         | 79800/768572 [54:31<5:19:21, 35.95it/s]

解析出错


 10%|█         | 79850/768572 [54:32<5:16:26, 36.27it/s]

解析出错


 10%|█         | 79900/768572 [54:33<5:14:38, 36.48it/s]

解析出错


 10%|█         | 79950/768572 [54:35<5:49:20, 32.85it/s]

解析出错


 10%|█         | 80000/768572 [54:37<5:34:54, 34.27it/s]

解析出错


 10%|█         | 80050/768572 [54:38<5:30:30, 34.72it/s]

解析出错


 10%|█         | 80100/768572 [54:39<5:22:15, 35.61it/s]

解析出错


 10%|█         | 80150/768572 [54:41<5:24:49, 35.32it/s]

解析出错


 10%|█         | 80200/768572 [54:42<5:10:21, 36.97it/s]

解析出错


 10%|█         | 80250/768572 [54:43<5:03:41, 37.78it/s]

解析出错


 10%|█         | 80300/768572 [54:44<5:00:14, 38.21it/s]

解析出错


 10%|█         | 80350/768572 [54:46<4:59:57, 38.24it/s]

解析出错


 10%|█         | 80400/768572 [54:47<4:54:04, 39.00it/s]

解析出错


 10%|█         | 80450/768572 [54:49<5:12:41, 36.68it/s]

解析出错


 10%|█         | 80500/768572 [54:50<5:07:26, 37.30it/s]

解析出错


 10%|█         | 80550/768572 [54:51<5:13:58, 36.52it/s]

解析出错


 12%|█▏        | 92350/768572 [1:02:56<6:43:38, 27.92it/s]

解析出错


 12%|█▏        | 92400/768572 [1:02:58<6:19:12, 29.72it/s]

解析出错


 12%|█▏        | 92450/768572 [1:02:59<5:54:40, 31.77it/s]

解析出错


 12%|█▏        | 92500/768572 [1:03:01<5:43:53, 32.77it/s]

解析出错


 12%|█▏        | 92550/768572 [1:03:02<5:46:59, 32.47it/s]

解析出错


 12%|█▏        | 92600/768572 [1:03:04<5:44:55, 32.66it/s]

解析出错


 12%|█▏        | 92650/768572 [1:03:05<5:46:14, 32.54it/s]

解析出错


 12%|█▏        | 92700/768572 [1:03:07<5:27:00, 34.45it/s]

解析出错


 12%|█▏        | 92750/768572 [1:03:08<5:10:16, 36.30it/s]

解析出错


 13%|█▎        | 99200/768572 [1:07:45<11:17:07, 16.48it/s]

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 13%|█▎        | 99950/768572 [1:08:29<7:53:02, 23.56it/s] 

output_2已成功保存了


 14%|█▍        | 107200/768572 [1:14:00<11:58:45, 15.34it/s]

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 17%|█▋        | 128450/768572 [1:29:03<5:34:11, 31.92it/s] 

这次返回地有点少,可能出错了 4


 17%|█▋        | 129750/768572 [1:29:46<5:33:20, 31.94it/s]

这次返回地有点少,可能出错了 4


 17%|█▋        | 130550/768572 [1:30:13<5:34:45, 31.77it/s]

这次返回地有点少,可能出错了 3


 17%|█▋        | 130850/768572 [1:30:23<6:02:48, 29.30it/s]

这次返回地有点少,可能出错了 4


 18%|█▊        | 139650/768572 [1:36:36<6:38:29, 26.30it/s]

这次返回地有点少,可能出错了 2


 18%|█▊        | 139700/768572 [1:36:37<6:15:07, 27.94it/s]

解析出错


 18%|█▊        | 139750/768572 [1:36:39<5:54:45, 29.54it/s]

解析出错


 18%|█▊        | 139800/768572 [1:36:40<5:39:09, 30.90it/s]

这次返回地有点少,可能出错了 1


 18%|█▊        | 139850/768572 [1:36:42<5:30:13, 31.73it/s]

解析出错


 18%|█▊        | 139900/768572 [1:36:43<5:07:47, 34.04it/s]

解析出错


 18%|█▊        | 139950/768572 [1:36:44<5:15:15, 33.23it/s]

这次返回地有点少,可能出错了 2


 18%|█▊        | 140000/768572 [1:36:46<5:16:29, 33.10it/s]

这次返回地有点少,可能出错了 1


 18%|█▊        | 140050/768572 [1:36:47<5:05:57, 34.24it/s]

这次返回地有点少,可能出错了 3


 18%|█▊        | 140100/768572 [1:36:49<4:57:58, 35.15it/s]

这次返回地有点少,可能出错了 1


 18%|█▊        | 141050/768572 [1:37:21<5:36:49, 31.05it/s]

请求失败，状态码: 400
解析出错


 18%|█▊        | 141100/768572 [1:37:23<5:11:17, 33.59it/s]

请求失败，状态码: 400
解析出错


 18%|█▊        | 141200/768572 [1:37:25<4:48:51, 36.20it/s]

请求失败，状态码: 400
解析出错


 18%|█▊        | 141250/768572 [1:37:26<4:37:57, 37.61it/s]

请求失败，状态码: 400
解析出错


 18%|█▊        | 141300/768572 [1:37:28<4:25:28, 39.38it/s]

请求失败，状态码: 400
解析出错


 18%|█▊        | 141350/768572 [1:37:29<4:17:27, 40.60it/s]

请求失败，状态码: 400
解析出错


 19%|█▉        | 147300/768572 [1:41:42<6:56:19, 24.87it/s] 

解析出错


 19%|█▉        | 147350/768572 [1:41:43<6:19:14, 27.30it/s]

解析出错


 19%|█▉        | 147400/768572 [1:41:45<6:03:52, 28.45it/s]

解析出错


 20%|█▉        | 149950/768572 [1:43:50<7:19:08, 23.48it/s] 

output_3已成功保存了


 20%|█▉        | 153400/768572 [1:46:20<6:16:05, 27.26it/s] 

解析出错


 20%|█▉        | 153450/768572 [1:46:21<5:59:12, 28.54it/s]

解析出错


 20%|█▉        | 153500/768572 [1:46:23<5:49:55, 29.30it/s]

解析出错


 20%|█▉        | 153700/768572 [1:46:35<10:13:55, 16.69it/s]

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 26%|██▌       | 199950/768572 [2:20:52<6:29:41, 24.32it/s] 

output_4已成功保存了


 27%|██▋       | 207400/768572 [2:26:22<6:17:44, 24.76it/s] 

这次返回地有点少,可能出错了 1


 29%|██▉       | 223150/768572 [2:37:28<8:09:50, 18.56it/s] 

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 29%|██▉       | 225950/768572 [2:39:15<5:05:28, 29.60it/s]

这次返回地有点少,可能出错了 2


 29%|██▉       | 226150/768572 [2:39:21<4:38:16, 32.49it/s]

这次返回地有点少,可能出错了 4


 29%|██▉       | 226550/768572 [2:39:37<6:55:43, 21.73it/s]

这次返回地有点少,可能出错了 4


 30%|██▉       | 227500/768572 [2:40:08<4:21:47, 34.45it/s]

这次返回地有点少,可能出错了 4


 30%|██▉       | 228750/768572 [2:40:54<5:24:49, 27.70it/s]

解析出错


 30%|██▉       | 228800/768572 [2:40:55<5:01:04, 29.88it/s]

解析出错


 30%|██▉       | 228850/768572 [2:40:56<4:39:18, 32.21it/s]

解析出错


 30%|██▉       | 228900/768572 [2:40:58<4:34:23, 32.78it/s]

解析出错


 30%|██▉       | 228950/768572 [2:40:59<4:20:41, 34.50it/s]

解析出错


 30%|██▉       | 229000/768572 [2:41:00<4:25:44, 33.84it/s]

解析出错


 30%|██▉       | 229050/768572 [2:41:02<4:12:11, 35.66it/s]

解析出错


 30%|██▉       | 229100/768572 [2:41:03<4:09:58, 35.97it/s]

解析出错


 30%|██▉       | 229150/768572 [2:41:04<4:03:16, 36.96it/s]

解析出错


 30%|██▉       | 229200/768572 [2:41:06<3:54:30, 38.33it/s]

解析出错


 30%|██▉       | 229250/768572 [2:41:07<3:54:20, 38.36it/s]

解析出错


 30%|██▉       | 229300/768572 [2:41:08<4:02:51, 37.01it/s]

解析出错


 30%|██▉       | 229350/768572 [2:41:10<3:58:23, 37.70it/s]

解析出错


 30%|██▉       | 229400/768572 [2:41:11<4:08:30, 36.16it/s]

解析出错


 30%|███       | 230600/768572 [2:42:00<4:58:25, 30.04it/s]

这次返回地有点少,可能出错了 4


 30%|███       | 230650/768572 [2:42:01<4:45:55, 31.36it/s]

这次返回地有点少,可能出错了 4


 30%|███       | 230800/768572 [2:42:07<4:57:01, 30.17it/s]

这次返回地有点少,可能出错了 4


 30%|███       | 230850/768572 [2:42:08<5:04:27, 29.44it/s]

这次返回地有点少,可能出错了 3


 32%|███▏      | 243450/768572 [2:51:03<5:47:08, 25.21it/s]

解析出错


 32%|███▏      | 247150/768572 [2:53:38<5:12:13, 27.83it/s]

解析出错


 33%|███▎      | 249950/768572 [2:55:35<5:00:35, 28.76it/s]

output_5已成功保存了


 35%|███▍      | 266150/768572 [3:07:18<5:18:09, 26.32it/s] 

解析出错


 35%|███▍      | 266200/768572 [3:07:20<4:57:44, 28.12it/s]

解析出错


 35%|███▍      | 266250/768572 [3:07:21<4:39:32, 29.95it/s]

解析出错


 36%|███▌      | 272900/768572 [3:11:41<4:26:12, 31.03it/s]

这次返回地有点少,可能出错了 4


 36%|███▌      | 273550/768572 [3:12:02<4:47:20, 28.71it/s]

这次返回地有点少,可能出错了 4


 36%|███▌      | 273600/768572 [3:12:03<4:22:50, 31.39it/s]

这次返回地有点少,可能出错了 4


 36%|███▌      | 276350/768572 [3:14:03<5:15:14, 26.02it/s]

解析出错


 37%|███▋      | 282100/768572 [3:18:20<5:34:16, 24.25it/s]

解析出错


 37%|███▋      | 285400/768572 [3:20:49<7:58:55, 16.81it/s]

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 38%|███▊      | 295350/768572 [3:28:06<4:59:06, 26.37it/s]

这次返回地有点少,可能出错了 3


 38%|███▊      | 295450/768572 [3:28:09<4:57:00, 26.55it/s]

这次返回地有点少,可能出错了 3


 38%|███▊      | 295750/768572 [3:28:20<4:43:52, 27.76it/s]

这次返回地有点少,可能出错了 3


 38%|███▊      | 295850/768572 [3:28:23<4:26:57, 29.51it/s]

这次返回地有点少,可能出错了 4


 39%|███▊      | 296050/768572 [3:28:29<4:12:51, 31.15it/s]

这次返回地有点少,可能出错了 2


 39%|███▊      | 296150/768572 [3:28:33<4:18:28, 30.46it/s]

这次返回地有点少,可能出错了 3


 39%|███▊      | 296200/768572 [3:28:34<4:11:20, 31.32it/s]

这次返回地有点少,可能出错了 4


 39%|███▉      | 299950/768572 [3:31:36<5:14:12, 24.86it/s]

output_6已成功保存了


 39%|███▉      | 303500/768572 [3:34:21<5:13:43, 24.71it/s] 

解析出错


 40%|████      | 308200/768572 [3:37:47<4:22:57, 29.18it/s]

这次返回地有点少,可能出错了 4


 42%|████▏     | 324500/768572 [3:50:06<3:47:37, 32.52it/s] 

这次返回地有点少,可能出错了 4


 44%|████▍     | 337750/768572 [3:59:08<4:48:22, 24.90it/s]

解析出错


 44%|████▍     | 337800/768572 [3:59:10<4:33:53, 26.21it/s]

解析出错


 44%|████▍     | 337850/768572 [3:59:11<4:09:16, 28.80it/s]

解析出错


 44%|████▍     | 339050/768572 [4:00:01<5:13:28, 22.84it/s]

解析出错


 44%|████▍     | 339100/768572 [4:00:03<4:38:02, 25.74it/s]

解析出错


 45%|████▍     | 343500/768572 [4:03:11<4:26:45, 26.56it/s]

解析出错


 45%|████▍     | 343550/768572 [4:03:12<4:02:08, 29.26it/s]

解析出错


 45%|████▍     | 343600/768572 [4:03:13<3:49:43, 30.83it/s]

解析出错


 45%|████▍     | 343650/768572 [4:03:15<3:52:25, 30.47it/s]

解析出错


 45%|████▍     | 343700/768572 [4:03:16<3:38:36, 32.39it/s]

解析出错


 45%|████▍     | 343750/768572 [4:03:18<3:33:04, 33.23it/s]

解析出错


 46%|████▌     | 349950/768572 [4:08:07<5:08:02, 22.65it/s]

output_7已成功保存了


 49%|████▊     | 373550/768572 [4:24:50<4:20:58, 25.23it/s] 

解析出错


 49%|████▊     | 373600/768572 [4:24:52<3:52:17, 28.34it/s]

解析出错


 49%|████▊     | 373650/768572 [4:24:53<3:36:25, 30.41it/s]

解析出错


 49%|████▊     | 373700/768572 [4:24:54<3:29:11, 31.46it/s]

解析出错


 49%|████▊     | 373750/768572 [4:24:56<3:23:15, 32.37it/s]

解析出错


 49%|████▊     | 373800/768572 [4:24:57<3:08:39, 34.88it/s]

解析出错


 49%|████▊     | 373850/768572 [4:24:58<3:06:38, 35.25it/s]

解析出错


 49%|████▊     | 373900/768572 [4:25:00<3:00:58, 36.35it/s]

解析出错


 49%|████▊     | 373950/768572 [4:25:01<3:08:51, 34.82it/s]

解析出错


 49%|████▉     | 379500/768572 [4:29:00<6:34:12, 16.45it/s]

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 50%|█████     | 384300/768572 [4:32:42<3:56:49, 27.04it/s]

解析出错


 52%|█████▏    | 396850/768572 [4:41:41<3:32:09, 29.20it/s]

这次返回地有点少,可能出错了 4


 52%|█████▏    | 399950/768572 [4:44:09<4:28:04, 22.92it/s]

output_8已成功保存了


 57%|█████▋    | 437100/768572 [5:10:20<3:35:48, 25.60it/s] 

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437150/768572 [5:10:21<3:09:33, 29.14it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437250/768572 [5:10:24<2:59:51, 30.70it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437300/768572 [5:10:25<2:43:40, 33.73it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437350/768572 [5:10:26<2:32:30, 36.20it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437400/768572 [5:10:27<2:28:21, 37.20it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437450/768572 [5:10:29<2:31:25, 36.44it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437500/768572 [5:10:30<2:27:03, 37.52it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437550/768572 [5:10:31<2:22:18, 38.77it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437600/768572 [5:10:33<2:18:39, 39.78it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437650/768572 [5:10:34<2:17:26, 40.13it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437700/768572 [5:10:35<2:12:36, 41.58it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437800/768572 [5:10:38<2:37:38, 34.97it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437850/768572 [5:10:39<2:29:05, 36.97it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437900/768572 [5:10:41<2:23:08, 38.50it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 437950/768572 [5:10:42<2:18:04, 39.91it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438000/768572 [5:10:43<2:15:37, 40.62it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438050/768572 [5:10:44<2:13:24, 41.29it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438100/768572 [5:10:46<2:24:55, 38.01it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438150/768572 [5:10:47<2:23:29, 38.38it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438200/768572 [5:10:48<2:20:53, 39.08it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438250/768572 [5:10:49<2:19:08, 39.57it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438300/768572 [5:10:51<2:16:46, 40.24it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438350/768572 [5:10:52<2:18:15, 39.81it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438400/768572 [5:10:53<2:19:18, 39.50it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438450/768572 [5:10:54<2:16:19, 40.36it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438550/768572 [5:10:58<2:32:53, 35.98it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438600/768572 [5:10:59<2:26:53, 37.44it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438650/768572 [5:11:00<2:22:47, 38.51it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438700/768572 [5:11:01<2:21:56, 38.74it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438750/768572 [5:11:02<2:17:00, 40.12it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438800/768572 [5:11:04<2:13:11, 41.26it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438850/768572 [5:11:05<2:12:06, 41.60it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 438900/768572 [5:11:06<2:12:22, 41.51it/s]

请求失败，状态码: 400
解析出错


 57%|█████▋    | 440250/768572 [5:12:03<3:18:07, 27.62it/s]

解析出错


 59%|█████▊    | 449950/768572 [5:19:31<3:48:53, 23.20it/s]

output_9已成功保存了


 63%|██████▎   | 482050/768572 [5:43:18<5:13:17, 15.24it/s] 

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 63%|██████▎   | 482100/768572 [5:43:24<6:25:49, 12.37it/s]

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 63%|██████▎   | 482150/768572 [5:43:29<7:16:56, 10.93it/s]

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 65%|██████▌   | 499950/768572 [5:57:23<3:07:57, 23.82it/s]

output_10已成功保存了


 72%|███████▏  | 549950/768572 [6:36:15<2:42:02, 22.49it/s] 

output_11已成功保存了


 78%|███████▊  | 599950/768572 [7:13:57<2:05:14, 22.44it/s] 

output_12已成功保存了


 85%|████████▍ | 650000/768572 [7:50:37<5:47:31,  5.69it/s]

output_13已成功保存了


 90%|█████████ | 692700/768572 [8:21:27<49:20, 25.63it/s]  

解析出错


 90%|█████████ | 692750/768572 [8:21:28<46:57, 26.91it/s]

解析出错


 90%|█████████ | 692800/768572 [8:21:30<42:51, 29.46it/s]

解析出错


 90%|█████████ | 692850/768572 [8:21:31<42:40, 29.57it/s]

解析出错


 90%|█████████ | 692900/768572 [8:21:33<39:07, 32.23it/s]

这次返回地有点少,可能出错了 3


 91%|█████████ | 699950/768572 [8:27:01<52:07, 21.94it/s]  

output_14已成功保存了


 93%|█████████▎| 716000/768572 [8:39:03<54:29, 16.08it/s]  

连接错误: SOCKSHTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out. (read timeout=5)
解析出错


 98%|█████████▊| 749950/768572 [9:03:33<12:56, 23.97it/s]

output_15已成功保存了


100%|██████████| 768572/768572 [9:17:06<00:00, 22.99it/s]


#### 处理剩余的

In [18]:
sub_file_index

16

In [19]:
# 保存最后一批数据
if linked_data:
    file_name = f'../openalex_connect_result/dblp_parsed_{file_index}/output_{sub_file_index}.json'  # 生成新的文件名
    with open(file_name, 'w') as f:  # 将 linked_data 列表的内容输出到本地文件
        json.dump(linked_data, f)
    print(f'output_{sub_file_index}已成功保存了')

output_16已成功保存了
