#### Step 1 导入相关包

In [1]:
from tqdm import tqdm
from collections import defaultdict,Counter
import pickle
import numpy as np
from cso_classifier import CSOClassifier

import re

#### Step 2 加载数据

In [2]:
with open('./data/filter_ai_papers.pkl', 'rb') as f:
    filter_ai_papers = pickle.load(f)

In [3]:
with open('./data/filter_ai_papers_0312.pkl', 'rb') as f:
    full_filter_ai_papers = pickle.load(f)

In [4]:
# 计算差集（找出新增的论文）
new_papers = {paper_id: paper_info for paper_id, paper_info in full_filter_ai_papers.items() if paper_id not in filter_ai_papers}

In [7]:
# 处理新增的论文，转换为所需格式
new_papers_title_abstract = {}
for paper_id, paper_info in tqdm(new_papers.items()):
    new_papers_title_abstract[paper_id] = {
        'title': paper_info.get('title', '').replace('<i>', '').replace('</i>', ''),  # 去除HTML标签
        'abstract': paper_info.get('abstract', ''),
    }

100%|██████████| 321976/321976 [00:00<00:00, 1003034.97it/s]


In [3]:
# 转换为所需格式
papers_title_abstract = {}
for paper_id, paper_info in tqdm(filter_ai_papers.items()):
    papers_title_abstract[paper_id] = {
        'title': paper_info.get('title', '').replace('<i>', '').replace('</i>', ''),  # 去除HTML标签
        'abstract': paper_info.get('abstract', ''),  
    }

100%|██████████| 1055544/1055544 [00:00<00:00, 1090051.21it/s]


#### Step 3 识别论文

In [9]:
# 初始化分类器
cc = CSOClassifier(workers=1, modules="both", enhancement="first", explanation=True,fast_classification=True)

In [12]:
def process_in_batches(papers, batch_size=20, save_interval=100000, save_path='./data/cso_result.pkl'):
    # 获取论文的ID列表
    paper_ids = list(papers.keys())
    total_papers = len(paper_ids)
    results = {}
    papers_processed = 0  # 记录已处理的论文数量

    # 按批次处理论文
    for i in tqdm(range(0, total_papers, batch_size)):
        # 获取当前批次的论文ID
        batch_ids = paper_ids[i:i + batch_size]
        batch_papers = {paper_id: papers[paper_id] for paper_id in batch_ids}

        # 调用 batch_run 方法处理当前批次
        batch_result = cc.batch_run(batch_papers)
        results.update(batch_result)  # 将结果合并到总结果中

        # 更新已处理的论文数量
        papers_processed += len(batch_ids)

        # 每处理一定数量的论文，保存结果到本地
        if papers_processed >= save_interval:
            with open(save_path, "wb") as f:
                pickle.dump(results, f)
            print(f"Saved results to {save_path} after processing {papers_processed} papers.")
            papers_processed = 0  # 重置计数器

    # 保存最终结果
    if results:
        with open(save_path, "wb") as f:
            pickle.dump(results, f)
        print(f"Final results saved to {save_path}.")

    return results

In [13]:
# 处理论文数据，分批次运行
batch_size = 20
save_interval = 100000
save_path = './data/cso_result.pkl'  # 保存路径
results = process_in_batches(papers_title_abstract, batch_size=batch_size, save_interval=save_interval, save_path=save_path)

# 打印结果
print("处理完了")

  9%|▉         | 5000/52778 [7:07:41<71:19:16,  5.37s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 19%|█▉        | 10000/52778 [14:18:00<66:39:26,  5.61s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 28%|██▊       | 15000/52778 [21:34:42<59:43:07,  5.69s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 38%|███▊      | 20000/52778 [28:44:36<58:25:43,  6.42s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 47%|████▋     | 25000/52778 [35:51:58<51:39:05,  6.69s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 57%|█████▋    | 30000/52778 [42:59:39<46:44:09,  7.39s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 66%|██████▋   | 35000/52778 [50:27:05<34:11:56,  6.93s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 76%|███████▌  | 40000/52778 [57:52:00<30:08:44,  8.49s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 85%|████████▌ | 45000/52778 [65:36:34<18:00:49,  8.34s/it]

Saved results to ./data/cso_result.pkl after processing 100000 papers.


 95%|█████████▍| 50000/52778 [73:25:17<6:55:00,  8.96s/it] 

Saved results to ./data/cso_result.pkl after processing 100000 papers.


100%|██████████| 52778/52778 [77:42:54<00:00,  5.30s/it]  


Final results saved to ./data/cso_result.pkl.
处理完了


#### Step 4 识别增补论文

In [16]:
def process_in_batches_add(papers, batch_size=20, save_interval=100000, save_path='./data/cso_result_add.pkl'):
    # 获取论文的ID列表
    paper_ids = list(papers.keys())
    total_papers = len(paper_ids)
    results = {}
    papers_processed = 0  # 记录已处理的论文数量

    # 按批次处理论文
    for i in tqdm(range(0, total_papers, batch_size)):
        # 获取当前批次的论文ID
        batch_ids = paper_ids[i:i + batch_size]
        batch_papers = {paper_id: papers[paper_id] for paper_id in batch_ids}

        try:
            # 调用 batch_run 方法处理当前批次
            batch_result = cc.batch_run(batch_papers)
            results.update(batch_result)  # 将结果合并到总结果中
        except Exception as e:
            # 捕获并记录错误信息
            print(f"Error processing batch starting at paper ID {batch_ids[0]}: {e}")
            continue  # 跳过当前批次，继续处理下一个批次

        # 更新已处理的论文数量
        papers_processed += len(batch_ids)

        # 每处理一定数量的论文，保存结果到本地
        if papers_processed >= save_interval:
            with open(save_path, "wb") as f:
                pickle.dump(results, f)
            print(f"Saved results to {save_path} after processing {papers_processed} papers.")
            papers_processed = 0  # 重置计数器

    # 保存最终结果
    if results:
        with open(save_path, "wb") as f:
            pickle.dump(results, f)
        print(f"Final results saved to {save_path}.")

    return results

In [17]:
# 处理论文数据，分批次运行
batch_size = 20
save_interval = 100000
save_path = './data/cso_result_add.pkl'  # 保存路径
results = process_in_batches_add(new_papers_title_abstract, batch_size=batch_size, save_interval=save_interval, save_path=save_path)

# 打印结果
print("处理完了")

 31%|███       | 5000/16099 [7:44:07<18:47:12,  6.09s/it]

Saved results to ./data/cso_result_add.pkl after processing 100000 papers.


 62%|██████▏   | 10000/16099 [15:36:56<11:12:57,  6.62s/it]

Saved results to ./data/cso_result_add.pkl after processing 100000 papers.


 93%|█████████▎| 15000/16099 [23:59:42<2:05:07,  6.83s/it] 

Saved results to ./data/cso_result_add.pkl after processing 100000 papers.


100%|██████████| 16099/16099 [25:50:05<00:00,  5.78s/it]  


Final results saved to ./data/cso_result_add.pkl.
处理完了


In [18]:
results

{'W4293022965': {'syntactic': ['classification models',
   'computer hardware',
   'neural networks',
   'demonstrations',
   'computing systems'],
  'semantic': ['classification methods',
   'computer hardware',
   'neural networks',
   'correlation analysis',
   'demonstrations',
   'random access memory',
   'computing systems'],
  'union': ['classification models',
   'classification methods',
   'computer hardware',
   'neural networks',
   'correlation analysis',
   'demonstrations',
   'random access memory',
   'computing systems'],
  'enhanced': ['computer systems',
   'computer science',
   'machine learning',
   'mathematics',
   'information systems',
   'education',
   'random access storage'],
  'explanation': {'demonstrations': ['demonstration'],
   'computing systems': ['computing systems'],
   'computer hardware': ['hardware',
    'calibrating hardware noise',
    'hardware noise',
    'hardware measurements',
    'calibrating hardware'],
   'neural networks': ['neural