In [1]:
import pandas as pd
# Load the three 2018 paper tables in one pass; each file is read with the
# default pandas CSV settings, in the order seeds -> references -> citations.
seeds, references, citations = map(
    pd.read_csv,
    (
        'seeds_final_abstract_2018.csv',
        'references_final_abstract_2018.csv',
        'citations_final_abstract_2018.csv',
    ),
)

In [2]:
# Count how many seed abstracts are longer than 512 items.
import ast

# `final_abstract` holds a stringified Python literal per row (presumably a
# list of words -- TODO confirm against the preprocessing step). Parse it with
# ast.literal_eval instead of eval(): literal_eval only accepts literals, so a
# malicious or corrupted CSV cell cannot execute arbitrary code.
seeds['abstract_length'] = seeds['final_abstract'].apply(ast.literal_eval).apply(len)  # items per abstract

# NOTE(review): 512 matches the max_length used when tokenizing for BERT, but
# that limit is measured in tokenizer tokens, not in items of this list, so
# this count is only an approximation of how many abstracts get truncated.
count_over_512 = (seeds['abstract_length'] > 512).sum()
print(f"长度超过512的abstract数量: {count_over_512}")
print(len(seeds))

长度超过512的abstract数量: 6
47


In [2]:
# Show the BERT-ready abstract text of the seed row labelled 1.
seeds.loc[1, 'bert_final_abstract']

'study examine two important aspect late technology issue islamic finance relate artificial intelligence ( ai ) smart contract ai refer ability machine understand think learn similar way human indicate possibility use computer simulate human intelligence smart contract computer code run top block - chain contain set rule party smart contract agree interact main objective article evaluate operation ai smart contract make comparison operation ai smart contract article conclude ai smart contract huge impact future islamic finance industry'

1. 创建有向图引用网络

In [4]:
import networkx as nx
# Build the directed citation network.
G = nx.DiGraph()


def _paper_attrs(paper, color, kind):
    """Return the attribute dict stored on the graph node for one paper row."""
    return {
        'color': color,
        'type': kind,
        'title': paper['title'],
        'abstract': paper['processed_abstract'],
        'bert_abstract': paper['bert_final_abstract'],
    }


# Seed papers (red) are inserted first, so a paper that also shows up in the
# reference or citation tables keeps its seed attributes.
for _, paper in seeds.iterrows():
    G.add_node(paper['id'], **_paper_attrs(paper, 'red', 'seed'))

# Reference papers (blue): edge points from the seed to the paper it references.
for _, paper in references.iterrows():
    paper_id = paper['id']
    if paper_id not in G:
        G.add_node(paper_id, **_paper_attrs(paper, 'blue', 'reference'))
    G.add_edge(paper['seed_paper_id'], paper_id)

# Citing papers (green): edge points from the citing paper to the seed.
for _, paper in citations.iterrows():
    paper_id = paper['id']
    if paper_id not in G:
        G.add_node(paper_id, **_paper_attrs(paper, 'green', 'citation'))
    G.add_edge(paper_id, paper['seed_paper_id'])

2. 文本向量化

In [5]:
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import torch
import numpy as np

# Initialise BERT: load tokenizer and model from the local checkpoint
# directory and move the model to the GPU when CUDA is available.
bert_dir = '../models/bert-base-uncased'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained(bert_dir)
model = BertModel.from_pretrained(bert_dir).to(device)

In [6]:
# Batched BERT vectorization: embed each node's `bert_abstract` text and store
# the [CLS] vector of the last hidden layer on the node as `bert_vector`.
batch_size = 32
nodes = list(G.nodes())

# Inference only: make sure dropout is disabled even if the model was switched
# to training mode somewhere else in the session.
model.eval()

for i in tqdm(range(0, len(nodes), batch_size)):
    batch_nodes = nodes[i:i+batch_size]
    texts = [G.nodes[n]['bert_abstract'] for n in batch_nodes]

    # Pad to the longest sequence in the batch instead of always padding to
    # 512 tokens. Padding positions are excluded through the attention mask,
    # so the [CLS] vectors are the same up to floating-point rounding, while
    # short abstracts no longer pay for a full 512-token forward pass.
    # Sequences longer than 512 tokens are still truncated.
    inputs = tokenizer(texts, return_tensors='pt',
                       truncation=True, max_length=512,
                       padding=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of every sequence is the [CLS] token.
    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    for node, vector in zip(batch_nodes, cls_embeddings):
        G.nodes[node]['bert_vector'] = vector

100%|██████████| 36/36 [15:41<00:00, 26.15s/it]


In [7]:
import pickle

# Save the whole network (nodes, edges and all node attributes, including the
# BERT vectors) as a pickle.
# NOTE: pickle files must only be loaded from trusted sources -- unpickling
# can execute arbitrary code.
with open('bert_vectorized_network_2018.pkl', 'wb') as f:
    pickle.dump(G, f)

# Save the vectorizer (model weights + tokenizer) for later use.
# Move every tensor to CPU first: pickling CUDA tensors produces a file that
# cannot be unpickled on a machine without a GPU. state_dict() returns a fresh
# mapping, so replacing its values does not touch the live model; the mapping
# type and its keys are kept as they are.
cpu_state = model.state_dict()
for key in list(cpu_state):
    cpu_state[key] = cpu_state[key].cpu()

with open('bert_vectorizer_model_2018.pkl', 'wb') as f:
    pickle.dump({
        'bert_model': cpu_state,
        'bert_tokenizer': tokenizer
    }, f)