# 数据收集

查找以下种类的论文

- Computer Science
- Medicine
- Physics
- Social Sciences
- Biology

引用量分配

- Low-cited: 30% seeds，引用数区间 1–5
- Mid-cited: 50% seeds，引用数区间 6–30
- High-cited: 20% seeds，引用数区间 >30


In [1]:
import requests
import pandas as pd
from time import sleep
import os

# Root URL of the Semantic Scholar Graph API; endpoint paths are appended to it.
BASE_URL = 'https://api.semanticscholar.org/graph/v1'
# API key taken from the API_KEY environment variable (None when it is unset).
API_KEY = os.environ.get('API_KEY')
# Fixed random seed so that repeated runs draw the same samples.
random_state = 42

In [2]:
def getPaperByID(id, timeout=30):
    """Fetch the detailed record of one paper from the Semantic Scholar API.

    Args:
        id: Paper identifier; it is interpolated into the ``/paper/{id}`` URL.
            (The name shadows the builtin but is kept for existing callers.)
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Each field is listed exactly once; the joined string is sent as the
    # ``fields`` query parameter.
    fields = (
        "paperId", "title", "abstract", "authors.name", "authors.affiliations",
        "year", "publicationDate", "venue", "citationCount", "referenceCount",
        "influentialCitationCount", "fieldsOfStudy", "s2FieldsOfStudy", "url",
        "openAccessPdf", "externalIds", "corpusId", "isOpenAccess",
        "publicationTypes", "journal", "citationStyles",
        "embedding.specter_v2", "tldr",
    )
    url = f'{BASE_URL}/paper/{id}'
    # Only send the key header when a key is actually configured.
    headers = {"x-api-key": API_KEY} if API_KEY else None
    resp = requests.get(
        url,
        params={"fields": ",".join(fields)},
        headers=headers,
        timeout=timeout,  # without a timeout a stalled connection blocks forever
    )
    resp.raise_for_status()
    return resp.json()

In [5]:
def getCitations(id, limit=10, timeout=30):
    """Fetch citations of a paper, with the title and citation contexts of each.

    Args:
        id: Paper identifier; it is interpolated into the
            ``/paper/{id}/citations`` URL.
        limit: Maximum number of citation entries to request (default 10).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under the ``"data"`` key of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If the response has no ``"data"`` key.
    """
    url = f"{BASE_URL}/paper/{id}/citations"
    # The paper is already identified by the URL path, so it is not repeated
    # as a query parameter.
    params = {
        "fields": "title,contexts",
        "limit": limit,
    }
    # Authenticate the same way getPaperByID does, when a key is configured.
    headers = {"x-api-key": API_KEY} if API_KEY else None
    resp = requests.get(url, params=params, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return resp.json()["data"]

### 被引论文收集


查找5个分类，每个分类20个

高引论文4篇，中引论文10篇，低引论文6篇（合计20篇，对应 20% / 50% / 30%）

2024年之后的论文

出版类型为期刊论文

论文记录：DOI，标题，一作，被引数量，摘要

In [6]:
# Citation thresholds per field of study. getSampleByStudy reads each entry:
# "low", "medium" and "high" are the minimum citation counts of the three
# tiers, and a tier ends just below the next tier's threshold.
category = [
    {"study": "Computer Science", "low": 1, "medium": 11, "high": 30},
    {"study": "Mathematics", "low": 1, "medium": 5, "high": 20},
    {"study": "Engineering", "low": 1, "medium": 10, "high": 50},
    {"study": "Biology", "low": 1, "medium": 20, "high": 100},
    {"study": "Medicine", "low": 1, "medium": 30, "high": 150},
]

In [7]:
def getPapersByStudyAndCiation(study: str, citation: int, timeout: float = 30) -> pd.DataFrame:
    """Search for journal articles from 2024 onwards in one field of study.

    Only one request is made, so only the first batch of results returned by
    the bulk-search endpoint is used. No ``sort`` parameter is sent, so the
    rows come back in the endpoint's default order.

    Args:
        study: Value for the ``fieldsOfStudy`` filter.
        citation: Value for the ``minCitationCount`` filter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame built from the ``"data"`` list of the response. When the
        list is empty, the frame is empty but still has the requested field
        columns, so column-based filtering by callers keeps working.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    fields = "title,year,citationCount"
    url = f"{BASE_URL}/paper/search/bulk"
    params = {
        "fieldsOfStudy": study,
        "minCitationCount": citation,
        "year": "2024-",
        "fields": fields,
        "publicationTypes": "JournalArticle",
    }
    # Authenticate the same way getPaperByID does, when a key is configured.
    headers = {"x-api-key": API_KEY} if API_KEY else None
    resp = requests.get(url, params=params, headers=headers, timeout=timeout)
    resp.raise_for_status()
    records = resp.json()["data"]
    if not records:
        # pd.DataFrame([]) would have no columns at all, and a later
        # df['citationCount'] lookup would fail with KeyError.
        return pd.DataFrame(columns=fields.split(","))
    return pd.DataFrame(records)

In [8]:
def _sampleCitationTier(study, tier, lower, upper, n):
    """Fetch papers with at least ``lower`` citations, keep those below
    ``upper`` (no cap when ``upper`` is None), draw ``n`` of them and tag the
    rows with ``tier`` in a ``citationCategory`` column."""
    papers = getPapersByStudyAndCiation(study, lower)
    if upper is not None and not papers.empty:
        papers = papers[papers['citationCount'] < upper]
    if len(papers) < n:
        # DataFrame.sample would raise ValueError here as well; raise it
        # ourselves so the message names the field and tier that fell short.
        raise ValueError(
            f"Cannot sample {n} '{tier}'-cited papers for {study!r}: "
            f"only {len(papers)} available"
        )
    picked = papers.sample(n=n, random_state=random_state)
    picked['citationCategory'] = tier
    return picked


def getSampleByStudy(params, sizes=None):
    """Draw a citation-stratified sample of papers for one field of study.

    Args:
        params: Mapping with the keys ``'study'`` (field of study) and
            ``'low'``, ``'medium'``, ``'high'`` (minimum citation count of
            each tier). A tier ends just below the next tier's threshold;
            the high tier has no upper bound.
        sizes: Optional mapping from tier name to sample size. Tiers that are
            not given keep the defaults of 6 low, 10 medium and 4 high.

    Returns:
        One DataFrame holding the low, medium and high samples in that order,
        each row labelled in the ``citationCategory`` column.

    Raises:
        ValueError: If a tier contains fewer papers than its sample size.
    """
    study = params['study']
    wanted = {"low": 6, "medium": 10, "high": 4}
    if sizes is not None:
        wanted.update(sizes)
    # (tier name, inclusive lower threshold, exclusive upper threshold)
    tiers = (
        ("low", params['low'], params['medium']),
        ("medium", params['medium'], params['high']),
        ("high", params['high'], None),
    )
    samples = []
    for position, (tier, lower, upper) in enumerate(tiers):
        if position:
            sleep(1)  # one-second pause between consecutive API requests
        samples.append(_sampleCitationTier(study, tier, lower, upper, wanted[tier]))
    return pd.concat(samples)

In [9]:
# Collect the stratified sample of every configured field of study and stack
# the per-field frames into one table.
arr = []
for item in category:
    # Label each sampled row with the field of study it was drawn for.
    arr.append(getSampleByStudy(item).assign(study=item['study']))
    sleep(1)  # one-second pause before moving on to the next field
df = pd.concat(arr)

In [10]:
# Replace the index carried over from the concatenated samples with a fresh
# 0..N-1 range; the old index values are discarded rather than kept as a column.
df = df.reset_index(drop=True)

In [11]:
# Save the sampled papers to an Excel workbook (path is relative to the
# current working directory).
df.to_excel('sample.xlsx')