#### 获取待下载论文的网页链接

In [6]:
# Journals whose article page links are collected first.
journals = [
    'INTERNATIONAL JOURNAL OF INFORMATION MANAGEMENT',
    'GOVERNMENT INFORMATION QUARTERLY',
    'INFORMATION PROCESSING & MANAGEMENT',
    'TELEMATICS AND INFORMATICS',
    'INFORMATION & MANAGEMENT',
    'Journal of Informetrics',
    'Information and Organization',
    'JOURNAL OF STRATEGIC INFORMATION SYSTEMS',
    'JOURNAL OF THE AMERICAN MEDICAL INFORMATICS ASSOCIATION',
    'TELECOMMUNICATIONS POLICY',
    'LIBRARY & INFORMATION SCIENCE RESEARCH',
]

In [3]:
# Smoke-test the ScienceDirect search endpoint with a single PUT request.
import json
import os

import requests

url = 'https://api.elsevier.com/content/search/sciencedirect'

# NOTE(review): these credentials are committed in plain text. They should be
# rotated and supplied through the environment; the literals remain only as
# fallbacks so the notebook keeps working when the variables are not set.
headers = {
    "X-ELS-APIKEY": os.environ.get(
        'ELS_APIKEY', '5245a5ea0be97233a73906d8b805c460'),
    "X-ELS-Insttoken": os.environ.get(
        'ELS_INSTTOKEN', '808e542de03a7a118f187de5081d8521'),
}

# Title and journal name are each wrapped in literal double quotes
# (presumably to request a phrase match - confirm against the API docs).
data = {
    'title':"\"Predicting the evolution of scientific communities by interpretable machine learning approaches\"",
    'pub':"\"Journal of Informetrics\"",
}

# A timeout keeps a stalled connection from blocking the cell indefinitely.
response = requests.put(url, data=json.dumps(data), headers=headers, timeout=30)

if response.status_code != 200:
    # An error body is not guaranteed to be JSON; fall back to the raw text
    # instead of raising while trying to report the failure.
    try:
        print(response.json())
    except ValueError:
        print(response.text)

##### 单个期刊上的处理

和LIS期刊上处理方式一致，但由于计算机学科论文数量较多，因此需要进行过滤

    1. 每个期刊先过滤，保留下来1000条左右的数据
    2. 获取Pii, 使用采集器采集数据
    3. 过滤掉爬取不到highlight的文章，大概每个期刊保留下800条左右数据
    4. 再进一步对每个期刊的论文数过滤，尽量和LIS的数据量保持一致

In [14]:
# Load the raw PR export and narrow it down to the rows that can be used.
import pandas as pd

df = pd.read_csv('../data/Elsevier-CS/PR/PR.csv')

# Two ways of processing:
#   1. crawl the Pii first -> ['Title', 'Year', 'Abstract', 'Author Keywords', 'Pii']
#   2. filter first (the one used here)
columns = ['Title', 'Year', 'Abstract', 'Author Keywords']
df = df.loc[:, columns]
print(len(df))

# Discard rows that have a missing value in any of the kept columns.
df = df.dropna()
print(len(df))

# Discard rows whose abstract is only the "not available" placeholder.
has_abstract = df['Abstract'].ne('[No abstract available]')
df = df.loc[has_abstract]
print(len(df))
df.head()

5031
5016
5016


Unnamed: 0,Title,Year,Abstract,Author Keywords
0,A self-adaptive soft-recoding strategy for per...,2023,The technique of error-correcting output codes...,Error-correcting output code; Multiclass class...
1,WSDS-GAN: A weak-strong dual supervised learni...,2023,Underwater Image Enhancement (UIE) is a crucia...,CycleGAN; Deep learning; Two-stage learning; U...
2,Construction of a feature enhancement network ...,2023,"Limited by the size, location, number of sampl...",Collision detection; FENet; Granular computing...
3,Haar wavelet downsampling: A simple but effect...,2023,Downsampling operations such as max pooling or...,Downsampling; Haar wavelet; Information entrop...
4,Underwater object classification combining SAS...,2023,Combining synthetic aperture sonar (SAS) image...,Contour-based features; Feature extraction; Fo...


In [15]:
# Select the papers whose number of author keywords lies within a range.
keywords = df['Author Keywords'].tolist()
# Keywords are ';'-separated, so the number of pieces is the keyword count.
keyword_lengths = [len(keyword.split(';')) for keyword in keywords]

# Inclusive bounds on the keyword count. They are deliberately not called
# `min` / `max`: rebinding those names would hide the builtins for every
# cell executed afterwards in the same session.
min_keywords = 3
max_keywords = 6

# One flag per row: True when that row's keyword count is inside the bounds.
in_range = [min_keywords <= length <= max_keywords for length in keyword_lengths]
cnt = sum(in_range)

print(cnt)
print(cnt/len(keyword_lengths))

# Titles of the papers whose keyword count is inside the bounds.
titles = df['Title'].tolist()
sub_titles = [title for title, keep in zip(titles, in_range) if keep]
print(len(sub_titles))
print(sub_titles[:5])

4634
0.9238437001594896
4634
['A self-adaptive soft-recoding strategy for performance improvement of error-correcting output codes', 'WSDS-GAN: A weak-strong dual supervised learning method for underwater image enhancement', 'Construction of a feature enhancement network for small object detection', 'Haar wavelet downsampling: A simple but effective downsampling module for semantic segmentation', 'Underwater object classification combining SAS and transferred optical-to-SAS Imagery']


In [16]:
# Keep only the rows whose title was selected, then save the filtered set.
selected_rows = df['Title'].isin(sub_titles)
df = df[selected_rows]
df.to_excel('../data/Elsevier-CS/PR/PR_filter.xlsx', index=False)

In [26]:
# Keep about 1200 rows, covering 2012-2023 at roughly 100 rows per year.
df = pd.read_excel('../data/Elsevier-CS/IF/IF_filter.xlsx')
print(len(df))
year_counts = df['Year'].value_counts()

# reserved_size could also be 1000; the remainder can be trimmed while
# crawling the Pii or during later filtering.
reserved_size = 1200

# Never ask for more rows than exist: a ratio above 1 would make
# sample(replace=False) fail for years that hold fewer rows than requested.
sampling_ratio = reserved_size/len(df)
if sampling_ratio > 1:
    sampling_ratio = 1.0

# Per-year quota, proportional to the year's share and truncated to an int.
reserved_counts = year_counts*sampling_ratio
reserved_counts = reserved_counts.astype(int)

# Fixed seed so that re-running the cell draws the same rows.
sample_seed = 42

# Collect the per-year samples and concatenate once, rather than growing a
# DataFrame inside the loop starting from an empty frame.
sampled_frames = []
for year, count in reserved_counts.items():
    year_df = df[df['Year'] == year]
    sampled_frames.append(
        year_df.sample(n=count, replace=False, random_state=sample_seed))

if sampled_frames:
    reserved_df = pd.concat(sampled_frames, ignore_index=True)
else:
    reserved_df = df.iloc[0:0]

print(len(reserved_df))
# NOTE: this writes to the same file that was read at the top of the cell, so
# every re-run samples from the already reduced data.
reserved_df.to_excel('../data/Elsevier-CS/IF/IF_filter.xlsx', index=False)

1252
1194


In [12]:
import pandas as pd

# Start with an empty Pii list; the fetch cell appends one entry per title.
piis = []

# Titles of the filtered PR papers, in file order.
df = pd.read_excel('../data/Elsevier-CS/PR/PR_filter.xlsx')
titles = df['Title'].tolist()

In [8]:
# Fill in the Pii of every title (second processing approach).
from tqdm import tqdm

# Resume after the titles that already have an entry, so the cell can be
# re-run after an interruption without repeating finished requests.
offset = len(piis)

for title in tqdm(titles[offset:]):
    # Only the title is queried; a 'pub' (journal) constraint is not sent.
    data = {
        'title':title,
    }

    # A network failure or timeout is recorded as 'error' for this title
    # instead of aborting the whole run.
    try:
        response = requests.put(
            url, data=json.dumps(data), headers=headers, timeout=30)
    except requests.RequestException:
        piis.append('error')
        continue

    if response.status_code != 200:
        piis.append('error')
        continue

    # Parse the body once; a body that is not valid JSON counts as an error.
    try:
        payload = response.json()
    except ValueError:
        piis.append('error')
        continue

    results = payload.get('results') or []
    if payload.get('resultsFound', 0) == 0 or not results:
        piis.append('not found')
    else:
        # The first hit is taken as the match for this title.
        piis.append(results[0].get('pii', 'not found'))

100%|██████████| 1193/1193 [14:49<00:00,  1.34it/s]


In [14]:
prefix_url = 'https://www.sciencedirect.com/science/article/pii/'
df['Pii'] = piis

# Remove the rows holding a placeholder instead of a real Pii, one
# placeholder at a time, printing the remaining row count after each step.
for placeholder in ('not found', 'error'):
    df = df[df['Pii'] != placeholder]
    print(len(df))

1192
1064


In [15]:
# Prepend the article URL prefix to every Pii, then save the result.
df = df.assign(Pii=df['Pii'].map(lambda pii: prefix_url + pii))
df.to_excel('../data/Elsevier-CS/PR/PR_filter.xlsx', index=False)

##### 利用采集器

In [1]:
import pandas as pd

df_meta = pd.read_excel('../data/Elsevier-CS/IF/IF_filter.xlsx')
df_crawl = pd.read_excel('../data/Elsevier-CS/IF/IF_highlight.xlsx')

# Remove the 'abs/' fragment from the crawled links so that they can be
# compared with the 'Pii' column of the metadata.
piis_crawl = [raw_pii.replace('abs/', '') for raw_pii in df_crawl['Pii']]
df_crawl['Pii'] = piis_crawl

# Join metadata and crawled data on the shared 'Pii' column.
df = df_meta.merge(df_crawl, on='Pii')

# Row counts: crawled data, metadata, merged result.
for frame in (df_crawl, df_meta, df):
    print(len(frame))

1193
1193
1197


In [2]:
# Drop rows with missing values, then exact duplicate rows, printing the
# remaining row count after each step.
df = df.dropna()
print(len(df))
df = df.drop_duplicates()
print(len(df))

highlights = df['Highlights'].tolist()

# Sanity check: count the entries that are floats rather than text (a NaN
# would be one). isinstance is used instead of an exact type comparison so
# that float subclasses are counted as well.
empty_nums = sum(isinstance(text, float) for text in highlights)

print(empty_nums)

1035
1033
0


In [3]:
# Turn each crawled highlights text into a ';'-separated list of its bullets.
new_highlights = []
for raw_text in highlights:
    # Splitting on the bullet and dropping the first piece already discards
    # whatever precedes the first bullet (such as a heading). The text is no
    # longer passed through replace('highlights', ''), because that also
    # deleted the word wherever it occurred inside a bullet sentence.
    items = raw_text.split("•")[1:]
    new_highlights.append(';'.join(items))

df['Highlights'] = new_highlights
df.to_excel('../data/Elsevier-CS/IF/IF_expe.xlsx', index=False)

##### 领域数据集合并

In [4]:
import pandas as pd

paths = ['PR/PR_expe.xlsx', 'JSS/JSS_expe.xlsx', 'JPDC/JPDC_expe.xlsx','IF/IF_expe.xlsx']
prefix_path = '../data/Elsevier-CS'

# Read every per-journal file, printing the running row total after each,
# and concatenate once at the end instead of growing a DataFrame inside the
# loop starting from an empty frame.
frames = []
total_rows = 0
for path in paths:
    frame = pd.read_excel(prefix_path + '/' + path)
    frames.append(frame)
    total_rows += len(frame)
    print(total_rows)

df = pd.concat(frames, ignore_index=True)

# Drop rows with missing values, then exact duplicate rows.
df = df.dropna()
df = df.drop_duplicates()
print(len(df))
df.head()

954
1948
2950
3983
3982


Unnamed: 0,Title,Year,Abstract,Author Keywords,Pii,Highlights
0,Factorized multi-Graph matching,2023,"In recent years, multi-graph matching has beco...",Factorization; Graph matching; Multi-graph mat...,https://www.sciencedirect.com/science/article/...,The equivalence between the two kinds of multi...
1,Invariance encoding in sliced-Wasserstein spac...,2023,Deep convolutional neural networks (CNNs) are ...,Generative model; Invariance learning; Mathema...,https://www.sciencedirect.com/science/article/...,We present a mathematical framework to learn i...
2,RA-YOLOX: Re-parameterization align decoupled ...,2023,YOLOX is a state-of-the-art one-stage object d...,Decoupled head; Label assignment; Object detec...,https://www.sciencedirect.com/science/article/...,We propose a lightweight RepA decoupled head t...
3,Weakly Supervised Instance Segmentation via Ca...,2023,Deep convolutional neural networks (DCNN) trai...,Centerness; Coarse localization annotation; In...,https://www.sciencedirect.com/science/article/...,A novel two-branch DCNN is constructed to perf...
4,Multi-label feature selection via robust flexi...,2023,Multi-label feature selection is an efficient ...,Classification; Feature selection; Multi-label...,https://www.sciencedirect.com/science/article/...,A regularization norm named robust flexible sp...


In [6]:
"""
    防止有部分文章的关键词数量在6个以上，此处再进行一次过滤
    此外，这里也可以对数据量进一步抽样，保持和LIS数据量大体上一致
    LIS: 2589   CS: 3000
"""
# Re-apply the keyword-count filter to the combined data and save it.
keywords = df['Author Keywords'].tolist()
# Keywords are ';'-separated, so the number of pieces is the keyword count.
keyword_lengths = [len(keyword.split(';')) for keyword in keywords]

# Inclusive bounds on the keyword count. They are deliberately not called
# `min` / `max`: rebinding those names would hide the builtins for every
# cell executed afterwards in the same session.
min_keywords = 3
max_keywords = 6

# One flag per row: True when that row's keyword count is inside the bounds.
in_range = [min_keywords <= length <= max_keywords for length in keyword_lengths]
cnt = sum(in_range)

print(cnt)
print(cnt/len(keyword_lengths))

# Titles of the papers whose keyword count is inside the bounds.
titles = df['Title'].tolist()
sub_titles = [title for title, keep in zip(titles, in_range) if keep]

# Filter with the per-row flags rather than by title membership: two rows
# sharing a title would otherwise both be kept even when only one of them
# has a keyword count inside the bounds.
df = df[in_range]
print(len(df))
df.to_excel('../data/Elsevier-CS/Texts.xlsx', index=False)

3982
1.0
3982


In [7]:
# Down-sample the combined data to about reserved_size rows while keeping
# each year's share of the total.
year_counts = df['Year'].value_counts()

# reserved_size could also be 1000; the remainder can be trimmed while
# crawling the Pii or during later filtering.
reserved_size = 3000

# Never ask for more rows than exist: a ratio above 1 would make
# sample(replace=False) fail for years that hold fewer rows than requested.
sampling_ratio = reserved_size/len(df)
if sampling_ratio > 1:
    sampling_ratio = 1.0

# Per-year quota, proportional to the year's share and truncated to an int.
reserved_counts = year_counts*sampling_ratio
reserved_counts = reserved_counts.astype(int)

# Fixed seed so that re-running the cell draws the same rows.
sample_seed = 42

# Collect the per-year samples and concatenate once, rather than growing a
# DataFrame inside the loop starting from an empty frame.
sampled_frames = []
for year, count in reserved_counts.items():
    year_df = df[df['Year'] == year]
    sampled_frames.append(
        year_df.sample(n=count, replace=False, random_state=sample_seed))

if sampled_frames:
    reserved_df = pd.concat(sampled_frames, ignore_index=True)
else:
    reserved_df = df.iloc[0:0]

print(len(reserved_df))
reserved_df.to_excel('../data/Elsevier-CS/Texts_3000.xlsx', index=False)

2996


In [8]:
import json

keywords = reserved_df['Author Keywords'].tolist()
piis = reserved_df['Pii'].tolist()

# Map every Pii value to the list of its author keywords: the keyword text
# is split on ';' and each piece is stripped of surrounding whitespace.
pii_to_words = {
    pii: [word.strip() for word in keyword_text.split(';')]
    for pii, keyword_text in zip(piis, keywords)
}

with open('../data/Elsevier-CS/Keywords.json', 'w') as f:
    json.dump(pii_to_words, f)