#### 获取待下载论文的网页链接

In [6]:
# Journals whose article pages will be looked up.
# The two candidate lists used to be assigned to the same name, so the first
# one was silently discarded; they now have their own names and the active
# choice is made explicitly at the bottom.

# 1. Hand-picked list (kept for reference, not the active selection)
journals_self_picked = ['INTERNATIONAL JOURNAL OF INFORMATION MANAGEMENT',
                        'GOVERNMENT INFORMATION QUARTERLY',
                        'INFORMATION PROCESSING & MANAGEMENT',
                        'TELEMATICS AND INFORMATICS',
                        'INFORMATION & MANAGEMENT',
                        'Journal of Informetrics',
                        'Information and Organization',
                        'JOURNAL OF STRATEGIC INFORMATION SYSTEMS',
                        'JOURNAL OF THE AMERICAN MEDICAL INFORMATICS ASSOCIATION',
                        'TELECOMMUNICATIONS POLICY',
                        'LIBRARY & INFORMATION SCIENCE RESEARCH']

# 2. List provided by the JIS paper; JCR categories for reference:
#    https://jcr.clarivate.com/jcr/browse-category-list
journals_jis = ['Government Information Quarterly',
                'Information & Management',
                'Information and Organization',
                'International Journal of Information Management',
                'Library & Information Science Research',
                'Telematics and Informatics',
                'The Journal of Academic Librarianship']

# Active selection (a copy, so editing `journals` leaves the source list intact).
journals = list(journals_jis)

# Optional: hyphenate the names.
# journals = [journal.replace(' ','-') for journal in journals]

In [6]:
# Try out the search endpoint: the query is sent with HTTP PUT as a JSON body.
import json
import os

import requests

url = 'https://api.elsevier.com/content/search/sciencedirect'

# Credentials are read from the environment so they are never stored in the
# notebook. Export ELSEVIER_API_KEY and ELSEVIER_INSTTOKEN before starting the
# kernel; a missing variable raises KeyError here instead of failing later
# with an authentication error.
# NOTE(review): the key and token that used to be written in this cell should
# be treated as exposed and rotated.
headers = {
    "X-ELS-APIKEY": os.environ["ELSEVIER_API_KEY"],
    "X-ELS-Insttoken": os.environ["ELSEVIER_INSTTOKEN"],
}

# The inner double quotes are part of the value that is sent.
data = {
    'title':"\"Predicting the evolution of scientific communities by interpretable machine learning approaches\"",
    'pub':"\"Journal of Informetrics\"",
}

# A timeout keeps a stalled connection from blocking the kernel indefinitely.
response = requests.put(url, data=json.dumps(data), headers=headers, timeout=30)

# Only failures are printed; a 200 reply is left in `response` for inspection.
if response.status_code != 200:
    print(response.json())

##### 单个期刊上的处理

In [None]:
# Look up the PII of each paper from its title.
#
# There are two ways to go about it:
#
#     1. fetch the PII of every paper first, then crawl only the ones that
#        meet the requirements;
#     2. filter the papers that meet the requirements first, then fetch
#        their PIIs.
#
#     The second is the better fit: the first produces a large number of
#     PIIs that are never used, which wastes resources.
import pandas as pd
from tqdm import tqdm

data_path = '../data/Elsevier-LIS/JOI/JOI.xlsx'
df = pd.read_excel(io=data_path)
titles = df.loc[:, 'Title'].tolist()
# Filled by the lookup loop, one entry per title in the same order.
piis = list()

In [24]:
# Approach 1: look up a PII for every title.
# `piis` only ever grows, so re-running this cell after an interruption
# resumes at the first title that has no entry yet.
offset = len(piis)

for title in tqdm(titles[offset:]):
    data = {
        'title':title,
        'pub':"\"Journal of Informetrics\"",
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.put(url, data=json.dumps(data), headers=headers, timeout=30)
    except requests.RequestException:
        # Recorded like a non-200 reply so that positions in `piis` stay
        # aligned with positions in `titles`.
        piis.append('error')
        continue

    if response.status_code != 200:
        piis.append('error')
        continue

    # Decode the body once and reuse it for both checks.
    result = response.json()
    if result['resultsFound'] == 0:
        piis.append('not found')
    else:
        # Only the first hit is kept.
        piis.append(result['results'][0]['pii'])

# One entry per line, including the 'error' / 'not found' placeholders.
with open('joi-paper-links.txt', 'w') as f:
    f.writelines(pii + '\n' for pii in piis)

100%|██████████| 145/145 [02:34<00:00,  1.06s/it]


In [10]:
# Load one journal's metadata table and drop the records that cannot be used.
import pandas as pd

# Inputs for the other journals; exactly one load line is active at a time.
# df = pd.read_excel('../data/Elsevier-LIS/JOI/JOI.xlsx')
# df = pd.read_csv('../data/Elsevier-LIS/LISR/LISR.csv')
# df = pd.read_csv('../data/Elsevier-LIS/IM/IM.csv')
# df = pd.read_csv('../data/Elsevier-LIS/IJIM/IJIM.csv')
df = pd.read_csv('../data/Elsevier-LIS/TI/TI.csv')

# Approach 1 (PIIs fetched beforehand) would also keep the 'Pii' column:
# columns = ['Title', 'Year', 'Abstract', 'Author Keywords','Pii']
# Approach 2 (filter first) keeps the metadata columns only:
columns = ['Title', 'Year', 'Abstract', 'Author Keywords']
df = df.loc[:, columns]
print(len(df))
# Discard every row that has a missing value in any retained column.
df = df.dropna(how='any')
print(len(df))
# Discard rows whose abstract is just the placeholder text.
df = df.loc[df['Abstract'].ne('[No abstract available]')]
print(len(df))
df.head()

1146
1115
1113


Unnamed: 0,Title,Year,Abstract,Author Keywords
0,Exploring the relations of subjective and obje...,2023,We collected objective digital trace Instagram...,Instagram; Mental health; Objective measuremen...
1,I'll follow the fun: The extended investment m...,2022,Applying the Investment Model to the social me...,Content expertise of influencer; Influencer pl...
2,Social support for digital inclusion of women ...,2022,Despite strong evidence of its critical role i...,Digital inclusion; ICT; Social support; South ...
3,Emoji and visual complexity in health informat...,2023,Social media is a valuable tool that enables p...,Emoji; Health communication; Perceived enjoyme...
4,Who (and with whom) uses more emoji? Exploring...,2023,"Emoji use, despite being pervasive in digital ...",Computer-mediated communication; Context; Emoj...


In [11]:
# Keep the papers whose number of author keywords lies in a given range.
keywords = df['Author Keywords'].tolist()
# Keywords are ';'-separated inside one cell, so the piece count is the keyword count.
keyword_lengths = [len(keyword.split(';')) for keyword in keywords]

# Inclusive bounds. They are deliberately not called `min` / `max`: assigning
# to those names hides the builtins for every cell run afterwards.
min_keywords = 3
max_keywords = 6
cnt = sum(min_keywords <= length <= max_keywords for length in keyword_lengths)

print(cnt)
print(cnt/len(keyword_lengths))

# Titles of the papers whose keyword count is inside the range.
titles = df['Title'].tolist()
sub_titles = [
    title
    for title, length in zip(titles, keyword_lengths)
    if min_keywords <= length <= max_keywords
]
print(len(sub_titles))
print(sub_titles[:5])

986
0.8858939802336029
986
["Exploring the relations of subjective and objective Instagram use on young adults' mental health", "I'll follow the fun: The extended investment model of social media influencers", 'Social support for digital inclusion of women in South African townships', 'Emoji and visual complexity in health information design: A moderated serial mediation model', 'Who (and with whom) uses more emoji? Exploring individual, relational, and motivational characteristics driving emoji use']


In [12]:
# Keep the rows whose title appears in `sub_titles`, then save them without
# the index column.
df = df.loc[df['Title'].isin(sub_titles)]
df.to_excel('../data/Elsevier-LIS/TI/TI_filter.xlsx', index=False)

In [13]:
# Reload the filtered table from disk and start an empty PII list for it.
df = pd.read_excel(io='../data/Elsevier-LIS/TI/TI_filter.xlsx')
titles = df.loc[:, 'Title'].tolist()
piis = list()

In [14]:
# Fill in the PIIs.
# Approach 2: only the already filtered titles are looked up.
from tqdm import tqdm

# `piis` only ever grows, so re-running this cell resumes where it stopped.
offset = len(piis)

for title in tqdm(titles[offset:]):
    data = {
        'title':title,
        # 'pub':"\"Journal of Informetrics\"",
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.put(url, data=json.dumps(data), headers=headers, timeout=30)
    except requests.RequestException:
        # Recorded like a non-200 reply so that positions in `piis` stay
        # aligned with positions in `titles`.
        piis.append('error')
        continue

    if response.status_code != 200:
        piis.append('error')
        continue

    # Decode the body once and reuse it for both checks.
    result = response.json()
    if result['resultsFound'] == 0:
        piis.append('not found')
    else:
        # Only the first hit is kept.
        piis.append(result['results'][0]['pii'])

100%|██████████| 986/986 [13:50<00:00,  1.19it/s]


In [15]:
# Attach the looked-up PIIs and discard the titles the search did not find.
# Rows marked 'error' are not removed by this filter.
prefix_url = 'https://www.sciencedirect.com/science/article/pii/'
df = df.assign(Pii=piis)
df = df.loc[df['Pii'].ne('not found')]
print(len(df))

983


In [16]:
# Turn each bare PII into a full article URL and overwrite the filtered table.
# `assign` returns a new frame instead of writing a column into the
# row-filtered `df`, which is the pattern pandas reports as
# SettingWithCopyWarning.
# NOTE(review): this cell is not idempotent: running it twice prefixes the
# URLs twice. Rows still marked 'error' receive the prefix as well.
df = df.assign(Pii=[prefix_url + pii for pii in df['Pii'].tolist()])
df.to_excel('../data/Elsevier-LIS/TI/TI_filter.xlsx', index=False)

##### 合并数据上的处理

In [32]:
# Merge all the data: load every journal's CSV and keep the shared columns.
import os

columns = ['Title', 'Year', 'Abstract', 'Author Keywords', "Source title"]

df_IJIM = pd.read_csv('../data/Elsevier-LIS/csv/scopus-ijim.csv')
df_IPM = pd.read_csv('../data/Elsevier-LIS/csv/scopus-IPM.csv')
df_GIQ = pd.read_csv('../data/Elsevier-LIS/csv/scopus-GIQ.csv')
df_IO = pd.read_csv('../data/Elsevier-LIS/csv/scopus-IO.csv')
df_TP = pd.read_csv('../data/Elsevier-LIS/csv/scopus-TP.csv')

# Same column subset, in the same order, for every journal.
df_list = [
    frame[columns]
    for frame in (df_IJIM, df_IPM, df_GIQ, df_IO, df_TP)
]
df_list[0].head()

Unnamed: 0,Title,Year,Abstract,Author Keywords,Source title
0,Organizational resilience and digital resource...,2023,Recent events have renewed attention to how or...,COVID-19; Digital resources; Exogenous shock; ...,International Journal of Information Management
1,A new perspective of BDA and information quali...,2023,Although organizational factors related to big...,Balanced scorecard; Big data analytics; Inform...,International Journal of Information Management
2,Social media platforms and social enterprise: ...,2023,A considerable number of contributions at the ...,Bibliometric analysis; Literature review; Soci...,International Journal of Information Management
3,How social media live streams affect online bu...,2023,Social media live streams (SMLSs) are becoming...,E-commerce; Live stream; Social media; Social ...,International Journal of Information Management
4,The effects of trust on behavioral intention a...,2022,As electronic transactions between governments...,Behavioral intention; E-government; E-governme...,International Journal of Information Management


In [33]:
# Stack the per-journal frames under one fresh index and save the result.
df = pd.concat(objs=df_list, axis=0, ignore_index=True)
df.to_excel('../data/Elsevier-LIS/excel/all_data.xlsx', index=False)

In [None]:
# Drop the records without an abstract or without keywords, so that fewer
# highlights have to be crawled later.
# NOTE(review): no cell visible here writes a sheet named '2012-2023';
# presumably it was added to the workbook by hand. Confirm before re-running.
df = pd.read_excel(io='../data/Elsevier-LIS/excel/all_data.xlsx', sheet_name='2012-2023')
print(len(df))
# Discard every row that has a missing value in any column.
df = df.dropna(how='any')
print(len(df))
# Discard rows whose abstract is just the placeholder text.
df = df.loc[df['Abstract'].ne('[No abstract available]')]
print(len(df))
df.to_excel('../data/Elsevier-LIS/excel/expe_data.xlsx', index=False)

In [2]:
# Share of papers that have at least `min_keywords` author keywords.
import pandas as pd
df = pd.read_excel('../data/Elsevier-LIS/excel/expe_data.xlsx')
keywords = df['Author Keywords'].tolist()
# Keywords are ';'-separated inside one cell, so the piece count is the keyword count.
keyword_lengths = [len(keyword.split(';')) for keyword in keywords]

# Deliberately not called `min`: assigning to that name hides the builtin for
# every cell run afterwards.
min_keywords = 6
cnt = sum(length >= min_keywords for length in keyword_lengths)

print(cnt)
print(cnt/len(keyword_lengths))

# Titles of the papers that reach the threshold.
titles = df['Title'].tolist()
journals = df['Source title'].tolist()

# Filled by the lookup loop, one entry per element of `sub_titles`.
piis = []
sub_titles = [
    title
    for title, length in zip(titles, keyword_lengths)
    if length >= min_keywords
]
print(len(sub_titles))
print(sub_titles[:5])

2989
0.33770195458140323
2989
['The effects of trust on behavioral intention and use behavior within e-government contexts', 'Cross-sectional research: A critical perspective, use cases, and recommendations for IS research', 'Digital platforms and transformational entrepreneurship during the COVID-19 crisis', 'Examining the relationship between sociomaterial practices enacted in the organizational use of social media and the emerging role of organizational generativity', 'Pathways to developing information technology-enabled capabilities in born-digital new ventures']


In [10]:
from tqdm import tqdm
import requests

# Look up a PII for every selected title, resuming after the entries that
# `piis` already holds.
for title in tqdm(sub_titles[len(piis):]):
    data = {
        # The title is wrapped in double quotes before it is sent.
        'title':"\"" + title + "\"",
        # Restricting by journal is disabled. If it is re-enabled, note that
        # `journals` is indexed by row of the full table, not by position in
        # `sub_titles`, so it cannot be indexed with this loop's position.
        # 'pub':"\""+journals[i]+"\"",
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.put(url, data=json.dumps(data), headers=headers, timeout=30)
    except requests.RequestException:
        # Recorded like a non-200 reply so that positions in `piis` stay
        # aligned with positions in `sub_titles`.
        piis.append('error')
        continue

    if response.status_code != 200:
        piis.append('error')
        continue

    # Decode the body once and reuse it for both checks.
    result = response.json()
    if result['resultsFound'] == 0:
        piis.append('not found')
    else:
        # Only the first hit is kept.
        piis.append(result['results'][0]['pii'])

100%|██████████| 912/912 [16:41<00:00,  1.10s/it]


In [16]:
print(len(piis))
prefix_url = 'https://www.sciencedirect.com/science/article/pii/'
# Every entry gets the prefix, including the 'error' / 'not found' placeholders.
new_piis = [prefix_url + pii for pii in piis]

df = df[df['Title'].isin(sub_titles)]
# `assign` returns a new frame instead of writing a column into the
# row-filtered `df`, which is the pattern pandas reports as
# SettingWithCopyWarning.
# NOTE(review): rows are selected by title membership, so a title that occurs
# more than once could select more rows than there are PIIs; the length
# mismatch then raises ValueError here.
df = df.assign(Pii=new_piis)
df.to_excel('../data/Elsevier-LIS/excel/expe_data(Other).xlsx', index=False)

2989


In [None]:
import pandas as pd

# Count how many lookups failed and how many returned nothing.
df = pd.read_excel('../data/Elsevier-LIS/excel/expe_data(Other).xlsx')
piis = df['Pii'].tolist()

# Substring tests are used because each stored value is a URL prefix followed
# by either a PII or one of the placeholder words.
error_nums = sum(1 for pii in piis if 'error' in pii)
# An entry already counted as an error is not counted again here.
not_founds = sum(
    1 for pii in piis
    if 'error' not in pii and 'not found' in pii
)

print(error_nums)
print(not_founds)

##### 利用采集器

In [48]:
# Processing of the data after it has been crawled with the collector tool:
#
#     1. Link the original data and the crawled data through the PII.
#        The crawled PIIs do not always match the original ones, so the 'abs'
#        substring has to be removed from the crawled PIIs.
#     2. Remove the records whose highlight is empty.
#     3. Highlights and keywords need further processing; at the moment they
#        are mixed together.
import pandas as pd

df_crawl = pd.read_excel('../data/Elsevier-LIS/TI/TI_highlight.xlsx')
df_meta = pd.read_excel('../data/Elsevier-LIS/TI/TI_filter.xlsx')

# Step 1: normalise the crawled links, then join on them.
piis_crawl = [pii.replace('abs/', '') for pii in df_crawl['Pii'].tolist()]
df_crawl['Pii'] = piis_crawl

df = df_meta.merge(df_crawl, on='Pii')

# Row counts: crawled, metadata, joined.
print(len(df_crawl))
print(len(df_meta))
print(len(df))

983
983
983


In [49]:
# Remove incomplete rows and exact duplicate rows, printing the size after each step.
df = df.dropna()
print(len(df))
df = df.drop_duplicates()
print(len(df))

# Sanity check: count the highlight cells that are floats rather than text
# (presumably the form in which a missing cell would show up).
# `isinstance` also matches float subclasses such as numpy.float64, which an
# exact `type(...) == float` comparison does not.
highlights = df['Highlights'].tolist()
empty_nums = sum(isinstance(text, float) for text in highlights)

print(empty_nums)

876
876
0


In [50]:
def _split_highlights(text):
    """Return the bullet items of one highlights field joined with ';'.

    Everything in front of the first bullet character is discarded, which
    covers any heading text placed there. The earlier global
    ``replace('highlights', '')`` is intentionally gone: it also deleted the
    word from inside the bullet sentences (e.g. "this study highlights ...").
    """
    return ';'.join(text.split("•")[1:])


new_highlights = [_split_highlights(highlight) for highlight in highlights]

df['Highlights'] = new_highlights
df.to_excel('../data/Elsevier-LIS/TI/TI_expe.xlsx', index=False)

In [1]:
# All per-journal processing is finished at this point; the results can be
# merged here into one domain data set, from which the keyword collection is
# derived afterwards.
import pandas as pd

paths = ['IJIM/IJIM_expe.xlsx', 'JOI/JOI_expe.xlsx', 'LISR/LISR_expe.xlsx','TI/TI_expe.xlsx']
prefix_path = '../data/Elsevier-LIS'

# Read every file first and concatenate once. Growing a frame with `concat`
# inside the loop re-copies all earlier rows on each pass, and seeding it with
# an empty DataFrame is a pattern newer pandas versions warn about.
frames = []
running_total = 0
for path in paths:
    frame = pd.read_excel(prefix_path + '/' + path)
    frames.append(frame)
    # Same cumulative row count that the incremental concat used to print.
    running_total += len(frame)
    print(running_total)

df = pd.concat(frames, ignore_index=True)

df = df.dropna()
df = df.drop_duplicates()
print(len(df))
df.head()

881
1675
1768
2644
2640


Unnamed: 0,Title,Year,Abstract,Author Keywords,Pii,Highlights
0,Organizational resilience and digital resource...,2023,Recent events have renewed attention to how or...,COVID-19; Digital resources; Exogenous shock; ...,https://www.sciencedirect.com/science/article/...,Organizational resilience is increasingly impo...
1,A new perspective of BDA and information quali...,2023,Although organizational factors related to big...,Balanced scorecard; Big data analytics; Inform...,https://www.sciencedirect.com/science/article/...,Information quality as a new theoretical lens ...
2,Social media platforms and social enterprise: ...,2023,A considerable number of contributions at the ...,Bibliometric analysis; Literature review; Soci...,https://www.sciencedirect.com/science/article/...,Undertakes a two-pronged bibliometric analysis...
3,How social media live streams affect online bu...,2023,Social media live streams (SMLSs) are becoming...,E-commerce; Live stream; Social media; Social ...,https://www.sciencedirect.com/science/article/...,People use SMLSs to satisfy their need for val...
4,The effects of trust on behavioral intention a...,2022,As electronic transactions between governments...,Behavioral intention; E-government; E-governme...,https://www.sciencedirect.com/science/article/...,Performance and effort expectancy impact e-gov...


In [3]:
# Some JOI articles kept earlier have more than 6 keywords, so the keyword
# count filter is applied once more here.
keywords = df['Author Keywords'].tolist()
# Keywords are ';'-separated inside one cell, so the piece count is the keyword count.
keyword_lengths = [len(keyword.split(';')) for keyword in keywords]

# Inclusive bounds. They are deliberately not called `min` / `max`: assigning
# to those names hides the builtins for every cell run afterwards.
min_keywords = 3
max_keywords = 6
# One flag per row, in row order.
in_range = [min_keywords <= length <= max_keywords for length in keyword_lengths]
cnt = sum(in_range)

print(cnt)
print(cnt/len(keyword_lengths))

# Titles of the papers whose keyword count is inside the range.
titles = df['Title'].tolist()
sub_titles = [title for title, keep in zip(titles, in_range) if keep]

# Select by the per-row flags rather than by title membership: with
# `isin(sub_titles)` a row outside the range would be kept whenever another
# row with the same title is inside it.
df = df.loc[in_range]

2589
1.0


2589

In [5]:
import json

# Build a mapping from each article link to its list of trimmed author keywords.
keywords = df['Author Keywords'].tolist()
piis = df['Pii'].tolist()

# If the same link occurs more than once, the last occurrence wins.
pii_to_words = {
    pii: [word.strip() for word in keyword_field.split(';')]
    for pii, keyword_field in zip(piis, keywords)
}

with open('../data/Elsevier-LIS/Keywords.json', 'w') as f:
    f.write(json.dumps(pii_to_words))

# The texts are saved next to the keyword file.
df.to_excel('../data/Elsevier-LIS/Texts.xlsx', index=False)

##### 利用python框架爬取页面内容

待确认：Python 爬虫与 Java 爬虫相比，是否其中一种更容易被目标网站封禁？

In [1]:
# Load the article URLs, one per line.
with open('../data/Elsevier-AI/urls.txt', 'r') as f:
    # Strip only the line terminator. Slicing off the last character would
    # cut a real character from the final URL whenever the file does not end
    # with a newline.
    urls = [line.rstrip('\n') for line in f]

urls[:5]

['https://www.sciencedirect.com/science/article/pii/S0950705124001990',
 'https://www.sciencedirect.com/science/article/pii/S0950705124001783',
 'https://www.sciencedirect.com/science/article/pii/S0950705124001771',
 'https://www.sciencedirect.com/science/article/pii/S0950705124001989',
 'https://www.sciencedirect.com/science/article/pii/S0950705124001655']

In [4]:
# 自带requests反爬
# fake_useragent: https://www.cnblogs.com/Neeo/articles/11525001.html
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup

In [3]:
# Plain GET for the first URL with no custom headers; the recorded run of this
# cell shows status 403.
response = requests.get(url=urls[0])
response.status_code

403

In [None]:
# Parse the returned body so it can be inspected in the cell output.
soup = BeautifulSoup(markup=response.text, features='html.parser')
soup

In [None]:
# scrapy框架反爬

In [None]:
# Selenium attempt at getting past the anti-crawling check: load the article
# in a Chrome instance and read the rendered page source.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.get('https://www.sciencedirect.com/science/article/pii/S1751157723000317')

    wait = WebDriverWait(driver, 10)
    # TODO: 'some-id' is a placeholder. Replace it with the id of an element
    # that exists on the article page; otherwise the wait gives up after the
    # 10 second limit.
    element = wait.until(EC.visibility_of_element_located((By.ID, 'some-id')))

    page_source = driver.page_source
finally:
    # Always release the browser, including when the wait above fails.
    driver.quit()

In [None]:
# Skeleton of a Chrome session built from explicit Service and options
# objects: start the browser, do the work, quit.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
service = Service()
driver = webdriver.Chrome(options=options, service=service)
# ... Automate something here
driver.quit()