#### PLOS

In [1]:
import os

path = "G:\\Dataset\\PLOS\\allofplos"
filenames = os.listdir(path)

print(len(filenames))

362279


##### 1.1 熟悉lxml库

https://lxml.de/tutorial.html

In [6]:
from lxml import etree

def prettyprint(element, **kwargs):
    xml = etree.tostring(element, pretty_print=True, **kwargs)
    print(xml.decode(), end="")

In [7]:
tree = etree.parse(os.path.join(path, filenames[0]))
root = tree.getroot()

##### 1.2 检查所有文件的文件名结构

In [30]:
from tqdm import tqdm

cnt = 1000

section_names = []
num_sections = []
error_structure_files = []

for idx, filename in enumerate(tqdm(filenames)):
    if idx > cnt:
        break
    tree = etree.parse(os.path.join(path, filename))
    root = tree.getroot()

    xml_structure = {}
    for child in root:
        xml_structure[child.tag] = child
    
    body = xml_structure['body']

    num_section = len(body)
    num_sections.append(num_section)
    if num_section > 0:
        for sec in body:
            try:
                title = sec[0]
                section_names.append(title.text)
            except:
                print("The structure of {} is unstandard".format(filename))
                error_structure_files.append(filename)
                break
    else:
        continue

  0%|          | 93/362279 [00:00<06:47, 887.72it/s]

The structure of journal.pbio.0000091.xml is unstandard
The structure of journal.pbio.0000092.xml is unstandard


  0%|          | 1001/362279 [00:09<55:06, 109.28it/s] 


In [36]:
section_name_dict = {}

for name in section_names:
    if name in section_name_dict:
        section_name_dict[name] += 1
    else:
        section_name_dict[name] = 1

section_name_dict = sorted(section_name_dict.items(), key=lambda x: x[1], reverse=True)
section_name_dict[:10]

[(None, 620),
 ('Introduction', 391),
 ('Materials and Methods', 389),
 ('Results', 327),
 ('Discussion', 325),
 ('Supporting Information', 315),
 ('Results/Discussion', 58),
 ('\n        ', 15),
 ('Conclusions', 9),
 ('Results and Discussion', 6)]

##### 1.3 抽取题录信息

In [25]:
# title, abstract, journal, doi

metadatas = []

for idx, filename in enumerate(tqdm(filenames)):
    if idx > 1000:
        break

    tree = etree.parse(os.path.join(path, filename))
    root = tree.getroot()

    metadata = {}

    dois = root.xpath("//article-id[@pub-id-type=\"doi\"]")
    for doi in dois:
        metadata['doi'] = doi.text

    journal_titles = root.xpath("//journal-meta//journal-title")
    metadata['journal'] = ''
    for title in journal_titles:
        metadata['journal'] = title.text

    paper_titles = root.xpath("//article-meta//article-title")
    metadata['title'] = []
    for title in paper_titles:
        metadata['title'].append(title.text)

    abstracts = root.xpath("//article-meta//abstract")
    metadata['abstract'] = []
    for abstract in abstracts:
        metadata['abstract'].append(abstract[0].text)

    references = []
    refer_list = root.xpath("//back//ref")
    for refer in refer_list:
        refer_structure = {}
        id = refer.attrib['id']
        refer_structure['id'] = id
        try:
            refer_structure['title'] = refer.xpath("//ref[@id=\"{}\"]//article-title".format(id))[0].text
        except:
            refer_structure['title'] = ''
        try:
            refer_structure['source'] = refer.xpath("//ref[@id=\"{}\"]//source".format(id))[0].text
        except:
            refer_structure['source'] = ''
        try:
            refer_structure['year'] = refer.xpath("//ref[@id=\"{}\"]//year".format(id))[0].text
        except:
            refer_structure['year'] = ''

        references.append(refer_structure)

    metadata['num_reference'] = len(references)
    metadata['reference'] = references
    
    metadatas.append(metadata)


  0%|          | 1001/362279 [00:10<1:04:28, 93.39it/s]


##### 1.4 抽取参考文献信息

In [8]:
references = []

refer_list = root.xpath("//back//ref")

for refer in refer_list:
    refer_structure = {}
    id = refer.attrib['id']
    refer_structure['id'] = id
    try:
        refer_structure['title'] = refer.xpath("//ref[@id=\"{}\"]//article-title".format(id))[0].text
    except:
        refer_structure['title'] = ''
    try:
        refer_structure['source'] = refer.xpath("//ref[@id=\"{}\"]//source".format(id))[0].text
    except:
        refer_structure['source'] = ''
    try:
        refer_structure['year'] = refer.xpath("//ref[@id=\"{}\"]//year".format(id))[0].text
    except:
        refer_structure['year'] = ''

    references.append(refer_structure)

##### 1.5 识别正文中的参考文献信息

In [9]:
ids = [reference['id'] for reference in references]

In [16]:
sections = []

body = root.xpath('//body')[0]
num_sections = len(body)

if num_sections > 0:
    for sec in body:
        try:
            title = sec[0]
            sec_name = title.text
            refer_lists = []
            p_list = sec.xpath(".//p")
            for p in p_list:
                sec_content = etree.tostring(p).decode('utf-8')
                for refer in ids:
                    if refer in sec_content:
                        refer_lists.append(refer)
            sections.append({
                'name': sec_name,
                'refer': refer_lists
            })
        except:
            print("The structure is unstandard")
            break

In [21]:
refers = []
for sec in sections:
    refers.extend(sec['refer'])

len(set(refers)) == len(ids)

True

##### 1.6 检查遗漏情况

In [2]:
filename_simple = [filename.split('.')[-3]+'.'+filename.split('.')[-2] for filename in filenames]
print(filename_simple[:2])
print(len(filename_simple))
print(len(set(filename_simple)))

['pbio.0000001', 'pbio.0000002']
362279
362279


In [3]:
filename_simple = [filename.split('.')[-2] for filename in filenames]
print(filename_simple[:2])
print(len(filename_simple))
print(len(set(filename_simple)))

['0000001', '0000002']
362279
317888


In [4]:
# 找出存在重合情况的数据，删除后重新爬取

name_counts = {}

for name in filename_simple:
    if name in name_counts:
        name_counts[name] += 1
    else:
        name_counts[name] = 1

name_counts = sorted(name_counts.items(), key=lambda x:x[1], reverse=True)
name_counts[:2]

[('0000002', 8), ('0000032', 8)]

In [5]:
import json
dup_names = {}

for item in name_counts:
    if item[1] == 1:
        break
    else:
        dup_names[item[0]] = item[1]

print(len(list(dup_names.keys())))

# dup_names = json.dumps(dup_names, indent=4)
# with open("duplicate_plos_ids.json", 'w') as f:
#     f.write(dup_names)

25119


In [None]:
# 删除重复的数据

names = list(dup_names.keys())

path = "../data/plos"

print(len(os.listdir(path)))

for name in names:
    os.remove(os.path.join(path, name+'.json'))

print(len(os.listdir(path)))

In [10]:
# 重命名非重复数据
path = "../data/plos"

old_names = filename_simple
new_names = [filename.split('.')[-3]+'.'+filename.split('.')[-2] for filename in filenames]

old_to_new = {old:new for old, new in zip(old_names, new_names)}

for name in old_names:
    old_file = name+".json"
    new_file = old_to_new[name]+".json"

    if os.path.exists(os.path.join(path, old_file)):
        os.rename(os.path.join(path, old_file), os.path.join(path, new_file))

##### 1.7 对数据进行过滤

In [1]:
# 基于参考文献的研究，肯定要将reference字段为空的数据去除掉  （section为空的可能也要去掉）
# 基于mesh词的研究，肯定要将mesh字段为空的数据去除掉

# 分析reference为空的和mesh为空的论文公共子集的大小
import os

files_1 = os.listdir("../../data/plos")
files_2 = os.listdir("../../data/plos_mesh/")

share_files = set(files_1) & set(files_2)
len(share_files)

317193

##### 2. 描述性统计

##### 2.1 统计每一年论文的数量

In [3]:
import json

with open('../../data/plos_pub_year.json', 'r') as f:
    pub_year = json.load(f)

dois = list(pub_year.keys())
pub_year[dois[0]]

{'ppub': '2003', 'epub': '2003'}

In [4]:
len(dois)

362277

In [8]:
with open('../Data/plos_refer_doi_to_name.json') as f:
    doi_to_name = json.load(f)

name_to_doi = {v:k for k, v in doi_to_name.items()}
len(doi_to_name.keys())

330965

In [15]:
year_to_cnt = {}
miss_year_dois = []

cnt = 0
for doi in dois:
    try:
        if doi_to_name[doi] in share_files:
            try:
                year = int(pub_year[doi]['ppub'])
            except:
                try:
                    year = int(pub_year[doi]['epub'])
                except:
                    miss_year_dois.append(year)
                    continue

            if year in year_to_cnt:
                year_to_cnt[year] += 1
            else:
                year_to_cnt[year] = 1
            cnt += 1
    except:
        pass

print(cnt)

317191


##### 2.2 统计出现频次最高的十种期刊

##### 2.3 统计出现频次最高的十种Mesh

##### 2.4 统计章节名情况，归一化篇章名

似乎不需要归一化篇章名，目前只要做到判断某几个参考文献是在同一章节中出现即可，无需确定章节名字