In [2]:
from lxml import etree

# Parse the descriptor XML and keep its root element; later cells iterate the
# root's children and hand each one to parse_desc as one descriptor record.
tree = etree.parse('../../data/mesh/desc2024/desc2024.xml')
root = tree.getroot()
# Bare last expression: the notebook displays the number of direct children.
len(root)

30764

In [3]:
def prettyprint(element, **kwargs):
    """Print ``element`` as indented XML without adding a trailing newline.

    Args:
        element: The element to serialize.
        **kwargs: Extra keyword arguments forwarded to ``etree.tostring``
            (for example ``encoding`` or ``xml_declaration``).
            ``pretty_print`` is always set to ``True`` here and must not be
            passed again.
    """
    xml = etree.tostring(element, pretty_print=True, **kwargs)
    # tostring() returns str when encoding="unicode" is requested and bytes
    # otherwise; only bytes need decoding, and they must be decoded with the
    # same encoding that was requested for serialization.
    if isinstance(xml, bytes):
        encoding = kwargs.get("encoding")
        xml = xml.decode(encoding if isinstance(encoding, str) else "utf-8")
    print(xml, end="")

def prettyparse(element):
    """Index the direct children of ``element`` by tag.

    Returns a dict mapping each child's ``tag`` to the child itself. When
    several children share a tag, the last one wins (the key keeps the
    position of its first occurrence).
    """
    return {child.tag: child for child in element}

##### 1.1 Test: parse a single descriptor

In [4]:
# Inspect the first child of the root: index its children by tag, keep its
# concept list for the next cell, and show its UI together with its name.
desc = prettyparse(root[0])
conceptlist = desc['ConceptList']
desc_ui, desc_text = desc['DescriptorUI'].text, desc['DescriptorName'][0].text

print("{} {}".format(desc_ui, desc_text))

D000001 Calcimycin


In [14]:
# Same inspection one level down: the first entry of the concept list. Its
# term list is kept for the next cell.
concept = prettyparse(conceptlist[0])
termlist = concept['TermList']
conc_ui, conc_text = concept['ConceptUI'].text, concept['ConceptName'][0].text

print("{} {}".format(conc_ui, conc_text))

M0000001 Calcimycin


In [15]:
# Index the children of the first term by tag and read the text of its
# 'String' child.
items = prettyparse(termlist[0])
term = items['String'].text
# Bare last expression so the notebook displays the value.
term

'Calcimycin'

##### 1.2 batch process

In [27]:
import json

def parse_desc(descriptor):
    """Flatten one descriptor element into a plain, JSON-serializable dict.

    Args:
        descriptor: An element whose children include ``DescriptorUI``,
            ``DescriptorName`` and ``ConceptList``, and optionally
            ``TreeNumberList``.

    Returns:
        A ``(mesh_dict, unique_ui)`` tuple. ``unique_ui`` is the text of
        ``DescriptorUI``. ``mesh_dict`` contains:

        * ``'desc_text'``: text of the first child of ``DescriptorName``.
        * ``'tree_number'``: list with the text of every child of
          ``TreeNumberList``; the key is omitted when the descriptor has no
          ``TreeNumberList`` child.
        * ``'concept'``: one dict per child of ``ConceptList`` with keys
          ``'conc_ui'``, ``'conc_text'`` and ``'term'`` (the ``String`` text
          of every entry in the concept's ``TermList``).

    Raises:
        KeyError: If a required child (``DescriptorUI``, ``DescriptorName``,
            ``ConceptList``, ``ConceptUI``, ``ConceptName``, ``TermList`` or
            ``String``) is missing.
    """
    # Descriptor-level fields.
    desc = prettyparse(descriptor)
    unique_ui = desc['DescriptorUI'].text
    mesh_dict = {'desc_text': desc['DescriptorName'][0].text}
    conceptlist = desc['ConceptList']

    # Tree numbers are optional. Test for presence explicitly instead of
    # swallowing every exception, and compare against None because an element
    # without children is falsy.
    treenumberlist = desc.get('TreeNumberList')
    if treenumberlist is not None:
        mesh_dict['tree_number'] = [
            treenumber.text for treenumber in treenumberlist
        ]

    # Concepts and, nested inside each concept, its terms.
    mesh_dict['concept'] = []
    for concept_element in conceptlist:
        concept = prettyparse(concept_element)
        mesh_dict['concept'].append({
            'conc_ui': concept['ConceptUI'].text,
            'conc_text': concept['ConceptName'][0].text,
            'term': [
                prettyparse(term)['String'].text
                for term in concept['TermList']
            ],
        })

    return mesh_dict, unique_ui


In [28]:
from tqdm import tqdm

# Parse every child of the root and index the results by descriptor UI.
meshs_tree = {}
for desc in tqdm(root):
    mesh_dict, unique_ui = parse_desc(desc)
    meshs_tree[unique_ui] = mesh_dict

# Stream the JSON straight to disk. meshs_tree deliberately stays a dict
# (it is not rebound to the serialized string), so re-running this cell or
# using the mapping afterwards keeps working.
with open("../data/mesh/desc_simple.json", "w", encoding="utf-8") as f:
    json.dump(meshs_tree, f, indent=4)

100%|██████████| 30764/30764 [00:00<00:00, 34679.26it/s]


##### 1.3 check data

In [2]:
import json

# Reload the simplified descriptor JSON from disk.
with open("../data/mesh/desc_simple.json", "r") as f:
    meshs_tree = json.load(f)

# Descriptor UIs in key order, their display names, and a name -> UI lookup.
desc_lists = list(meshs_tree)
desc_texts = [meshs_tree[ui]['desc_text'] for ui in desc_lists]
text_to_ui = dict(zip(desc_texts, desc_lists))

In [3]:
# Check how the descriptors relate to the mtrees file.

with open("../data/mesh/mtrees2024.bin", "r") as f:
    # Keep the part of each line that precedes the first ";".
    tree_node_names = [line.split(";")[0] for line in f]

# Total entries, distinct entries, and the number of descriptors to compare.
print(len(tree_node_names))
print(len(set(tree_node_names)))
print(len(desc_lists))

64457
30762
30764


In [5]:
"""
desc是根节点的名称，在根节点下第一层叶子节点为concept, 再往下到term
如果是仅匹配论文中的mesh词， 不考虑逻辑关系， 可以把各层次的词拿出来
"""

concept_texts = []
term_texts = []
for ui in desc_lists:
    concepts = meshs_tree[ui]['concept']
    for concept in concepts:
        concept_texts.append(concept['conc_text'])
        term_texts.extend(concept['term'])

In [6]:
# Total count followed by distinct count, first for the concept names and
# then for the terms.
for texts in (concept_texts, term_texts):
    print(len(texts))
    print(len(set(texts)))

61048
61048
263936
263931


##### 1.4 拿到PLOS正文，匹配MeSH词

In [6]:
import json

# Load the simplified descriptor JSON.
# NOTE(review): the batch cell writes "../data/mesh/desc_simple.json" while
# this cell reads "../../data/mesh/desc_simple.json" — confirm both resolve to
# the same file.
with open("../../data/mesh/desc_simple.json") as f:
    meshs_tree = json.load(f)

# Descriptor UIs, in key order.
desc_lists = list(meshs_tree)

In [26]:
# Flatten the descriptor -> concept -> term hierarchy.
#   term_dicts:    one record per term, carrying its concept and descriptor names
#   concept_texts: name of every concept
#   term_texts:    every term string
term_dicts = []
concept_texts = []
term_texts = []

for ui in desc_lists:
    desc_text = meshs_tree[ui]['desc_text']
    for concept in meshs_tree[ui]['concept']:
        concept_texts.append(concept['conc_text'])
        term_texts.extend(concept['term'])
        for term in concept['term']:
            # Build a fresh dict for every term. Re-using one dict per concept
            # would append the same object repeatedly, so every record of a
            # concept would end up showing that concept's last term.
            term_dicts.append({
                'term': {
                    'name': term,
                    'tree': {
                        'concept': concept['conc_text'],
                        'desc': desc_text
                    }
                }
            })

In [27]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding; the platform default encoding may be
# unable to represent them.
with open("../../data/mesh/desc_simple_transfer.json", "w", encoding="utf-8") as f:
    json.dump(term_dicts, f, ensure_ascii=False)

In [16]:
# De-duplicate: from here on these three names refer to sets rather than lists.
desc_lists, concept_texts, term_texts = (
    set(desc_lists),
    set(concept_texts),
    set(term_texts),
)

In [17]:
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs elsewhere.
path = "G:\\Dataset\\PLOS\\allofplos\\journal.pbio.0000001.xml"

# NOTE(review): this rebinds `tree` and `root`, which earlier cells use for the
# descriptor XML; re-running those cells after this one would see the article
# document instead.
tree = etree.parse(path)
root = tree.getroot()

In [20]:
# Every <p> element located anywhere under a <body> element.
p_list = root.xpath("//body//p")

# Characters trimmed from both ends of a token before the lookup, so that a
# word directly followed by e.g. a comma or a closing parenthesis still matches.
_EDGE_PUNCT = ".,;:!?()[]{}\"'"

matches = []
for p in p_list:
    # itertext() yields only the text nodes of the paragraph. Serializing the
    # element instead would glue tags and entity escapes onto the
    # neighbouring words and prevent them from matching.
    p_content = "".join(p.itertext())
    tokens = p_content.split()
    # Look up both the raw tokens and their punctuation-stripped forms.
    paper_words = set(tokens)
    paper_words.update(token.strip(_EDGE_PUNCT) for token in tokens)
    # Limitation: tokens never contain whitespace, so only terms without
    # spaces can be found this way.
    matches.extend(paper_words.intersection(term_texts))

##### 1.5 统计高频MeSH词