In [1]:
import os
from itertools import combinations

import networkx as nx
import numpy as np
import pandas as pd

对所有原始大表进行如下处理：
- 选用PMID 和qualifier两列
- 删去所有空行
- 使用;拆分一行为多行，使得一行仅有唯一的一个qualifier
- 使用:拆分一列为多列，使得是否为主要主题、词ID、词名称分开

Process every raw table as follows:
- keep only the `PMID` and `QualifierName` columns
- drop rows with missing values
- split on `;` and explode, so that each row holds exactly one qualifier
- split on `:` into separate columns: major-topic flag, qualifier ID, and qualifier name

In [2]:
# Load only the two columns we need, drop rows with a missing value, then turn each
# ";"-separated qualifier list into one row per qualifier.
df = (
    pd.read_csv("../PKG23_A06_MeshHeadingList.csv", usecols=["PMID", "QualifierName"])
    .dropna()
    .assign(QualifierName=lambda frame: frame["QualifierName"].str.split(";"))
    .explode("QualifierName")
)

# Each exploded entry is a ":"-separated triple; spread it over three columns,
# overwriting QualifierName with the bare name (the last field).
qualifier_fields = df["QualifierName"].str.split(":", expand=True)
df[["isMajorTopic", "QualifierID", "QualifierName"]] = qualifier_fields
df


Unnamed: 0,PMID,QualifierName,isMajorTopic,QualifierID
0,1,metabolism,N,Q000378
2,1,analysis,Y,Q000032
3,1,blood,N,Q000097
4,1,blood,N,Q000097
4,1,poisoning,Y,Q000506
...,...,...,...,...
314390834,36472364,genetics,N,Q000235
314390840,36472365,diagnosis,N,Q000175
314390843,36472366,diagnosis,N,Q000175
314390845,36472366,diagnosis,N,Q000175


根据原始大表获取comment表，进行如下处理
- 读取所有被评论文章的PMID
- 根据PMID进行左连接合并

Build the table for commented papers from the full table:
- read the PMIDs of all commented papers
- left-join them to the full table on PMID

In [3]:
# PMIDs of the commented papers (column "PMID_art").
commented = pd.read_csv("../2_PubMed_comted_pmid_220916.csv")

# Left join: every commented PMID is kept, with its qualifier rows attached where
# the full table has any.
df_commented = commented.merge(df, how="left", left_on="PMID_art", right_on="PMID")
df_commented

Unnamed: 0,PMID_art,PMID,QualifierName,isMajorTopic,QualifierID
0,30543192,30543192.0,diagnosis,Y,Q000175
1,30543192,30543192.0,pathology,N,Q000473
2,30543192,30543192.0,methods,N,Q000379
3,24051115,24051115.0,physiopathology,Y,Q000503
4,24051115,24051115.0,physiopathology,Y,Q000503
...,...,...,...,...,...
5349065,10823302,10823302.0,therapeutic use,N,Q000627
5349066,10823302,10823302.0,diagnosis,Y,Q000175
5349067,10823302,10823302.0,drug therapy,N,Q000188
5349068,10823302,10823302.0,genetics,N,Q000235


构建原始网络的函数
- 空网络
- 根据词ID的Unique添加节点，以ID为键，添加属性名称
- 根据df groupby PMID后的结果，添加边
- 保存为gml

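Construct the network:
- start from an empty graph
- create one node per unique qualifier (note: the code below uses `QualifierName` as the node key rather than `QualifierID`, and sets no node attributes)
- add weighted edges between qualifiers that occur in the same paper (rows grouped by PMID)
- save the graph as a GML file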

In [7]:
def generate_gml(g_name, df: "pd.DataFrame", dedupe: bool = True) -> "nx.Graph":
    """Build a weighted qualifier co-occurrence graph and write it to a GML file.

    Nodes are the unique values of ``df["QualifierName"]``. Two qualifiers are
    linked when they appear under the same ``PMID``; the edge attribute
    ``weight`` counts how often that happens.

    Parameters
    ----------
    g_name : str
        Base name of the output file; the graph is written to ``../{g_name}.gml``
        (relative to the current working directory).
    df : pandas.DataFrame
        Table with at least the columns ``PMID`` and ``QualifierName``, one
        qualifier per row.
    dedupe : bool, default True
        If True, a qualifier listed several times for the same PMID is counted
        once, so ``weight`` is the number of papers in which both qualifiers
        occur and no self-loops are created. Pass False to count every pair of
        rows (the previous behaviour, which yields self-loops and inflated
        weights when a paper repeats a qualifier).

    Returns
    -------
    networkx.Graph
        The undirected, weighted co-occurrence graph that was written to disk.
    """
    # Rows lacking a PMID or a qualifier (e.g. unmatched rows of a left join) carry
    # no co-occurrence information and would otherwise show up as a NaN node.
    pairs = df[["PMID", "QualifierName"]].dropna()
    if dedupe:
        # keep the first occurrence of each (PMID, qualifier) pair; row order is preserved
        pairs = pairs.drop_duplicates()

    G = nx.Graph()
    G.add_nodes_from(pairs["QualifierName"].unique())

    for _, names in pairs.groupby("PMID")["QualifierName"]:
        # combinations() yields nothing for single-qualifier papers, so they add no edges
        for first, second in combinations(names.tolist(), 2):
            if G.has_edge(first, second):
                G[first][second]["weight"] += 1
            else:
                G.add_edge(first, second, weight=1)

    nx.write_gml(G, f"../{g_name}.gml")
    return G

In [8]:
# Keep only qualifiers flagged as a major topic ("Y"), then build one graph for the
# full corpus and one for the commented papers.
G = generate_gml(
    "all_qualifier_major",
    df.loc[df["isMajorTopic"].eq("Y")],
)
G_commented = generate_gml(
    "commented_qualifier_major",
    df_commented.loc[df_commented["isMajorTopic"].eq("Y")],
)