In [1]:
import os
import pandas as pd
import numpy as np

from copy import deepcopy

from itertools import combinations

In [2]:
# === Paths ===
# home_path is the parent directory of the current working directory;
# the data files used below are located relative to it.
current_path = os.getcwd()
home_path = os.path.split(current_path)[0]

In [3]:
# Raw input files, both located under <home_path>/data/0-raw_data/.
db_file = home_path + "/data/0-raw_data/AResDB.csv"
kg_file = home_path + "/data/0-raw_data/AResKG_1117.txt"

In [4]:
# Load the raw tables: db_file is read as comma-separated, kg_file as tab-separated.
# `kg` is filtered on its 'Subject' / 'Predicate' / 'Object' columns in the cells below.
database = pd.read_csv(db_file, sep=",")
kg = pd.read_csv(kg_file, sep="\t")

### Step 1: 梳理 top5 species - gene 之间对应的数量关系
____

In [5]:
# Species names of interest; each is matched against the KG 'Subject' column.
strains = [
    'Staphylococcus aureus',
    'Escherichia coli',
    'Pseudomonas aeruginosa',
    'Mycobacterium tuberculosis',
    'Klebsiella pneumoniae',
]

# strain name -> sorted list of its genes (populated by the loop that follows).
strain_gene_dict = dict()

In [6]:
# The 'has gene' filter does not depend on the strain, so apply it once
# instead of rescanning the whole KG on every iteration.
has_gene_edges = kg[kg['Predicate'] == 'has gene']

for strain in strains:
    # Literal substring match (regex=False): a species name must never be
    # interpreted as a regular expression (e.g. names containing '.', '(' or '+').
    # na=False makes rows with a missing Subject count as "no match".
    subject_matches = has_gene_edges['Subject'].str.contains(strain, na=False, regex=False)
    kg_filtered = has_gene_edges[subject_matches]

    # Distinct gene names (the 'Object' of each 'has gene' edge), sorted.
    strain_gene_dict[strain] = sorted(kg_filtered['Object'].unique())

In [7]:
# Report how many distinct genes were collected for each strain.
for strain in strain_gene_dict:
    genes = strain_gene_dict[strain]
    print(strain, len(genes))

Staphylococcus aureus 38
Escherichia coli 4607
Pseudomonas aeruginosa 52
Mycobacterium tuberculosis 4
Klebsiella pneumoniae 78


In [8]:
# Genes shared by each unordered pair of species, keyed as "A ∩ B".
gene_sets = {name: set(strain_gene_dict[name]) for name in strains}

pairwise_intersections = {}
for a, b in combinations(strains, 2):
    pairwise_intersections[f"{a} ∩ {b}"] = sorted(gene_sets[a] & gene_sets[b])

n = len(strains)

# Symmetric matrix of shared-gene counts. Only off-diagonal cells are written,
# so the diagonal keeps its initial value of 0.
intersection_matrix = pd.DataFrame(0, index=strains, columns=strains)

for a, b in combinations(strains, 2):
    inter = len(pairwise_intersections[f"{a} ∩ {b}"])
    intersection_matrix.loc[a, b] = inter
    intersection_matrix.loc[b, a] = inter

In [9]:
# Display the pairwise shared-gene count matrix (diagonal entries are left at 0).
intersection_matrix

Unnamed: 0,Staphylococcus aureus,Escherichia coli,Pseudomonas aeruginosa,Mycobacterium tuberculosis,Klebsiella pneumoniae
Staphylococcus aureus,0,5,1,3,0
Escherichia coli,5,0,10,3,45
Pseudomonas aeruginosa,1,10,0,1,10
Mycobacterium tuberculosis,3,3,1,0,0
Klebsiella pneumoniae,0,45,10,0,0


In [10]:
### Step 2: 提取 gene–gene 关系的数据，并与前面的 species → gene list 结合，得到每个物种内部的 gene–gene 关联网络，或两个物种之间共享基因的关联网络。

In [11]:
# Predicates treated as gene-gene relations in the KG.
gene_gene_preds = ['activates', 'no activates', 'regulates', 'no regulates']

# All gene-gene edges, taken as an independent copy of the matching KG rows.
is_gene_gene_edge = kg['Predicate'].isin(gene_gene_preds)
gene_edges = kg.loc[is_gene_gene_edge].copy()

# 1) Gene-gene edges associated with each strain
strain_gene_edges = {}      # strain -> gene-gene edges (DataFrame)
strain_edge_genes = {}      # strain -> set of every gene appearing in those edges
strain_ggi_gene_dict = {}   # strain -> base genes ∪ genes seen in GGI edges (deduplicated, sorted)

for strain, genes in strain_gene_dict.items():
    base_set = set(genes)

    # An edge is kept when at least one of its endpoints is a gene of this strain.
    subject_in_strain = gene_edges['Subject'].isin(base_set)
    object_in_strain = gene_edges['Object'].isin(base_set)
    df = gene_edges[subject_in_strain | object_in_strain].copy()
    strain_gene_edges[strain] = df

    # Every gene that occurs on either side of the kept edges.
    genes_in_edges = set(df['Subject']).union(df['Object'])
    strain_edge_genes[strain] = genes_in_edges

    # Base genes together with the genes reached through GGI edges.
    strain_ggi_gene_dict[strain] = sorted(base_set.union(genes_in_edges))

# Number of genes that take part in GGI edges, per strain.
print("-- ggi interaction genes (from edges only) --")
for strain in strain_edge_genes:
    print(strain, len(strain_edge_genes[strain]))

# Number of genes per strain once the GGI genes are merged into the base list.
print("-- after add the ggi interaction genes (base genes ∪ edges genes) --")
for strain in strain_ggi_gene_dict:
    print(strain, len(strain_ggi_gene_dict[strain]))


-- ggi interaction genes (from edges only) --
Staphylococcus aureus 27
Escherichia coli 3935
Pseudomonas aeruginosa 3
Mycobacterium tuberculosis 2
Klebsiella pneumoniae 0
-- after add the ggi interaction genes (base genes ∪ edges genes) --
Staphylococcus aureus 57
Escherichia coli 4723
Pseudomonas aeruginosa 53
Mycobacterium tuberculosis 5
Klebsiella pneumoniae 78


In [12]:
# Genes shared by each unordered pair of species, now using the GGI-extended
# gene lists; keyed as "A ∩ B".
ggi_gene_sets = {name: set(strain_ggi_gene_dict[name]) for name in strains}

pairwise_intersections = {}
for a, b in combinations(strains, 2):
    pairwise_intersections[f"{a} ∩ {b}"] = sorted(ggi_gene_sets[a] & ggi_gene_sets[b])

n = len(strains)

# Symmetric matrix of shared-gene counts. Only off-diagonal cells are written,
# so the diagonal keeps its initial value of 0.
intersection_matrix = pd.DataFrame(0, index=strains, columns=strains)

for a, b in combinations(strains, 2):
    inter = len(pairwise_intersections[f"{a} ∩ {b}"])
    intersection_matrix.loc[a, b] = inter
    intersection_matrix.loc[b, a] = inter

intersection_matrix

Unnamed: 0,Staphylococcus aureus,Escherichia coli,Pseudomonas aeruginosa,Mycobacterium tuberculosis,Klebsiella pneumoniae
Staphylococcus aureus,0,24,1,4,0
Escherichia coli,24,0,11,4,45
Pseudomonas aeruginosa,1,11,0,1,10
Mycobacterium tuberculosis,4,4,1,0,0
Klebsiella pneumoniae,0,45,10,0,0


In [13]:
### Step 3：根据每个菌株的 gene list，找出这些基因对应的 GO 注释（Molecular Function / Biological Process / Cellular Component），并按菌株保存成字典。

In [14]:
# KG predicate -> GO category assigned to edges carrying that predicate.
# NOTE(review): 'has' is a very generic predicate; confirm that in this KG it is
# only used for gene -> molecular-function edges.
go_preds = {
    'is part of': 'Cellular Component',
    'has': 'Molecular Function',
    'is involved in': 'Biological Process'
}

# KG rows whose predicate is one of the GO predicates, as an independent copy,
# with an extra 'GO_Type' column holding the mapped category.
gene_go_edges = kg.loc[kg['Predicate'].isin(list(go_preds))].copy()
gene_go_edges = gene_go_edges.assign(GO_Type=gene_go_edges['Predicate'].map(go_preds))

In [15]:
# For every strain, collect the GO annotations of its (GGI-extended) genes.
strain_go_dict = {}   # strain -> {GO category -> set of GO terms}

for strain, gene_list in strain_ggi_gene_dict.items():
    gset = set(gene_list)

    # GO edges whose Subject is one of this strain's genes.
    df = gene_go_edges[gene_go_edges['Subject'].isin(gset)]

    # One set of GO terms per category; a category with no rows gets an empty set.
    go_by_type = {}
    for go_type in go_preds.values():
        go_by_type[go_type] = set(df.loc[df['GO_Type'] == go_type, 'Object'])

    strain_go_dict[strain] = go_by_type


In [16]:
# Print the number of GO terms per category for each strain.
for strain in strain_go_dict:
    go_types = strain_go_dict[strain]
    print(f"-- {strain} --")
    for go_type in go_types:
        gos = go_types[go_type]
        print(f"{go_type}: {len(gos)} GO terms")

-- Staphylococcus aureus --
Cellular Component: 10 GO terms
Molecular Function: 52 GO terms
Biological Process: 35 GO terms
-- Escherichia coli --
Cellular Component: 161 GO terms
Molecular Function: 1805 GO terms
Biological Process: 1536 GO terms
-- Pseudomonas aeruginosa --
Cellular Component: 6 GO terms
Molecular Function: 19 GO terms
Biological Process: 13 GO terms
-- Mycobacterium tuberculosis --
Cellular Component: 7 GO terms
Molecular Function: 15 GO terms
Biological Process: 5 GO terms
-- Klebsiella pneumoniae --
Cellular Component: 4 GO terms
Molecular Function: 15 GO terms
Biological Process: 10 GO terms


In [17]:
## Pairwise GO-annotation overlap between strains
# All GO terms of each strain, with the three categories merged into one set.
strain_all_go = {
    strain: {term for terms in go_types_dict.values() for term in terms}
    for strain, go_types_dict in strain_go_dict.items()
}

strains = list(strain_go_dict)
n = len(strains)

# Symmetric matrix of shared GO-term counts; the diagonal is never written and stays 0.
go_intersection_matrix = pd.DataFrame(0, index=strains, columns=strains)

for a, b in combinations(strains, 2):
    inter = len(strain_all_go[a].intersection(strain_all_go[b]))
    go_intersection_matrix.loc[a, b] = inter
    go_intersection_matrix.loc[b, a] = inter

go_intersection_matrix


Unnamed: 0,Staphylococcus aureus,Escherichia coli,Pseudomonas aeruginosa,Mycobacterium tuberculosis,Klebsiella pneumoniae
Staphylococcus aureus,0,81,22,27,9
Escherichia coli,81,0,36,27,28
Pseudomonas aeruginosa,22,36,0,17,18
Mycobacterium tuberculosis,27,27,17,0,1
Klebsiella pneumoniae,9,28,18,1,0


In [18]:
# Category order follows the insertion order of go_preds:
# ['Cellular Component', 'Molecular Function', 'Biological Process']
go_types = list(go_preds.values())

# GO category -> symmetric matrix of shared GO-term counts between strains.
go_intersection_matrices = {}

for go_type in go_types:
    # Only off-diagonal cells are written below, so the diagonal stays 0.
    mat = pd.DataFrame(0, index=strains, columns=strains)

    for a, b in combinations(strains, 2):
        shared_terms = strain_go_dict[a][go_type] & strain_go_dict[b][go_type]
        inter = len(shared_terms)
        mat.loc[a, b] = inter
        mat.loc[b, a] = inter

    go_intersection_matrices[go_type] = mat


In [19]:
# Display the pairwise shared GO-term counts for the 'Molecular Function' category.
go_intersection_matrices['Molecular Function']

Unnamed: 0,Staphylococcus aureus,Escherichia coli,Pseudomonas aeruginosa,Mycobacterium tuberculosis,Klebsiella pneumoniae
Staphylococcus aureus,0,41,12,15,5
Escherichia coli,41,0,17,15,14
Pseudomonas aeruginosa,12,17,0,10,9
Mycobacterium tuberculosis,15,15,10,0,1
Klebsiella pneumoniae,5,14,9,1,0


In [20]:
# Display the pairwise shared GO-term counts for the 'Biological Process' category.
go_intersection_matrices['Biological Process']

Unnamed: 0,Staphylococcus aureus,Escherichia coli,Pseudomonas aeruginosa,Mycobacterium tuberculosis,Klebsiella pneumoniae
Staphylococcus aureus,0,30,5,5,2
Escherichia coli,30,0,13,5,10
Pseudomonas aeruginosa,5,13,0,4,6
Mycobacterium tuberculosis,5,5,4,0,0
Klebsiella pneumoniae,2,10,6,0,0


In [21]:
# Display the pairwise shared GO-term counts for the 'Cellular Component' category.
go_intersection_matrices['Cellular Component']

Unnamed: 0,Staphylococcus aureus,Escherichia coli,Pseudomonas aeruginosa,Mycobacterium tuberculosis,Klebsiella pneumoniae
Staphylococcus aureus,0,10,5,7,2
Escherichia coli,10,0,6,7,4
Pseudomonas aeruginosa,5,6,0,3,3
Mycobacterium tuberculosis,7,7,3,0,0
Klebsiella pneumoniae,2,4,3,0,0


In [22]:
### Step 4: 统计每个菌株中的所有基因对应的抗生素列表

In [23]:
# KG rows with the 'confers resistance to' predicate, as an independent copy.
gene_abx_edges = kg.loc[kg['Predicate'].eq('confers resistance to')].copy()

In [24]:
strain_antibiotic_dict = {}   # strain -> set of antibiotics (edge 'Object' values)
strain_gene_to_abx = {}       # strain -> {gene -> set of antibiotics}

for strain, gene_list in strain_ggi_gene_dict.items():
    gset = set(gene_list)

    # 'confers resistance to' edges whose Subject is one of this strain's genes.
    df = gene_abx_edges.loc[gene_abx_edges['Subject'].isin(gset)]

    # Every antibiotic reached from any of the strain's genes.
    abx_set = set(df['Object'])
    strain_antibiotic_dict[strain] = abx_set

    # Finer-grained view: the antibiotics attached to each individual gene.
    gene_to_abx = {
        gene: set(objects)
        for gene, objects in df.groupby('Subject')['Object']
    }
    strain_gene_to_abx[strain] = gene_to_abx


In [25]:
# Number of distinct antibiotics collected for each strain.
print("=== Antibiotics per strain ===")
for strain in strain_antibiotic_dict:
    abx_set = strain_antibiotic_dict[strain]
    print(f"{strain}: {len(abx_set)} antibiotics")

=== Antibiotics per strain ===
Staphylococcus aureus: 237 antibiotics
Escherichia coli: 299 antibiotics
Pseudomonas aeruginosa: 202 antibiotics
Mycobacterium tuberculosis: 110 antibiotics
Klebsiella pneumoniae: 165 antibiotics


In [26]:
# Pairwise antibiotic overlap between strains

strains = list(strain_antibiotic_dict)

# Symmetric matrix of shared-antibiotic counts; the diagonal is never written and stays 0.
abx_intersection_matrix = pd.DataFrame(0, index=strains, columns=strains)

for a, b in combinations(strains, 2):
    inter = len(strain_antibiotic_dict[a].intersection(strain_antibiotic_dict[b]))
    abx_intersection_matrix.loc[a, b] = inter
    abx_intersection_matrix.loc[b, a] = inter

abx_intersection_matrix

Unnamed: 0,Staphylococcus aureus,Escherichia coli,Pseudomonas aeruginosa,Mycobacterium tuberculosis,Klebsiella pneumoniae
Staphylococcus aureus,0,189,110,109,76
Escherichia coli,189,0,193,109,160
Pseudomonas aeruginosa,110,193,0,75,142
Mycobacterium tuberculosis,109,109,75,0,41
Klebsiella pneumoniae,76,160,142,41,0
