In [1]:
import os
import pandas as pd
import seaborn as sns
import networkx as nx

os.chdir("/Users/blue/code/graph_permutation")

* Oncogene（癌基因）
Oncogene 是一種在某些狀況下可能引起細胞癌變（轉變成癌症細胞）的基因。它們通常參與細胞生長、分裂和生存的調控。突變或異常活化的 oncogene 可能導致細胞不受控制地增殖，促使腫瘤的形成。  
TSG（腫瘤抑制基因）
Tumor Suppressor Genes 是一類抑制腫瘤形成的基因。它們通常參與細胞生命週期的調控，並在發現 DNA 損傷時阻止細胞進行不受控制的分裂。突變或失活的 TSG 可能導致細胞失去對癌症的抵抗力。  
Fusion Gene（基因融合）
基因融合是指兩個不同的基因的 DNA 片段在染色體層面上的結合。這種結合可能導致新的基因形成，其功能和調控可能與原始的基因有所不同。在某些情況下，基因融合被認為與癌症的發展有關，因為它可能改變細胞的正常生物學特性。

In [8]:
PPI_network_path = "raw_data/9606.protein.physical.links.v12.0.txt"
cencer_path = "raw_data/Census_allTue Oct 24 07_30_47 2023 - Census_allTue Oct 24 07_30_47 2023.csv"
mart_path = "raw_data/mart_export.txt"

In [19]:
mart_df = pd.read_csv(mart_path, sep="\t")
mart_df = mart_df.loc[:, ["Gene stable ID", "Protein stable ID"]]
ENSP_to_ENSG = mart_df.dropna(axis=0).set_index("Protein stable ID")
ENSG_to_ENSP = mart_df.dropna(axis=0).set_index("Gene stable ID")


Unnamed: 0,Gene stable ID,Protein stable ID
0,ENSG00000210049,
1,ENSG00000211459,
2,ENSG00000210077,
3,ENSG00000210082,
4,ENSG00000209082,
...,...,...
276945,ENSG00000116786,ENSP00000364956
276946,ENSG00000116786,ENSP00000364950
276947,ENSG00000116786,
276948,ENSG00000116786,ENSP00000494591


In [78]:
cancer_df = pd.read_csv(cencer_path, sep=",")
cancer_df.loc[:, "ENSG"] = cancer_df.Synonyms.str.extract(r'(ENSG\d+)')

result_df = pd.DataFrame()
result_df['ENSG'] = cancer_df['ENSG']
result_df['is_oncogene'] = cancer_df["Role in Cancer"].str.contains('oncogene', case=False, na=False)
result_df['is_fusion'] = cancer_df["Role in Cancer"].str.contains('fusion', case=False, na=False)
result_df['is_TSG'] = cancer_df["Role in Cancer"].str.contains('TSG', case=False, na=False)

merged_df = pd.merge(result_df, mart_df, left_on="ENSG", right_on="Gene stable ID", how='inner')
merged_df = merged_df.drop_duplicates(subset=["Protein stable ID"])
merged_df = merged_df.dropna(subset=["Protein stable ID"])
merged_df = merged_df.set_index("Protein stable ID")
merged_df = merged_df.drop(["ENSG", "Gene stable ID"], axis=1)
merged_df.index.name = "ENSP"
merged_df["is_cancer_protein"] = True
merged_df.to_csv("data/cancer_protein.csv")



In [92]:
PPI_df = pd.read_csv(PPI_network_path, sep=" ")
PPI_df.columns = ["src", "dst", "combined_score"]
PPI_df.src = PPI_df.src.str.replace("9606.", "", regex=False).str.strip()
PPI_df.dst = PPI_df.dst.str.replace("9606.", "", regex=False).str.strip()
# PPI_df = PPI_df[PPI_df.src.isin(ENSP_to_ENSG.index) & PPI_df.dst.isin(ENSP_to_ENSG.index)]
PPI_df = PPI_df.drop("combined_score", axis=1)
# cancer_PPI_df = PPI_df[PPI_df.src.isin(merged_df.index) & PPI_df.dst.isin(merged_df.index)]
PPI_df.to_csv("data/edges.csv")

In [96]:
node_list = list(set(list(PPI_df.src) + list(PPI_df.dst)))
node_df = pd.DataFrame(index=node_list)
node_df = pd.merge(node_df, merged_df, left_index=True, right_index=True, how="left")
node_df = node_df.fillna(False)
node_df.index.name = "node_name"
node_df.to_csv("data/nodes.csv")

In [95]:
node_df


Unnamed: 0,is_oncogene,is_fusion,is_TSG,is_cancer_protein
ENSP00000258457,False,False,False,False
ENSP00000437121,False,False,False,False
ENSP00000367229,False,False,False,False
ENSP00000353770,False,False,False,False
ENSP00000222256,False,False,False,False
...,...,...,...,...
ENSP00000262105,False,False,False,False
ENSP00000355900,False,False,False,False
ENSP00000308622,False,False,False,False
ENSP00000471593,False,False,False,False
