# GO has thier own annotations 

These include not only proteins and Genes, but complexes and microRNA

In [1]:
import pandas as pd
from pathlib import Path
from data_tools.files import head 
from data_tools import graphs as gt
from data_tools.df_processing import expand_col_on_char, combine_group_cols_on_char, regularize_colnames

load_dir = Path('../2_pipeline/00_download_data/out/').resolve()
network_dir = Path('../2_pipeline/04_InterPro_and_Ensembl/out/').resolve()

  from tqdm.autonotebook import tqdm


In [2]:
all_nodes = gt.remove_colons(pd.read_csv(network_dir.joinpath('nodes.csv'), dtype=str))
edges = gt.remove_colons(pd.read_csv(network_dir.joinpath('edges.csv'), dtype=str))

In [3]:
go_ids = set(all_nodes[all_nodes['id'].str.startswith('GO:')]['id'])

In [4]:
new_nodes = []
new_edges = []

In [5]:
def determine_evidence(code):                                                                                           
    """                                                                                                                 
    Computationaly derived codes are taken from GO's webiste:                                                           
    http://geneontology.org/docs/guide-go-evidence-codes/                                                               
                                                                                                                        
    :param code: str, 3 (or 2) letter code for annotation evidence.                                                     
    :retrun: str, 'curated' or 'computed' status of the code.                                                           
    """                                                                                                                 
    comp_codes = ['ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'RCA', 'IEA']                                                      
    return 'computed' if code.upper() in comp_codes else 'curated'

## Start loading .gaf files.

In [6]:
go_cols = ['db', 'db_object_id', 'db_object_symbol','qualifier', 'go_id', 'db_reference', 'evidence_code',
 'with_or_from', 'aspect', 'db_object_name', 'db_object_synonym', 'db_object_type', 'taxon', 'date', 'assigned_by',
 'annotation_extension', 'gene_product_form_id']

In [7]:
go_prot = pd.read_csv(load_dir.joinpath('goa_human.gaf.gz'), sep='\t', header=None, 
                      names=go_cols, comment='!', dtype=str)
go_prot.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0003924,GO_REF:0000002,IEA,InterPro:IPR001770,F,Guanine nucleotide-binding protein subunit gamma,DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20190504,InterPro,,
1,UniProtKB,A0A024RBG1,NUDT4B,,GO:0003723,GO_REF:0000037,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20190504,UniProt,,


In [8]:
go_prot['db'].value_counts()

UniProtKB    479438
Name: db, dtype: int64

In [9]:
go_complex = pd.read_csv(load_dir.joinpath('goa_human_complex.gaf.gz'), sep='\t', header=None, 
                      names=go_cols, dtype=str, comment='!')
go_complex.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,ComplexPortal,CPX-1012,tenascin-w_human,,GO:0030155,PMID:17909022,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
1,ComplexPortal,CPX-1012,tenascin-w_human,,GO:0030334,PMID:17909022,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,


In [10]:
go_complex.query('aspect == "C"').head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
2,ComplexPortal,CPX-1012,tenascin-w_human,,GO:0062023,PMID:19884327,IDA,,C,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
13,ComplexPortal,CPX-1032,snurportin_human,,GO:0005654,PMID:9670026,IDA,,C,"Importin complex, Snurportin variant","Snurportin complex|Importin complex, SNUPN var...",protein_complex,taxon:9606,20170608,ComplexPortal,,


In [11]:
go_complex['db'].value_counts()

ComplexPortal    1470
Name: db, dtype: int64

In [12]:
go_isoform = pd.read_csv(load_dir.joinpath('goa_human_isoform.gaf.gz'), sep='\t', header=None, 
                      names=go_cols, dtype=str, comment='!')
go_isoform.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,UniProtKB,A0A087WTH5,KCNE1B,,GO:0005249,GO_REF:0000002,IEA,InterPro:IPR000369|InterPro:IPR005424,F,Potassium voltage-gated channel subfamily E me...,KCNE1B,protein,taxon:9606,20190504,InterPro,,UniProtKB:A0A087WU88
1,UniProtKB,A0A087WTH5,KCNE1B,,GO:0005249,GO_REF:0000002,IEA,InterPro:IPR000369|InterPro:IPR005424,F,Potassium voltage-gated channel subfamily E me...,KCNE1B,protein,taxon:9606,20190504,InterPro,,UniProtKB:A0A087WWU3


In [13]:
go_isoform['db'].value_counts()

UniProtKB    101596
Name: db, dtype: int64

In [14]:
go_rna = pd.read_csv(load_dir.joinpath('goa_human_rna.gaf.gz'), sep='\t', header=None, 
                      names=go_cols, dtype=str, comment='!')
go_rna.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,RNAcentral,URS0000001346_9606,URS0000001346_9606,,GO:0006412,GO_REF:0000108,IEA,GO:0030533,P,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,GOC,,
1,RNAcentral,URS0000001346_9606,URS0000001346_9606,,GO:0030533,GO_REF:0000115,IEA,Rfam:RF00005,F,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,RNAcentral,,


In [15]:
go_rna['db'].value_counts()

RNAcentral    43498
Name: db, dtype: int64

## Add GO to protein annotiaions....

Will also be able to add complex and rna info

In [16]:
type_conversion = {'P': 'involved_in',
                   'F': 'enables',
                   'C': 'part_of'}

edges_go_prot = go_prot.rename(columns={'db_object_symbol': 'start_id', 'go_id': 'end_id'})
edges_go_prot['type'] = edges_go_prot['aspect'].map(type_conversion)

In [17]:
def fix_col_order_edge(df):
    cols = ['start_id', 'end_id', 'type']
    cols = cols + [c for c in df.columns if c not in cols]
    return cols


In [18]:
edges_go_prot = edges_go_prot[fix_col_order_edge(edges_go_prot)]
edges_go_prot.head(2)

Unnamed: 0,start_id,end_id,type,db,db_object_id,qualifier,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,DNAJC25-GNG10,GO:0003924,enables,UniProtKB,A0A024R161,,GO_REF:0000002,IEA,InterPro:IPR001770,F,Guanine nucleotide-binding protein subunit gamma,DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20190504,InterPro,,
1,NUDT4B,GO:0003723,enables,UniProtKB,A0A024RBG1,,GO_REF:0000037,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20190504,UniProt,,


Remove relations that have qualifiers that contain NOT

In [19]:
from data_tools.wiki import get_curi_xrefs
id_symbol = get_curi_xrefs(all_nodes, 'SYM')

id_symbol.head(2)

Unnamed: 0,id,xrefs
524731,NCBIGene:1,SYM:A1BG
524739,NCBIGene:10,SYM:NAT2


In [20]:
id_symbol['xrefs'] = id_symbol['xrefs'].apply(lambda s: s.split(':')[-1])

In [21]:
edges_go_prot = edges_go_prot[~edges_go_prot['qualifier'].str.contains('NOT').fillna(False)]

In [22]:
edges_go_prot = (edges_go_prot.merge(id_symbol, how='left', left_on='start_id', right_on='xrefs')
             .drop(['start_id', 'xrefs'], axis=1).rename(columns={'id': 'start_id'}))

In [23]:
print('GO Gene annotations')
print('Total:   {:10,}'.format(len(edges_go_prot)))
print('Mapped:  {:10,}'.format(edges_go_prot.query('end_id in @go_ids')['start_id'].count()))
print('Unmapped:{:10,}'.format(len(edges_go_prot) - edges_go_prot.query('end_id in @go_ids')['start_id'].count()))

GO Gene annotations
Total:      478,227
Mapped:     469,786
Unmapped:     8,441


In [24]:
edges_go_prot['db_object_id'].nunique()

19834

In [25]:
edges_go_prot['start_id'].nunique()

18974

There are fewer Gene IDs than Protein IDs, so there may be 1 to many relationships going on...

In [26]:
print('Number of UniProt IDs that map to more than one Gene ID: {:,}'.format(
    (edges_go_prot[['start_id', 'db_object_id']].groupby('db_object_id')['start_id'].nunique() > 1).sum()))
print('Number of Gene IDs that map to more than 1 UniProt ID: {:,}'.format(
    (edges_go_prot[['start_id', 'db_object_id']].groupby('start_id')['db_object_id'].nunique() > 1).sum()))

Number of UniProt IDs that map to more than one Gene ID: 0
Number of Gene IDs that map to more than 1 UniProt ID: 50


51 Genes that map to multiple Uniprots... it should be fine to map them to all the edges for all the correspoding uniprots

In [27]:
edges_go_prot.head(2)

Unnamed: 0,end_id,type,db,db_object_id,qualifier,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id,start_id
0,GO:0003924,enables,UniProtKB,A0A024R161,,GO_REF:0000002,IEA,InterPro:IPR001770,F,Guanine nucleotide-binding protein subunit gamma,DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20190504,InterPro,,,NCBIGene:552891
1,GO:0003723,enables,UniProtKB,A0A024RBG1,,GO_REF:0000037,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20190504,UniProt,,,


In [28]:
edges_go_prot['source'] = 'Gene Ontology'
edges_go_prot['license'] = 'CC-BY 4.0'

edges_go_prot['dsrc_type'] = edges_go_prot['evidence_code'].apply(determine_evidence)

comp_idx = edges_go_prot.query('dsrc_type == "computed"').index
edges_go_prot.loc[comp_idx, 'comp_type'] = edges_go_prot.loc[comp_idx, 'evidence_code']

In [29]:
edges_go_prot['dsrc_type'].value_counts()

curated     373997
computed    104230
Name: dsrc_type, dtype: int64

In [30]:
# Add all mapped edges
new_edges.append(edges_go_prot.dropna(subset=['start_id', 'end_id']))

### Complex to GO

In [31]:
edges_go_cpx = go_complex.rename(columns={'db_object_id': 'start_id', 'go_id': 'end_id'})
edges_go_cpx['type'] = edges_go_cpx['aspect'].map(type_conversion)
edges_go_cpx = edges_go_cpx[fix_col_order_edge(edges_go_cpx)]
edges_go_cpx.head()

Unnamed: 0,start_id,end_id,type,db,db_object_symbol,qualifier,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,CPX-1012,GO:0030155,involved_in,ComplexPortal,tenascin-w_human,,PMID:17909022,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
1,CPX-1012,GO:0030334,involved_in,ComplexPortal,tenascin-w_human,,PMID:17909022,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
2,CPX-1012,GO:0062023,part_of,ComplexPortal,tenascin-w_human,,PMID:19884327,IDA,,C,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
3,CPX-1012,GO:1903672,involved_in,ComplexPortal,tenascin-w_human,,PMID:19884327,IDA,,P,Tenascin-W complex,TNN hexamer|Tenascin W complex,protein_complex,taxon:9606,20170313,ComplexPortal,,
4,CPX-1014,GO:0007160,involved_in,ComplexPortal,tenascin-x_human,,GO_REF:0000108,IEA,GO:0098639,P,Tenascin-X complex,TNXB trimer|Tenascin X complex|TNX complex|TN-...,protein_complex,taxon:9606,20190504,GOC,,


In [32]:
# Add a CURI for complex portal
edges_go_cpx['start_id'] = 'CPX:'+edges_go_cpx['start_id']

In [33]:
edges_go_cpx = edges_go_cpx[~edges_go_cpx['qualifier'].str.contains('NOT').fillna(False)]

In [34]:
edges_go_cpx['source'] = 'Gene Ontology'
edges_go_cpx['license'] = 'CC-BY 4.0'

edges_go_cpx['dsrc_type'] = edges_go_cpx['evidence_code'].apply(determine_evidence)

comp_idx = edges_go_cpx.query('dsrc_type == "computed"').index
edges_go_cpx.loc[comp_idx, 'comp_type'] = edges_go_cpx.loc[comp_idx, 'evidence_code']

In [35]:
# Add all mapped edges
new_edges.append(edges_go_cpx.dropna(subset=['start_id', 'end_id']))

#### Need more to anchor complexes into the network... Gene to Complex, and Complex Identifiers...

In [36]:
cplx = pd.read_csv(load_dir.joinpath('homo_sapiens.tsv'), sep='\t')
cplx.columns = regularize_colnames(cplx.columns)
cplx.head(2)

Unnamed: 0,complex_ac,recommended_name,aliases_for_complex,taxonomy_identifier,identifiers_and_stoichiometry_of_molecules_in_complex,confidence,experimental_evidence,go_annotations,cross_references,description,complex_properties,complex_assembly,ligand,disease,agonist,antagonist,comment,source
0,CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0)|Q68CP9(0)|Q8WUB8(0)|P60709(0)|O94805...,ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"
1,CPX-1201,Neural progenitor-specific SWI/SNF ATP-depende...,neural progenitor-specific BAF ATP-dependent c...,9606,P51531(0)|O14497(0)|Q969G3(0)|Q6STE5(0)|Q8WUB8...,ECO:0005547(biological system reconstruction e...,-,GO:2000045(regulation of G1/S transition of mi...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"


In [37]:
# Add curi
cplx['complex_ac'] = 'CPX:' + cplx['complex_ac']

Well, these complexes are nice, but we already have complexs in Reactome space.  We might be able to map them though. Luckily this file contains cross-references

In [38]:
xref_cplx = expand_col_on_char(cplx, 'cross_references', '|')
xref_cplx.head(2)

Unnamed: 0,complex_ac,recommended_name,aliases_for_complex,taxonomy_identifier,identifiers_and_stoichiometry_of_molecules_in_complex,confidence,experimental_evidence,go_annotations,cross_references,description,complex_properties,complex_assembly,ligand,disease,agonist,antagonist,comment,source
0,CPX:CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0)|Q68CP9(0)|Q8WUB8(0)|P60709(0)|O94805...,ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also),An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"
1,CPX:CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0)|Q68CP9(0)|Q8WUB8(0)|P60709(0)|O94805...,ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:18809673(see-also),An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"


In [39]:
react_cplx = xref_cplx[xref_cplx['cross_references'].str.startswith('reactome')].copy()
react_cplx[['complex_ac', 'recommended_name', 'cross_references']].head(2)

Unnamed: 0,complex_ac,recommended_name,cross_references
99,CPX:CPX-56,GLI1-SUFU complex,reactome:R-HSA-5610605(identity)
100,CPX:CPX-56,GLI1-SUFU complex,reactome:R-HSA-5610531(identity)


In [40]:
len(react_cplx), react_cplx['cross_references'].str.contains('identity').sum()

(389, 386)

In [41]:
react_cplx[~react_cplx['cross_references'].str.contains('identity')][['complex_ac', 'recommended_name', 'cross_references']]

Unnamed: 0,complex_ac,recommended_name,cross_references
370,CPX:CPX-2108,epsilon DNA polymerase complex,reactome:R-HSA-68483(subset)
2223,CPX:CPX-382,Interleukin-12-receptor complex,reactome:R-HSA-8854487(subset)
2903,CPX:CPX-383,Interleukin-23-receptor complex,reactome:R-HSA-447186(subset)


In [42]:
react_cplx['xref'] = (react_cplx['cross_references']
                         .str.replace('reactome:', 'REACT:', regex=False)
                         .str.replace('(identity)', '', regex=False)
                         .str.replace('(subset)', '', regex=False)
                         .apply(lambda s: s.split('.')[0]))

In [43]:
react_cplx['xref'].nunique(), react_cplx['complex_ac'].nunique()

(364, 272)

In [44]:
react_cplx_map = react_cplx.loc[:, ['complex_ac', 'recommended_name', 'xref']]
react_cplx_map.head(2)

Unnamed: 0,complex_ac,recommended_name,xref
99,CPX:CPX-56,GLI1-SUFU complex,REACT:R-HSA-5610605
100,CPX:CPX-56,GLI1-SUFU complex,REACT:R-HSA-5610531


We will add new nodes not mapable to reactome, then later map edge identifiers to Reactome when possible

We must also add the CPX xrefs to the reactome nodes in the nw

In [45]:
mappable_cplxes = react_cplx_map['complex_ac'].unique()

In [46]:
reactome_complex_ids = react_cplx_map['xref'].unique()

# Query for reactome complexes that are corss-referencable
qr = all_nodes.query('id in @reactome_complex_ids')
# need to be able to map back to the correct index later
qr_idx_map = qr.reset_index().set_index('id')['index'].to_dict()
qr_idx = qr.index

# Merge so none are lost.
mrg_res = qr.merge(react_cplx_map, how='left', left_on='id', right_on='xref')
# Now collapse the one to many relationships on reactome id
comb_res = combine_group_cols_on_char(mrg_res, ['id'], ['complex_ac'], sort=True, prog=False)

In [47]:
# Map back to the original index
comb_res['idx'] = comb_res['id'].map(qr_idx_map)
# Set the values
all_nodes.loc[comb_res['idx'], 'xrefs'] = comb_res['complex_ac'].tolist()

all_nodes.loc[comb_res['idx']].head(2)

Unnamed: 0,id,name,label,xrefs
40124,REACT:R-HSA-109629,TFIIA,Complex,CPX:CPX-519
40126,REACT:R-HSA-109631,TFIIF,Complex,CPX:CPX-79


Now that we've added the X-refs for those nodes that are mappable, lets convert those that are not mappable to new nodes

In [48]:
cpx_nodes = cplx.rename(columns={'complex_ac': 'id', 'recommended_name': 'name'})
cpx_nodes = cpx_nodes.query('id not in @mappable_cplxes')

cpx_nodes['label'] = 'Complex'
cpx_nodes[['id', 'name', 'label']].head()

Unnamed: 0,id,name,label
0,CPX:CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Complex
1,CPX:CPX-1201,Neural progenitor-specific SWI/SNF ATP-depende...,Complex
2,CPX:CPX-1194,Muscle cell-specific SWI/SNF ATP-dependent chr...,Complex
3,CPX:CPX-1282,Laminin211-nidogen complex,Complex
4,CPX:CPX-1285,Laminin221-nidogen complex,Complex


In [49]:
new_nodes.append(cpx_nodes[['id', 'name', 'label']])

In [50]:
cpx_exp = expand_col_on_char(cplx, 'identifiers_and_stoichiometry_of_molecules_in_complex', '|')
cpx_exp.head(2)

Unnamed: 0,complex_ac,recommended_name,aliases_for_complex,taxonomy_identifier,identifiers_and_stoichiometry_of_molecules_in_complex,confidence,experimental_evidence,go_annotations,cross_references,description,complex_properties,complex_assembly,ligand,disease,agonist,antagonist,comment,source
0,CPX:CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q86U86(0),ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"
1,CPX:CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Polybromo-associated SWI/SNF ATP-dependent chr...,9606,Q68CP9(0),ECO:0005547(biological system reconstruction e...,-,GO:0016363(nuclear matrix)|GO:2000045(regulati...,pubmed:11790558(see-also)|pubmed:18809673(see-...,An ATP-dependent chromatin remodeling complex ...,MW approximately 2 MDa. May contain 2 instance...,-,-,Coffin-Siris syndrome (CSS) [Orphanet:1465]: a...,-,-,-,"psi-mi:""MI:0469""(IntAct)"


In [51]:
cpx_exp['uniprot_id'] = cpx_exp['identifiers_and_stoichiometry_of_molecules_in_complex'].str.split('(', expand=True)[0]

In [52]:
gene_to_uniprot = get_curi_xrefs(all_nodes, 'UniProt')
gene_to_uniprot.head(2)

Unnamed: 0,id,xrefs
525048,NCBIGene:1,UniProt:P04217
525056,NCBIGene:10,UniProt:P11245


In [53]:
gene_to_uniprot['xrefs'] = gene_to_uniprot['xrefs'].apply(lambda s: s.split(':')[-1])

In [54]:
edges_cpx_gene = cpx_exp[['complex_ac', 'uniprot_id']].rename(columns={'complex_ac': 'end_id'})

edges_cpx_gene = (edges_cpx_gene.merge(gene_to_uniprot, left_on='uniprot_id', right_on='xrefs', how='left')
                                .drop(['xrefs', 'uniprot_id'], axis=1).rename(columns={'id': 'start_id'}))

In [55]:
# Map the complexes that we can to Reactome
edges_cpx_gene = edges_cpx_gene.merge(react_cplx_map, how='left', left_on='end_id', right_on='complex_ac')
edges_cpx_gene['end_id'] = edges_cpx_gene['xref'].fillna(edges_cpx_gene['end_id'])
len(edges_cpx_gene)

5496

In [56]:
edges_cpx_gene['type'] = 'part_of'
edges_cpx_gene[['start_id', 'end_id', 'type']].head()

Unnamed: 0,start_id,end_id,type
0,NCBIGene:55193,CPX:CPX-1196,part_of
1,UniProt:Q86U86,CPX:CPX-1196,part_of
2,NCBIGene:196528,CPX:CPX-1196,part_of
3,UniProt:Q68CP9,CPX:CPX-1196,part_of
4,NCBIGene:55274,CPX:CPX-1196,part_of


In [57]:
edges_cpx_gene['source'] = 'ComplexPortal'
edges_cpx_gene['license'] = 'CC-BY 4.0'

edges_cpx_gene['dsrc_type'] = 'curated'

In [58]:
new_edges.append(edges_cpx_gene[['start_id', 'end_id', 'type', 'source', 'dsrc_type']].dropna())

### RNA to GO

In [59]:
go_rna.head(2)

Unnamed: 0,db,db_object_id,db_object_symbol,qualifier,go_id,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,RNAcentral,URS0000001346_9606,URS0000001346_9606,,GO:0006412,GO_REF:0000108,IEA,GO:0030533,P,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,GOC,,
1,RNAcentral,URS0000001346_9606,URS0000001346_9606,,GO:0030533,GO_REF:0000115,IEA,Rfam:RF00005,F,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,RNAcentral,,


With Reactome, we had Micro RNA in MIRbase identifiers... These are RNA central IDs, so we have to reconcile that.

However, there are small number of MIRbase ids, so we may stick with the RNACenral identifier

In [60]:
all_nodes.query('label == "Micro RNA"').shape[0]

47

In [61]:
mirbase = pd.read_csv(load_dir.joinpath('mirbase.tsv'), sep='\t', header=None, dtype=str,
            names=['int_id', 'db_name', 'ext_id', 'tax_id', 'mir'], usecols=list(range(5)))
len(mirbase)

87467

In [62]:
mirbase.head(2)

Unnamed: 0,int_id,db_name,ext_id,tax_id,mir
0,URS00000036DB,MIRBASE,MIMAT0048904,2711,miRNA
1,URS00000036DB,MIRBASE,MIMAT0048896,2711,miRNA


In [63]:
mirbase['full_id'] = mirbase['int_id'] + '_' + mirbase['tax_id']
mirbase['ext_id'] = 'MI:' + mirbase['ext_id']

mirbase.query('tax_id == "9606"').head(2)

Unnamed: 0,int_id,db_name,ext_id,tax_id,mir,full_id
50027,URS00000011DF,MIRBASE,MI:MIMAT0022711,9606,miRNA,URS00000011DF_9606
50028,URS0000001A7A,MIRBASE,MI:MI0003639,9606,precursor_RNA,URS0000001A7A_9606


In [64]:
mirb_ids = mirbase['ext_id'].unique()

# Query for reactome complexes that are corss-referencable
qr = all_nodes.query('id in @mirb_ids')
# need to be able to map back to the correct index later
qr_idx_map = qr.reset_index().set_index('id')['index'].to_dict()
qr_idx = qr.index

In [65]:
# Merge so none are lost.
mrg_res = qr.merge(mirbase, how='left', left_on='id', right_on='ext_id')
# Now collapse the one to many relationships on reactome id
comb_res = combine_group_cols_on_char(mrg_res, ['full_id'], ['id'], sort=True, prog=False)

In [66]:
comb_res['idx'] = comb_res['id'].map(qr_idx_map)

In [67]:
comb_res['full_xref'] = comb_res['id']+'|'+comb_res['xrefs']
comb_res['full_id'] = 'RNAC:'+comb_res['full_id']

In [68]:
comb_res[['full_id', 'id']].nunique() # 1-to-1 mapping

full_id    47
id         47
dtype: int64

1-1 mapping means we can use a dict for quick and easy conversion of all the edges

In [69]:
mapped_rnas = comb_res['full_id'].unique()

In [70]:
all_nodes.loc[comb_res['idx'].tolist(), 'xrefs'] = comb_res['full_xref'].tolist()
all_nodes.loc[comb_res['idx'].tolist(), 'id'] = comb_res['full_id'].tolist()

In [71]:
# Map existing Edges
mirna_id_map = comb_res.set_index('id')['full_id'].to_dict()

edges['start_id'] = edges['start_id'].map(mirna_id_map).fillna(edges['start_id'])
edges['end_id'] = edges['end_id'].map(mirna_id_map).fillna(edges['end_id'])

In [72]:
type_conversion = {k: v.replace('X', 'N') for k, v in type_conversion.items()}
edges_go_rna = go_rna.rename(columns={'db_object_id': 'start_id', 'go_id': 'end_id'})
edges_go_rna['type'] = edges_go_rna['aspect'].map(type_conversion)
edges_go_rna = edges_go_rna[fix_col_order_edge(edges_go_rna)]
edges_go_rna.head()

Unnamed: 0,start_id,end_id,type,db,db_object_symbol,qualifier,db_reference,evidence_code,with_or_from,aspect,db_object_name,db_object_synonym,db_object_type,taxon,date,assigned_by,annotation_extension,gene_product_form_id
0,URS0000001346_9606,GO:0006412,involved_in,RNAcentral,URS0000001346_9606,,GO_REF:0000108,IEA,GO:0030533,P,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,GOC,,
1,URS0000001346_9606,GO:0030533,enables,RNAcentral,URS0000001346_9606,,GO_REF:0000115,IEA,Rfam:RF00005,F,Homo sapiens (human) tRNA-Lys,,tRNA,taxon:9606,20190504,RNAcentral,,
2,URS000000192A_9606,GO:0035068,part_of,RNAcentral,URS000000192A_9606,,GO_REF:0000115,IEA,Rfam:RF00951,C,Homo sapiens (human) MIR1302-2 host gene (MIR1...,,lnc_RNA,taxon:9606,20190504,RNAcentral,,
3,URS000000192A_9606,GO:0035195,involved_in,RNAcentral,URS000000192A_9606,,GO_REF:0000115,IEA,Rfam:RF00951,P,Homo sapiens (human) MIR1302-2 host gene (MIR1...,,lnc_RNA,taxon:9606,20190504,RNAcentral,,
4,URS00000019BC_9606,GO:0000244,involved_in,RNAcentral,URS00000019BC_9606,,GO_REF:0000115,IEA,Rfam:RF00026,P,Homo sapiens (human) snRNA-U6-related,,snRNA,taxon:9606,20190504,RNAcentral,,


In [73]:
edges_go_rna = edges_go_rna[~edges_go_rna['qualifier'].str.contains('NOT').fillna(False)]

In [74]:
edges_go_rna['start_id'] = 'RNAC:' + edges_go_rna['start_id']

In [75]:
edges_go_rna['source'] = 'Gene Ontology'
edges_go_rna['license'] = 'CC-BY 4.0'

edges_go_rna['dsrc_type'] = edges_go_rna['evidence_code'].apply(determine_evidence)

comp_idx = edges_go_rna.query('dsrc_type == "computed"').index
edges_go_rna.loc[comp_idx, 'comp_type'] = edges_go_rna.loc[comp_idx, 'evidence_code']

In [76]:
new_edges.append(edges_go_rna)

#### Need some more info to add RNA as node type, specifically names

Some other DBs with interesting MicroRNA related edges use the name as an identifier rather than an RNA Central ID

In [77]:
rna_colnames = ['db', 'db_object_id', 'db_object_symbol', 'db_object_name', 'db_object_synonyms', 'db_object_type',
 'taxon', 'parent_object_id', 'db_xrefs', 'properties']

rna_c = pd.read_csv(load_dir.joinpath('rnacentral.gpi.gz'), header=None, names=rna_colnames, 
                    dtype=str, comment='!', sep='\t')
rna_c.head(10)

Unnamed: 0,db,db_object_id,db_object_symbol,db_object_name,db_object_synonyms,db_object_type,taxon,parent_object_id,db_xrefs,properties
0,RNAcentral,URS00006753F8_4081,,Solanum lycopersicum (tomato) tRNA-Tyr for ant...,,tRNA,taxon:4081,,,
1,RNAcentral,URS0000675402_7159,,Aedes aegypti tRNA,,tRNA,taxon:7159,,,
2,RNAcentral,URS0000675413_6945,,Ixodes scapularis tRNA,,tRNA,taxon:6945,,,
3,RNAcentral,URS0000675414_59463,,Myotis lucifugus (little brown bat) snRNA U6 s...,,snRNA,taxon:59463,,,
4,RNAcentral,URS000067541A_60711,,Chlorocebus sabaeus Small nucleolar RNA U13,,snoRNA,taxon:60711,,,
5,RNAcentral,URS000067541A_9544,,Macaca mulatta Small nucleolar RNA U13,,snoRNA,taxon:9544,,,
6,RNAcentral,URS000067541C_4558,,Sorghum bicolor Plant small nucleolar RNA R71,,snoRNA,taxon:4558,,,
7,RNAcentral,URS000067541F_1696176,,Pelagibacteraceae bacterium GOM-A4 bablM sRNA,,ncRNA,taxon:1696176,,,
8,RNAcentral,URS0000675420_15368,,Brachypodium distachyon microRNA MIR1122,,primary_transcript,taxon:15368,,,
9,RNAcentral,URS0000675421_9483,,Callithrix jacchus (white-tufted-ear marmoset)...,,snRNA,taxon:9483,,,


In [78]:
rna_c['db_object_id'] = 'RNAC:' + rna_c['db_object_id']

Human microRNA names start with `hsa-`.  However, this DB seems to have a lot of other text in the name than the smile `hsa-` value

In [79]:
hsa_lines = rna_c['db_object_name'].str.contains('hsa-')
rna_c['rna_name'] = rna_c[hsa_lines]['db_object_name'].apply(lambda s: s[s.index('hsa-'):])
rna_c['rna_name'] = rna_c['rna_name'].fillna(rna_c['db_object_name'])

In [80]:
keep_rna_ids = edges_go_rna['start_id'].unique()

In [81]:
rna_nodes = rna_c.query('db_object_id in @keep_rna_ids and db_object_id not in @mapped_rnas')
rna_nodes = rna_nodes.rename(columns={'rna_name': 'name', 'db_object_id': 'id'})
rna_nodes['label'] = 'Micro RNA'
rna_nodes[['id', 'name', 'label']].head()

Unnamed: 0,id,name,label
369,RNAC:URS0000675799_9606,Homo sapiens microRNA mir-1255,Micro RNA
461,RNAC:URS000067588D_9606,Homo sapiens microRNA mir-720,Micro RNA
468,RNAC:URS00006758A2_9606,Homo sapiens U6 spliceosomal RNA,Micro RNA
722,RNAC:URS0000675BE7_9606,Homo sapiens microRNA mir-1302,Micro RNA
783,RNAC:URS0000675C99_9606,Homo sapiens U2 spliceosomal RNA,Micro RNA


In [82]:
new_nodes.append(rna_nodes[['id', 'name', 'label']])

### Now get some interesting RNA to Gene Endges

miRTarBase has some interesting gene micronra edges, however, they don't have any good external identifiers... all of thier entries start with hsa-miR so we will have to use names for merging, leaving only a subset of those from rna_C

In [83]:
rna_names = rna_c[hsa_lines][~rna_c[hsa_lines]['rna_name'].str.contains('precursor')]['rna_name'].values

In [84]:
mti = pd.read_excel(load_dir.joinpath('hsa_MTI.xlsx'))
mti.head(2)

Unnamed: 0,miRTarBase ID,miRNA,Species (miRNA),Target Gene,Target Gene (Entrez Gene ID),Species (Target Gene),Experiments,Support Type,References (PMID)
0,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,Luciferase reporter assay//Western blot//North...,Functional MTI,18632605
1,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,HITS-CLIP,Functional MTI (Weak),22473208


In [85]:
mti.columns = regularize_colnames(mti.columns)
mti.head(2)

Unnamed: 0,mirtarbase_id,mi_rna,species_mirna,target_gene,target_gene_entrez_gene_id,species_target_gene,experiments,support_type,references_pmid
0,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,Luciferase reporter assay//Western blot//North...,Functional MTI,18632605
1,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,3091,Homo sapiens,HITS-CLIP,Functional MTI (Weak),22473208


In [86]:
mti['target_gene_entrez_gene_id'] = 'NCBIGene:'+ mti['target_gene_entrez_gene_id'].astype(str)

In [87]:
gene_ids = all_nodes.query('label == "Gene"')['id']
print('MicroRNA Gene Annotations mappable to RNACentral')
print('  Total:    {:9,}'.format(len(mti)))
print('  Mapped:   {:9,}'.format(len(mti.query('mi_rna in @rna_names'))))
print('  Unmapped: {:9,}'.format(
    len(mti) - len(mti.query('mi_rna in @rna_names and target_gene_entrez_gene_id in @gene_ids'))))


MicroRNA Gene Annotations mappable to RNACentral
  Total:      502,652
  Mapped:     493,614
  Unmapped:     9,887


In [88]:
rna_map = rna_nodes.set_index('name')['id'].to_dict()

In [89]:
edges_rna_gene = mti.query('mi_rna in @rna_names and target_gene_entrez_gene_id in @gene_ids').reset_index(drop=True)
edges_rna_gene = edges_rna_gene.rename(columns={'target_gene_entrez_gene_id': 'end_id'}) 
edges_rna_gene['start_id'] = edges_rna_gene['mi_rna'].map(rna_map)
edges_rna_gene['end_id'] = edges_rna_gene['end_id'].astype(str)
edges_rna_gene['type'] = 'regulates_NrG'
edges_rna_gene = edges_rna_gene[fix_col_order_edge(edges_rna_gene)]
edges_rna_gene['references_pmid'] = edges_rna_gene['references_pmid'].astype(str)
edges_rna_gene.head(2)

Unnamed: 0,start_id,end_id,type,mirtarbase_id,mi_rna,species_mirna,target_gene,species_target_gene,experiments,support_type,references_pmid
0,RNAC:URS0000574A2C_9606,NCBIGene:3091,regulates_NrG,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,Homo sapiens,Luciferase reporter assay//Western blot//North...,Functional MTI,18632605
1,RNAC:URS0000574A2C_9606,NCBIGene:3091,regulates_NrG,MIRT000002,hsa-miR-20a-5p,Homo sapiens,HIF1A,Homo sapiens,HITS-CLIP,Functional MTI (Weak),22473208


miRTarBase uses a custom license, but looks closest

In [90]:
edges_rna_gene['source'] = 'miRTarBase'
edges_rna_gene['license'] = 'custom open attribute'

edges_rna_gene['dsrc_type'] = 'curated'

In [91]:
new_edges.append(edges_rna_gene[['start_id', 'end_id', 'type', 'source', 'dsrc_type',
                                 'experiments', 'support_type', 'references_pmid']])

# Putting it together

In [92]:
new_nodes_df = pd.concat(new_nodes, sort=False, ignore_index=True)

In [93]:
new_nodes_df.head(2)

Unnamed: 0,id,name,label
0,CPX:CPX-1196,Polybromo-associated SWI/SNF ATP-dependent chr...,Complex
1,CPX:CPX-1201,Neural progenitor-specific SWI/SNF ATP-depende...,Complex


In [94]:
new_edges_df = pd.concat(new_edges, sort=False, ignore_index=True)

In [95]:
keep_cols = ['start_id', 'end_id', 'type', 'dsrc_type', 'comp_type', 'source', 'license', 'experiments', 'support_type', 'references_pmid']

In [96]:
new_edges_df[keep_cols].head(2)

Unnamed: 0,start_id,end_id,type,dsrc_type,comp_type,source,license,experiments,support_type,references_pmid
0,NCBIGene:552891,GO:0003924,enables,computed,IEA,Gene Ontology,CC-BY 4.0,,,
1,NCBIGene:28784,GO:0002250,involved_in,computed,IEA,Gene Ontology,CC-BY 4.0,,,


In [97]:
print('Total number of new edges: {:,}'.format(len(new_edges_df)))
print('Number of unique new edges: {:,}'.format(len(new_edges_df.drop_duplicates(subset=['start_id', 'end_id', 'type']))))

Total number of new edges: 1,014,154
Number of unique new edges: 429,450


In [98]:
%%time
# Need to combine those that have multiple of whatever... 
new_edges_df = combine_group_cols_on_char(new_edges_df, group_on=['start_id', 'end_id', 'type'], 
                                          combine_cols=['source', 'dsrc_type', 'comp_type', 'license', 'references_pmid'],
                                          sort=True, prog=False)

CPU times: user 48.4 s, sys: 400 ms, total: 48.8 s
Wall time: 48.8 s


In [99]:
new_edges_df = new_edges_df[keep_cols]
new_edges_df = new_edges_df.rename(columns={'references_pmid': 'pmids'})

In [100]:
new_edges_df.head(2)

Unnamed: 0,start_id,end_id,type,dsrc_type,comp_type,source,license,experiments,support_type,pmids
0,NCBIGene:552891,GO:0003924,enables,computed,IEA,Gene Ontology,CC-BY 4.0,,,
1,NCBIGene:28784,GO:0002250,involved_in,computed,IEA,Gene Ontology,CC-BY 4.0,,,


In [101]:
all_nodes_out = pd.concat([all_nodes, new_nodes_df], sort=False)
all_nodes_out.head(2)

Unnamed: 0,id,name,label,xrefs
0,UBERON:0000002,cervix,Anatomy,MESH:D002584|UBERON:0000002
1,UBERON:0000004,human nose,Anatomy,MESH:D009666|UBERON:0000004


In [102]:
len(all_nodes_out)

794626

In [103]:
all_edges_out = pd.concat([edges, new_edges_df], sort=False)
all_edges_out.head(2)

Unnamed: 0,start_id,end_id,type,dsrc_type,comp_type,p_val,adj_p,source,license,experiments,support_type,pmids
0,CHEMBL:CHEMBL1743034,NCBIGene:3605,Neutralizing antibody,computed,merge,,,WikiData,CC0 1.0,,,
1,CHEBI:10055,NCBIGene:153,agonist,computed,merge,,,WikiData,CC0 1.0,,,


In [104]:
node_ids = all_nodes_out['id'].unique()

In [105]:
len(all_nodes_out), all_nodes_out['id'].nunique()

(794626, 794626)

In [106]:
# Double check that all of the edges have a startID and End ID contained in the Nodes...
print('Edges all IDs: {:,}'.format(len(all_edges_out)))
all_edges_out_filt = all_edges_out.query('start_id in @node_ids and end_id in @node_ids')
print('Edges in node IDs: {:,}'.format(len(all_edges_out_filt)))

Edges all IDs: 3,044,748
Edges in node IDs: 3,040,799


In [107]:
# Lests see how many of what kinds of edges had to be dropped...
all_edges_out.query('start_id not in @node_ids or end_id not in @node_ids')['type'].value_counts()

regulates_NrG    1267
involved_in       994
enables           914
part_of           774
Name: type, dtype: int64

These are all RNAs that could not be mapped properly to a name... they also have very minimal information when accessed on RNACentral, and would increase the number of miRNA nodes by a factor of 5.

In [108]:
print('Edges: {:,}'.format(len(all_edges_out_filt)))
print('Unique Edges: {:,}'.format(len(all_edges_out_filt.drop_duplicates(subset=['start_id', 'end_id', 'type']))))

all_edges_out_filt = combine_group_cols_on_char(all_edges_out_filt, ['start_id', 'end_id', 'type'], sort=True, prog=True)

Edges: 3,040,799
Unique Edges: 2,770,560


HBox(children=(FloatProgress(value=0.0, description='total_progress', max=4.0, style=ProgressStyle(description…

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, description='dsrc_type', max=270239.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='comp_type', max=270239.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='source', max=270239.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='license', max=270239.0, style=ProgressStyle(description_w…





In [109]:
all_edge_ids = all_edges_out_filt[['start_id', 'end_id']].stack().unique()

In [110]:
filt_nodes_out = all_nodes_out.query('id in @all_edge_ids')
print(len(all_nodes_out))
print(len(filt_nodes_out))

794626
357456


#### Examine the new edges

In [111]:
combo = gt.combine_nodes_and_edges(filt_nodes_out, all_edges_out_filt)

In [112]:
go_labels = ['Molecular Function', 'Biological Process', 'Cellular Component']
combo.query('start_label in @go_labels or end_label in @go_labels')['source'].value_counts()

WikiData                           541659
Gene Ontology|WikiData             239429
Reactome                           105961
Gene Ontology                       47794
Gene Ontology|Reactome|WikiData     29459
InterPro                            25416
Reactome|WikiData                    3197
Gene Ontology|Reactome                 36
Name: source, dtype: int64

In [113]:
combo.query('start_label in ["Gene", "Protein"] and end_label in @go_labels')['source'].value_counts()

WikiData                           427148
Gene Ontology|WikiData             239429
Reactome                            54087
Gene Ontology|Reactome|WikiData     29459
Gene Ontology                        5486
Reactome|WikiData                    2249
Gene Ontology|Reactome                 36
Name: source, dtype: int64

In [114]:
filt_nodes_out['label'].value_counts()

Gene                  156034
Protein                82605
Micro RNA              17722
Reaction               16770
Protein Family         13842
Biological Process     13827
Complex                13366
Taxon                  12227
Disease                11760
Molecular Function      5471
Compound                5170
Pathway                 4841
Cellular Component      2051
Phenotype               1255
Anatomy                  515
Name: label, dtype: int64

In [115]:
all_edges_out_filt['type'].value_counts()

part_of                                  1077013
involved_in                               327315
enables                                   243061
has_input                                 218486
in_taxon                                  218079
has_output                                215947
associated_with                           160317
regulates_NrG                              94641
causes                                     72826
treats                                     41374
site_of                                    27216
has_part                                   26017
regulates                                  17108
follows_in_sequence                        10363
presents                                    8566
fucntion_altered_in                         4716
positively_regulates                        1665
agonist                                     1326
antagonist                                  1085
enzyme inhibitor                             992
negatively_regulates

In [116]:
all_edges_out_filt.columns

Index(['start_id', 'end_id', 'type', 'dsrc_type', 'comp_type', 'p_val',
       'adj_p', 'source', 'license', 'experiments', 'support_type', 'pmids'],
      dtype='object')

In [117]:
all_edges_out_filt.groupby('source')['dsrc_type'].value_counts()

source                           dsrc_type            
ComplexPortal                    curated                     2779
ComplexPortal|Reactome           curated                     1315
Gene Ontology                    computed                   40872
                                 curated                     6820
                                 computed|curated             102
Gene Ontology|Reactome           curated                       36
Gene Ontology|Reactome|WikiData  computed|curated           29459
Gene Ontology|WikiData           computed|curated          147226
                                 computed                   92203
InterPro                         curated                    25416
Reactome                         curated                  1057148
                                 computed                  252105
Reactome|WikiData                crowd_sourced|curated       5637
                                 computed|curated            1770
WikiData             

# Save

In [118]:
this_name = '05a_GO_Annotations'

In [119]:
out_dir = Path('../2_pipeline/').joinpath(this_name).joinpath('out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

In [120]:
gt.add_colons(new_nodes_df, id_name='identifier').to_csv(out_dir.joinpath('new_nodes.csv'), index=False)
gt.add_colons(new_edges_df).to_csv(out_dir.joinpath('new_edges.csv'), index=False)

In [121]:
all_nodes_out.to_csv(out_dir.joinpath('nodes.csv'), index=False)
filt_nodes_out.to_csv(out_dir.joinpath('nodes_filt.csv'), index=False)

all_edges_out_filt.to_csv(out_dir.joinpath('edges.csv'), index=False)