# Go Term Analysis (dev)

In [1]:
import os

os.chdir('../..')
os.getcwd()

'/Users/vschuste/Documents/work_KU/projects/interpreting_omics_models'

## Go terms to gene lists prep

### Get GO term graph

In [2]:
# get go term reference 
if not os.path.exists('01_data/go-basic.obo'):
    os.chdir('01_data')
    !wget https://purl.obolibrary.org/obo/go/go-basic.obo -O go-basic.obo
    !pip install goatools
    os.chdir('..')
from goatools.obo_parser import GODag
obodag = GODag("01_data/go-basic.obo")

01_data/go-basic.obo: fmt(1.2) rel(2024-10-27) 44,017 Terms


In [3]:
obodag

{'GO:0000001': GOTerm('GO:0000001'):
   id:GO:0000001
   item_id:GO:0000001
   name:mitochondrion inheritance
   namespace:biological_process
   _parents: 2 items
     GO:0048308
     GO:0048311
   parents: 2 items
     GO:0048308	level-05	depth-05	organelle inheritance [biological_process]
     GO:0048311	level-04	depth-04	mitochondrion distribution [biological_process]
   children: 0 items
   level:5
   depth:6
   is_obsolete:False
   alt_ids: 0 items,
 'GO:0000002': GOTerm('GO:0000002'):
   id:GO:0000002
   item_id:GO:0000002
   name:mitochondrial genome maintenance
   namespace:biological_process
   _parents: 1 items
     GO:0007005
   parents: 1 items
     GO:0007005	level-05	depth-05	mitochondrion organization [biological_process]
   children: 0 items
   level:6
   depth:6
   is_obsolete:False
   alt_ids: 0 items,
 'GO:0000006': GOTerm('GO:0000006'):
   id:GO:0000006
   item_id:GO:0000006
   name:high-affinity zinc transmembrane transporter activity
   namespace:molecular_functio

In [26]:
# create a dataframe with each go term and its level
import pandas as pd

go_names = []
go_levels = []
for go_id in obodag:
    go_names.append(obodag[go_id].name)
    go_levels.append(obodag[go_id].level)

df_go_levels = pd.DataFrame({'go_id': list(obodag.keys()), 'go_name': go_names, 'go_level': go_levels})
df_go_levels

Unnamed: 0,go_id,go_name,go_level
0,GO:0000001,mitochondrion inheritance,5
1,GO:0000002,mitochondrial genome maintenance,6
2,GO:0000006,high-affinity zinc transmembrane transporter a...,8
3,GO:0000007,low-affinity zinc ion transmembrane transporte...,8
4,GO:0000009,"alpha-1,6-mannosyltransferase activity",6
...,...,...,...
44012,GO:0090645,ubiquitin ligase inhibitor activity,5
44013,GO:0035926,regulation of chemokine (C-X-C motif) ligand 2...,6
44014,GO:1904207,regulation of chemokine (C-X-C motif) ligand 2...,6
44015,GO:1904208,negative regulation of chemokine (C-X-C motif)...,7


In [29]:
# save the dataframe
df_go_levels.to_csv('01_data/go_term_levels.tsv', sep='\t', index=False)

### Get proteins and associated GO terms

In [3]:
# get go term reference 
if not os.path.exists('01_data/goa_human.gaf'):
    os.chdir('01_data')
    !wget https://current.geneontology.org/annotations/goa_human.gaf.gz -O goa_human.gaf.gz
    !gunzip goa_human.gaf.gz
    !rm goa_human.gaf.gz
    os.chdir('..')
from goatools.anno.gaf_reader import GafReader

ogaf = GafReader("01_data/goa_human.gaf")

HMS:0:00:07.819722 782,823 annotations READ: 01_data/goa_human.gaf 


In [4]:
ns2assc = ogaf.get_ns2assc()

In [10]:
ns2assc

{'MF': {'A0A024RBG1': {'GO:0000298',
   'GO:0003723',
   'GO:0005515',
   'GO:0008486',
   'GO:0034431',
   'GO:0034432',
   'GO:0046872'},
  'A0A075B759': {'GO:0003755', 'GO:0016018', 'GO:0140839', 'GO:0140840'},
  'A0A075B767': {'GO:0003755', 'GO:0016018', 'GO:0140839', 'GO:0140840'},
  'A0A087WT01': {'GO:0005515'},
  'A0A087WUV0': {'GO:0000978', 'GO:0000981', 'GO:0046872'},
  'A0A087X1C5': {'GO:0005506', 'GO:0016712', 'GO:0020037', 'GO:0070330'},
  'A0A096LNW5': {'GO:0005509'},
  'A0A096LP01': {'GO:0005515', 'GO:0019901', 'GO:0044325'},
  'A0A096LP49': {'GO:0008017'},
  'A0A096LPK9': {'GO:0004930', 'GO:0004984'},
  'A0A0A6YYK7': {'GO:0042287', 'GO:0042605'},
  'A0A0B4J2A2': {'GO:0003755', 'GO:0016018', 'GO:0140839', 'GO:0140840'},
  'A0A0B4J2F0': {'GO:0005515'},
  'A0A0C5B5G6': {'GO:0003677', 'GO:0140297'},
  'A0A0G2JMH3': {'GO:0003924', 'GO:0005525'},
  'A0A0G2JNH3': {'GO:0004930', 'GO:0004984'},
  'A0A0J9YWL9': {'GO:0003729', 'GO:0046872'},
  'A0A0J9YY54': {'GO:0003729', 'GO:00468

In [5]:
all_proteins = list(ns2assc['BP'].keys()) + list(ns2assc['MF'].keys()) + list(ns2assc['CC'].keys())
all_proteins = list(set(all_proteins))
print(len(all_proteins))

44588


### Get protein to ensembl mapping

In [6]:
# I need protein name to ensemble conversion
import pandas as pd

if not os.path.exists('01_data/protname2ensembl.tsv'):
    os.chdir('01_data')
    if not os.path.exists('idmapping_selected.tab'):
        !wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz
        !gunzip idmapping_selected.tab.gz
    else:
        print('idmapping_selected.tab already downloaded')
    os.chdir('..')
    id_mapping = pd.read_csv('01_data/idmapping_selected.tab', sep='\t', header=None)
    # filter for only protein id and ensembl id and human proteins
    prot2ensembl = id_mapping[[0, 16, 18]]
    prot2ensembl.columns = ['UniProtKB-AC', 'EMBL', 'Ensembl']
    prot2ensembl = prot2ensembl.dropna()
    # save the file
    prot2ensembl.to_csv('01_data/protname2ensembl.tsv', sep='\t', index=False)
    !rm idmapping_selected.tab.gz
    !rm idmapping_selected.tab
else:
    prot2ensembl = pd.read_csv('01_data/protname2ensembl.tsv', sep='\t')

In [13]:
prot2ensembl

Unnamed: 0,UniProtKB-AC,EMBL,Ensembl
0,Q60888,BC141260; AY318157; AY073853; U28777,ENSMUSG00000062121.4
1,Q8VBW9,AF281257; AY073151; AY317466,ENSMUSG00000054666.8
2,Q60885,AY073230; AY317937; U28774,ENSMUSG00000069430.4
3,Q5ZLQ6,AJ719678,ENSGALG00000004143; ENSGALG00010022076.1; ENSG...
4,P31946,X57346; AK292717; AL008725; CH471077; CH471077...,ENSG00000166913.13
...,...,...,...
879405,A7KII2,EF427379,ENSSSAG00000040701
879406,A0A1B8XVI6,KV461331,ENSXETG00000027086
879407,B7ZTZ7,BC171051,ENSXETG00000038419
879408,B7ZTQ4,BC170957,ENSXETG00000019929


In [19]:
human_idx = [i for i, x in enumerate(prot2ensembl['Ensembl']) if 'ENSG0' in x]
prot2ensembl.iloc[human_idx]

Unnamed: 0,UniProtKB-AC,EMBL,Ensembl
4,P31946,X57346; AK292717; AL008725; CH471077; CH471077...,ENSG00000166913.13
12,P62258,U20972; U54778; U43399; U43430; U28936; AB0171...,ENSG00000108953.17; ENSG00000274474.3
16,Q04917,L20422; X80536; X78138; X57345; D78577; S80794...,ENSG00000128245.15
19,P61981,AF142498; AB024334; CR541904; CR541925; AC0063...,ENSG00000170027.7
23,P31947,M93010; X57348; AF029081; AF029082; CR541905; ...,ENSG00000175793.12
...,...,...,...
112158,H7C3R6,AC114489; AL033529; AL138800; KF495898; KF495904,ENSG00000176261
112159,A0A0A6YYI9,AP001157,ENSG00000248643
112160,C9IZ13,AC110491,ENSG00000113966.10
112161,A0A2R8YD15,AC022506,ENSG00000285133


### Load sc data to get relevant genes

In [7]:
# test out what names I need to convert to to match my data

# load data
import anndata as ad

adata = ad.read_h5ad('01_data/human_bonemarrow.h5ad')

In [16]:
adata.var

Unnamed: 0,feature_types,gene_id,modality
AL627309.5,GEX,ENSG00000241860,GEX
LINC01409,GEX,ENSG00000237491,GEX
LINC01128,GEX,ENSG00000228794,GEX
NOC2L,GEX,ENSG00000188976,GEX
KLHL17,GEX,ENSG00000187961,GEX
...,...,...,...
GL000219.1-90062-90937,ATAC,,ATAC
GL000219.1-99257-100160,ATAC,,ATAC
KI270726.1-27152-28034,ATAC,,ATAC
KI270713.1-21434-22336,ATAC,,ATAC


In [12]:
sc_gene_ids = adata.var[adata.var['modality']=='GEX']['gene_id'].values
sc_gene_ids

['ENSG00000241860', 'ENSG00000237491', 'ENSG00000228794', 'ENSG00000188976', 'ENSG00000187961', ..., 'ENSG00000198786', 'ENSG00000198695', 'ENSG00000198727', 'ENSG00000273748', 'ENSG00000271254']
Length: 13431
Categories (13431, object): ['ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938', ..., 'ENSG00000288093', 'ENSG00000288107', 'ENSG00000288156', 'ENSG00000288380']

In [21]:
def find_prot_entry(gene_id):
    idx = [i for i, x in enumerate(prot2ensembl['Ensembl']) if gene_id in x]
    if len(idx) > 0:
        return idx[0]
    else:
        return None

found_entries = []
for gene_id in sc_gene_ids:
    found_entries.append(find_prot_entry(gene_id))
prot_names = [prot2ensembl.iloc[i]['UniProtKB-AC'] if i is not None else None for i in found_entries]
prot_names

[None,
 None,
 None,
 'Q9Y3T9',
 'Q6TDP4',
 'P05161',
 'Q96HA4',
 'G3V1E2',
 'Q96L58',
 'Q8N2K1',
 'Q96P50',
 'Q5TA45',
 'O14640',
 'Q9NWT8',
 'Q96S94',
 'Q9BYC9',
 'Q5T9A4',
 'Q9NVI7',
 'Q9NP77',
 'Q96AX9',
 'O75900',
 'P21127',
 'P0CK96',
 'Q9UQ88',
 None,
 'O95544',
 'P62873',
 'Q05513',
 'Q6NZ36',
 'P12755',
 'Q5T089',
 'O15258',
 'O60683',
 'O75038',
 'Q9NVE7',
 None,
 None,
 'Q92956',
 'Q8TBF2',
 'A8MYJ7',
 'O75095',
 'Q5T0D9',
 'Q9P2S5',
 'B2RUZ4',
 'Q8N1G4',
 'O60308',
 'O76075',
 'Q8IYL3',
 'O75161',
 'Q13303',
 'P35268',
 'O60725',
 'Q6NV75',
 'O00154',
 'B1AK53',
 'Q93038',
 'Q5SY16',
 'P10074',
 'Q9UJP4',
 'Q86YI8',
 'Q8WTV1',
 'Q9NVH1',
 'Q9Y6Y1',
 'Q15836',
 'P56645',
 'Q07011',
 'Q99497',
 'Q9UJM3',
 'Q9P2R6',
 None,
 'P06733',
 'P22732',
 'Q5UAW9',
 'O95479',
 'Q96BD6',
 'Q9BSK2',
 'Q5SNT2',
 'O00329',
 None,
 'O94985',
 'Q9NSA3',
 'Q8WZA0',
 'Q9HAN9',
 'Q96R05',
 'O95155',
 'O60333',
 'P52209',
 'O00273',
 'O75381',
 'Q86V15',
 'Q13148',
 None,
 'P19623',
 'Q01780',
 '

In [22]:
# how many are missing
sum([x is None for x in prot_names])

1757

## link go terms to adata-formatted vector ("gene list")

In [30]:
import numpy as np
go_gene_matrix  = np.zeros((len(obodag.keys()), len(prot_names)))

# go through all go terms
count = 0
for protein in prot_names:
    if protein is not None:
        if protein in ns2assc['BP']:
            if len(ns2assc['BP'][protein]) > 0:
                for go_term in ns2assc['BP'][protein]:
                    if go_term in obodag.keys():
                        go_gene_matrix[list(obodag.keys()).index(go_term), prot_names.index(protein)] = 1
        if protein in ns2assc['MF']:
            if len(ns2assc['MF'][protein]) > 0:
                for go_term in ns2assc['MF'][protein]:
                    if go_term in obodag.keys():
                        go_gene_matrix[list(obodag.keys()).index(go_term), prot_names.index(protein)] = 1
        if protein in ns2assc['CC']:
            if len(ns2assc['CC'][protein]) > 0:
                for go_term in ns2assc['CC'][protein]:
                    if go_term in obodag.keys():
                        go_gene_matrix[list(obodag.keys()).index(go_term), prot_names.index(protein)] = 1

In [40]:
# actually save it as ann andata object with sparse matrix
import scipy.sparse as sp

adata_go = ad.AnnData(sp.csr_matrix(go_gene_matrix), obs=df_go_levels, var=pd.DataFrame({'Ensembl': sc_gene_ids, 'UniProtKB-AC': prot_names}, index=sc_gene_ids))
adata_go.write('01_data/go_gene_matrix.h5ad')
adata_go

  adata_go = ad.AnnData(sp.csr_matrix(go_gene_matrix), obs=df_go_levels, var=pd.DataFrame({'Ensembl': sc_gene_ids, 'UniProtKB-AC': prot_names}, index=sc_gene_ids))


AnnData object with n_obs × n_vars = 44017 × 13431
    obs: 'go_id', 'go_name', 'go_level'
    var: 'Ensembl', 'UniProtKB-AC'