In [2]:
import obonet
import pandas as pd
import numpy as np 

In [2]:
# Parse the GO "basic" OBO file into a graph object; later cells read its
# nodes and their attributes.
go_obo_path = '/mnt/d/ML/Kaggle/CAFA6/cafa-6-protein-function-prediction/Train/go-basic.obo'
go_graph = obonet.read_obo(go_obo_path)

In [10]:
# Peek at the attribute dict stored on a single GO term to see which fields
# (name, namespace, def, synonym, is_a, ...) are available.
example_term_id = 'GO:0000001'
go_graph.nodes[example_term_id]

{'name': 'mitochondrion inheritance',
 'namespace': 'biological_process',
 'def': '"The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton." [GOC:mcc, PMID:10873824, PMID:11389764]',
 'synonym': ['"mitochondrial inheritance" EXACT []'],
 'is_a': ['GO:0048308', 'GO:0048311']}

In [7]:
# Build a lookup from GO term ID to its one-letter aspect code.

# One-letter code for each ontology namespace
namespace_to_aspect = {
    'biological_process': 'P',
    'molecular_function': 'F',
    'cellular_component': 'C',
}

# Visit every node once; a node whose namespace is missing or is not one of
# the three keys above is left out of the mapping.
go_term_to_aspect = {
    node_id: namespace_to_aspect[attrs['namespace']]
    for node_id, attrs in go_graph.nodes(data=True)
    if attrs.get('namespace', '') in namespace_to_aspect
}

# Summary: mapping size, number of terms per aspect, and a small sample.
print(f"Total GO terms mapped: {len(go_term_to_aspect)}")

print("\nAspect distribution:")
aspect_labels = list(go_term_to_aspect.values())
aspect_counts = pd.Series(aspect_labels).value_counts()
print(aspect_counts)

print("\nExample mappings:")
sample_pairs = list(go_term_to_aspect.items())[:5]
for term_id, code in sample_pairs:
    print(f"{term_id}: {code}")

Total GO terms mapped: 40122

Aspect distribution:
P    25950
F    10131
C     4041
Name: count, dtype: int64

Example mappings:
GO:0000001: P
GO:0000002: P
GO:0000006: F
GO:0000007: F
GO:0000009: F


In [8]:
# Persist the term -> aspect dict as a .npy file.
# np.save stores a dict as a pickled 0-d object array, so reading it back needs
#   np.load(path, allow_pickle=True).item()
aspect_map_path = '/mnt/d/ML/Kaggle/CAFA6-new/data_packet1/go_term_to_aspect.npy'
np.save(aspect_map_path, go_term_to_aspect, allow_pickle=True)

In [4]:
# Load the training annotations (tab-separated; columns EntryID, term, aspect)
# and preview the first rows.
train_terms_path = '/mnt/d/ML/Kaggle/CAFA6/cafa-6-protein-function-prediction/Train/train_terms.tsv'
train_terms_df = pd.read_csv(train_terms_path, sep='\t')
train_terms_df.head()

Unnamed: 0,EntryID,term,aspect
0,Q5W0B1,GO:0000785,C
1,Q5W0B1,GO:0004842,F
2,Q5W0B1,GO:0051865,P
3,Q5W0B1,GO:0006275,P
4,Q5W0B1,GO:0006513,P


In [12]:
# Reduce the annotations to one row per distinct GO term, then count how many
# distinct terms fall under each aspect. first() keeps, per term, the first
# non-null value of every other column.
first_row_per_term = train_terms_df.groupby('term').first()
train_go_terms = first_row_per_term.reset_index()
train_go_terms['aspect'].value_counts()

aspect
P    16858
F     6616
C     2651
Name: count, dtype: int64

In [20]:
# Load the per-term IA table. The file has no header row, so the two columns
# are named here explicitly.
# NOTE(review): "ia" is presumably the information-accretion weight - confirm.
ia_path = "/mnt/d/ML/Kaggle/CAFA6/cafa-6-protein-function-prediction/IA.tsv"
ia_df = pd.read_csv(
    ia_path,
    sep="\t",
    header=None,
    names=["term", "ia"],
)
ia_df.head()

Unnamed: 0,term,ia
0,GO:0000001,0.0
1,GO:0000002,2.849666
2,GO:0000011,0.137504
3,GO:0000012,6.03863
4,GO:0000017,0.514573


In [21]:
# Number of terms whose IA value is exactly zero.
ia_df['ia'].eq(0).sum()

np.int64(11828)

In [19]:
# Summary statistics (count, mean, std, quartiles, min/max) of the IA values.
ia_values = ia_df['ia']
ia_values.describe()

count    40122.000000
mean         1.716517
std          2.637009
min          0.000000
25%          0.000000
50%          0.150287
75%          2.705504
max         14.861014
Name: ia, dtype: float64