This script describes how to extract the information we need from the downloaded GO and GOA files. 

Before running the following script, download:
+ GO relationship file "go-basic.obo" from http://purl.obolibrary.org/obo/go/go-basic.obo;
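+ GO annotation file "mgi.gaf" for mouse from https://current.geneontology.org/annotations/mgi.gaf.gz — decompress the download (e.g. `gunzip mgi.gaf.gz`) so that the file is named "mgi.gaf", which is the name the code below reads. For other species, check the download page https://current.geneontology.org/products/pages/downloads.html and select the annotation file you need.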

In [1]:
import os
import pandas as pd
from collections import defaultdict
# Switch to the directory that holds the downloaded files; the relative file names
# used in the cells below are resolved against it.
# NOTE(review): this absolute path is machine-specific — adjust it before running elsewhere.
os.chdir("/nfs/public/xixi/scRegulate/go")

# Process GO

In [2]:
filename = "go-basic.obo"

# Parallel column lists: index k across all six lists describes one edge
# (term id1 --relation--> term id2) together with the source term's metadata.
id1 = []
name1 = []
namespace1 = []
def1 = []
relations = []
id2 = []


def _first_value(term, key):
    """Return the first value recorded for `key` in `term`, or None if the tag is absent."""
    values = term[key]
    return values[0] if values else None


def _append_term_edges(term):
    """Append one row per `is_a` / `relationship` entry of a finished [Term] stanza.

    `term` maps each OBO tag to the list of its values (or is None when no
    stanza has been opened yet, in which case nothing is appended). `is_a`
    rows are emitted first, then `relationship` rows, in file order. Terms
    without either tag produce no rows.
    """
    if term is None:
        return

    # "is_a: GO:0048308 ! comment"  -> target id is the first token.
    edges = [("is_a", entry.split(" ")[0]) for entry in term["is_a"]]
    # "relationship: part_of GO:0000001 ! comment" -> relation type, then target id.
    for entry in term["relationship"]:
        tokens = entry.split(" ")
        edges.append((tokens[0], tokens[1]))

    for relation, target in edges:
        id1.append(_first_value(term, "id"))
        name1.append(_first_value(term, "name"))
        namespace1.append(_first_value(term, "namespace"))
        def1.append(_first_value(term, "def"))
        relations.append(relation)
        id2.append(target)


with open(filename, "r", encoding="utf-8") as infile:
    currentGOTerm = None  # tag -> values of the [Term] stanza being read, if any
    for line in infile:
        line = line.strip()
        if not line:
            continue  # Skip empty
        if line == "[Term]":
            # A new term starts: emit the previous one (if any) and start collecting afresh.
            _append_term_edges(currentGOTerm)
            currentGOTerm = defaultdict(list)
        elif line.startswith("[") and line.endswith("]"):
            # Any other stanza header (e.g. [Typedef]) closes the current term;
            # its tag lines are ignored until the next [Term].
            _append_term_edges(currentGOTerm)
            currentGOTerm = None
        else:
            # Only process tag lines while inside a [Term] stanza
            # (this also skips the file header before the first stanza).
            if currentGOTerm is None:
                continue
            key, sep, val = line.partition(":")
            currentGOTerm[key].append(val.strip())
    # Emit the final term when the file ends without a following stanza header.
    _append_term_edges(currentGOTerm)

goparse = pd.DataFrame({"id1":id1, "name": name1, "namespace": namespace1, "def":def1, "relation":relations, "id2":id2})

In [3]:
# Preview the parsed GO edge table (the notebook renders a truncated view of the frame).
goparse

Unnamed: 0,id1,name,namespace,def,relation,id2
0,GO:0000001,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...",is_a,GO:0048308
1,GO:0000001,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...",is_a,GO:0048311
2,GO:0000002,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",is_a,GO:0007005
3,GO:0000003,reproduction,biological_process,"""The production of new individuals that contai...",is_a,GO:0008150
4,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...",is_a,GO:0005385
...,...,...,...,...,...,...
86692,GO:2001317,kojic acid biosynthetic process,biological_process,"""The chemical reactions and pathways resulting...",is_a,GO:0034309
86693,GO:2001317,kojic acid biosynthetic process,biological_process,"""The chemical reactions and pathways resulting...",is_a,GO:0042181
86694,GO:2001317,kojic acid biosynthetic process,biological_process,"""The chemical reactions and pathways resulting...",is_a,GO:0120255
86695,GO:2001317,kojic acid biosynthetic process,biological_process,"""The chemical reactions and pathways resulting...",is_a,GO:1901362


In [4]:
# Save the GO edge table as a tab-separated file without the row index.
# `index=False` is the documented way to omit the index column.
goparse.to_csv("go-organize.txt", sep="\t", index=False)

# Process GOA

In [5]:
goa_filename = "mgi.gaf"

# The annotation file starts with a block of header lines beginning with "!".
# Its length differs between files and releases, so count it instead of
# hard-coding the number of rows to skip.
n_header_lines = 0
with open(goa_filename, "r", encoding="utf-8") as gaf:
    for header_line in gaf:
        if not header_line.startswith("!"):
            break
        n_header_lines += 1

# The file has no column-name row, so columns are numbered 0..N-1.
# low_memory=False makes pandas infer each column's dtype from the whole column
# rather than chunk by chunk, which avoids mixed-type columns.
df_goa = pd.read_csv(
    goa_filename,
    sep="\t",
    skiprows=n_header_lines,
    header=None,
    low_memory=False,
)
df_goa

  df_goa = pd.read_csv("goa_human.gaf", sep='\t', skiprows=41, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20230911,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20230911,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20230619,HPA,,
3,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20230911,UniProt,,
4,UniProtKB,A0A075B6H7,IGKV3-7,located_in,GO:0005886,GO_REF:0000044,IEA,UniProtKB-SubCell:SL-0039,C,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20230911,UniProt,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635263,UniProtKB,O95497,VNN1,involved_in,GO:0015939,GOREF:0000033,IBA,PANTHER:PTN008502038|UniProtKB:O95498|UniProtK...,P,Pantetheinase,VNN1,protein,taxon:9606,20170228,GO_Central,,
635264,UniProtKB,Q15475,SIX1,involved_in,GO:0048741,GOREF:0000033,IBA,PANTHER:PTN002571900|ZFIN:ZDB-GENE-040426-2308...,P,Homeobox protein SIX1,SIX1,protein,taxon:9606,20180309,GO_Central,,
635265,UniProtKB,P50281,MMP14,is_active_in,GO:0005615,GOREF:0000033,IBA,PANTHER:PTN002565758|RGD:621320|RGD:621317|RGD...,C,Matrix metalloproteinase-14,MMP14,protein,taxon:9606,20230405,GO_Central,,
635266,UniProtKB,Q8NCM2,KCNH5,involved_in,GO:0071805,GOREF:0000033,IBA,PANTHER:PTN000025350|UniProtKB:O95259|RGD:6839...,P,Potassium voltage-gated channel subfamily H me...,KCNH5|EAG2,protein,taxon:9606,20231108,GO_Central,,


In [6]:
# List the distinct values found in column 3 of the annotation table; the output
# shows relation qualifiers, some of them prefixed with "NOT|".
set(df_goa[3])

{'NOT|acts_upstream_of_or_within',
 'NOT|acts_upstream_of_or_within_negative_effect',
 'NOT|colocalizes_with',
 'NOT|contributes_to',
 'NOT|enables',
 'NOT|involved_in',
 'NOT|is_active_in',
 'NOT|located_in',
 'NOT|part_of',
 'acts_upstream_of',
 'acts_upstream_of_negative_effect',
 'acts_upstream_of_or_within',
 'acts_upstream_of_or_within_negative_effect',
 'acts_upstream_of_or_within_positive_effect',
 'acts_upstream_of_positive_effect',
 'colocalizes_with',
 'contributes_to',
 'enables',
 'involved_in',
 'is_active_in',
 'located_in',
 'part_of'}

In [7]:
# Save the annotation table as a comma-separated file without the row index.
# `index=False` is the documented way to omit the index column.
df_goa.to_csv("go_annotation.csv", index=False)