# GO annotation analyses

1. Find GO annotations in GFF file for each genome
2. Run through the files and come up with a list of unique GO IDs
3. Create a table with presence/absence for all genomes

### 1. Find GO annotations in GFF file for each genome, save each list to an individual file

In [7]:
import os

def find_go_annotations(
    gff_file,
    out_dir="/n/scratch/users/a/aip485/bac_genome_constraint/results/3_GO_analyses/GO_lists/",
):
    """Collect the GO IDs annotated in a GFF file and save them to a list file.

    Every non-comment line is scanned for an ``Ontology_term=`` attribute in
    its last tab-separated column. Each comma-separated ``GO:<id>`` value is
    stored without the ``GO:`` prefix; values that do not carry that prefix
    are ignored.

    The unique IDs are written, sorted and one per line, to
    ``<out_dir>/<genome>.txt``, where ``<genome>`` is the name of the
    directory that contains ``gff_file``.

    Args:
        gff_file: "/"-separated path to the GFF file. It must contain at
            least one directory component, which names the genome.
        out_dir: Existing directory that receives the list file.
    """
    prefix = "GO:"
    key = "Ontology_term="

    go_terms = set()
    with open(gff_file, "r") as rf:
        for line in rf:
            if line.startswith("#"):
                continue
            # Drop the line terminator before splitting; otherwise a term at
            # the very end of the line keeps its "\n" and ends up as a
            # duplicate entry followed by a blank line in the output.
            attributes = line.rstrip("\r\n").split("\t")[-1]
            for field in attributes.split(";"):
                if not field.startswith(key):
                    continue
                for raw_term in field[len(key):].split(","):
                    # Tolerate padding and optional quoting around a value.
                    term = raw_term.strip().strip('"')
                    if term.startswith(prefix) and len(term) > len(prefix):
                        go_terms.add(term[len(prefix):])

    genome = gff_file.split("/")[-2]
    outfile = os.path.join(out_dir, genome + ".txt")
    with open(outfile, "w") as wf:
        # Sorted so that repeated runs produce byte-identical files.
        for term in sorted(go_terms):
            wf.write(term + "\n")

In [8]:
# Smoke test: run the extraction on one assembly before processing all of them.
# NOTE(review): test_id is not used in this cell; the same accession is
# spelled out again inside test_path.
test_id = "GCF_030643905.1"
test_path = "/n/scratch/users/b/byc014/github/bac_genome_constraint/data/ncbi/assemblies/GCF_030643905.1/genomic.gff"

# The output file is named after the directory that contains the GFF file,
# i.e. GCF_030643905.1.txt.
find_go_annotations(test_path)

In [9]:
import glob

# Extract the GO list of every assembly: one genomic.gff per assembly directory.
assembly_gffs = glob.iglob("/n/scratch/users/b/byc014/github/bac_genome_constraint/data/ncbi/assemblies/*/genomic.gff")
for gff_file in assembly_gffs:
    find_go_annotations(gff_file)

### 2. Run through the files and come up with a list of unique GO IDs

In [14]:
import os

# Pool the per-genome GO lists into one set of unique GO IDs (as integers) and
# record the list files that contain no GO ID at all, so that the
# presence/absence table can leave those genomes out.
unique_terms = set()
to_skip = []
for go_list_file in glob.iglob("/n/scratch/users/a/aip485/bac_genome_constraint/results/3_GO_analyses/GO_lists/*.txt"):
    with open(go_list_file, "r") as rf:
        # Skip blank lines: int("") raises ValueError, so a single empty line
        # in any list file would otherwise abort the whole aggregation.
        genome_terms = [int(line.strip()) for line in rf if line.strip()]
    if not genome_terms:
        to_skip.append(os.path.basename(go_list_file))
    unique_terms.update(genome_terms)

with open("/n/scratch/users/a/aip485/bac_genome_constraint/results/3_GO_analyses/unique_terms.txt", "w") as wf:
    # Restore the zero-padded 7-digit form that was lost by the int conversion.
    for term in sorted(unique_terms):
        wf.write(f"{term:07d}\n")

# Despite its name, this file lists the genome list files (basenames) that
# have no GO terms.
with open("/n/scratch/users/a/aip485/bac_genome_constraint/results/3_GO_analyses/missing_terms.txt", "w") as wf:
    # glob yields files in arbitrary order; sort so the file is reproducible.
    for genome in sorted(to_skip):
        wf.write(genome + "\n")

### 3. Create a table with presence/absence for all genomes

In [18]:
# Build a tab-separated presence/absence matrix: one row per genome, one
# column per unique GO ID, "1" if the genome's list contains the ID, else "0".
with open("/n/scratch/users/a/aip485/bac_genome_constraint/results/3_GO_analyses/unique_terms.txt", "r") as rf:
    unique_terms = [line.strip() for line in rf]

with open("/n/scratch/users/a/aip485/bac_genome_constraint/results/3_GO_analyses/missing_terms.txt", "r") as rf:
    to_skip = [line.strip() for line in rf]
# Set copy for constant-time membership tests inside the loop.
skip_names = set(to_skip)

genome_match_string = "/n/scratch/users/a/aip485/bac_genome_constraint/results/3_GO_analyses/GO_lists/*.txt"
# glob returns matches in arbitrary order; sort so the rows come out in a
# reproducible order.
genome_files = sorted(glob.glob(genome_match_string))

with open("/n/scratch/users/a/aip485/bac_genome_constraint/results/3_GO_analyses/presence_absence_table.txt", "w") as wf:

    # Write header
    wf.write("Genome\t" + "\t".join(unique_terms) + "\n")

    for fn in genome_files:

        basename = os.path.basename(fn)
        if basename in skip_names:
            continue

        with open(fn, "r") as rf:
            # A set keeps each of the per-column lookups below O(1) instead of
            # scanning the genome's whole term list for every column.
            genome_terms = {line.strip() for line in rf}
        presence_absence = ["1" if term in genome_terms else "0" for term in unique_terms]
        # basename[:-4] drops the ".txt" extension, leaving the genome name.
        wf.write(basename[:-4] + "\t" + "\t".join(presence_absence) + "\n")
