## Env setup

In [1]:
import sys
import subprocess

In [5]:
%pip install pandas biopython tqdm

Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting biopython
  Using cached biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
Using cached biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, biopython, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [pandas]2m3/4[0m [pandas]on]
[1A[2KSuccessfully install

In [4]:
import pandas as pd
from collections import Counter

## Take a quick look at the data

In [11]:
train_terms_path = "/workspace/data/Train/train_terms.tsv"
test_fasta_path = "/workspace/data/Test/testsuperset.fasta"

In [12]:
df_terms = pd.read_csv(train_terms_path, sep="\t")

In [13]:
df_terms

Unnamed: 0,EntryID,term,aspect
0,Q5W0B1,GO:0000785,C
1,Q5W0B1,GO:0004842,F
2,Q5W0B1,GO:0051865,P
3,Q5W0B1,GO:0006275,P
4,Q5W0B1,GO:0006513,P
...,...,...,...
537022,Q06667,GO:0070481,P
537023,B1NF19,GO:0033075,P
537024,B1NF19,GO:0047052,F
537025,B1NF19,GO:0047056,F


In [14]:
df_terms = pd.read_csv(train_terms_path, sep='\t', usecols = ['term', 'aspect'])

In [15]:
df_terms

Unnamed: 0,term,aspect
0,GO:0000785,C
1,GO:0004842,F
2,GO:0051865,P
3,GO:0006275,P
4,GO:0006513,P
...,...,...
537022,GO:0070481,P
537023,GO:0033075,P
537024,GO:0047052,F
537025,GO:0047056,F


## Inference

In [17]:
top_n = 15
top_terms = {}

In [20]:
for aspect in ["P", "C", "F"]:
    subset = df_terms[df_terms['aspect'] == aspect]
    counter = Counter(subset['term'])
    most_common = counter.most_common(top_n)
    total_count = len(subset)
    top_terms[aspect] = [(term, count/total_count) for term, count in most_common]
    
    print(f"--- Top 5 {aspect} terms ---")
    print(top_terms[aspect][:5])

--- Top 5 P terms ---
[('GO:0045944', 0.009246227148581568), ('GO:0000122', 0.00618408723909013), ('GO:0006355', 0.005510256972548394), ('GO:0045893', 0.0053746934869719504), ('GO:0045892', 0.004258288311636531)]
--- Top 5 C terms ---
[('GO:0005634', 0.08419217848767192), ('GO:0005829', 0.08265196171642264), ('GO:0005886', 0.06433415731761424), ('GO:0005737', 0.05984661215693731), ('GO:0005739', 0.03680674399442226)]
--- Top 5 F terms ---
[('GO:0005515', 0.2624560146980973), ('GO:0042802', 0.02761342758384455), ('GO:0042803', 0.012666209946127736), ('GO:0003723', 0.012557219817519385), ('GO:0003677', 0.011428393485504312)]


In [23]:
from Bio import SeqIO

test_ids = []
with open(test_fasta_path) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        test_ids.append(record.id)

print(f"Predict {len(test_ids)} proteins")

with open("submission_naive.tsv", "w") as f:
    for pid in test_ids:
        for aspect in ["P", "C", "F"]:
            for term, freq in top_terms[aspect]:
                f.write(f"{pid}\t{term}\t{freq:.3f}\n")

print("Created submission_naive.tsv!")

Predict 224309 proteins
Created submission_naive.tsv!


In [24]:
result = pd.read_csv('/workspace/notebooks/submission_naive.tsv', sep='\t')

In [27]:
result.head(15)

Unnamed: 0,A0A0C5B5G6,GO:0045944,0.009
0,A0A0C5B5G6,GO:0000122,0.006
1,A0A0C5B5G6,GO:0006355,0.006
2,A0A0C5B5G6,GO:0045893,0.005
3,A0A0C5B5G6,GO:0045892,0.004
4,A0A0C5B5G6,GO:0010628,0.003
5,A0A0C5B5G6,GO:0006357,0.003
6,A0A0C5B5G6,GO:0007165,0.003
7,A0A0C5B5G6,GO:0006974,0.003
8,A0A0C5B5G6,GO:0043066,0.003
9,A0A0C5B5G6,GO:0008284,0.003


## Result