In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import pickle
import obonet
from tqdm.auto import tqdm

In [4]:
# Load the per-protein term table. The preview rendered below shows the columns
# qseqid, terms_predicted and terms_true (presumably one row per query protein).
seq_2_terms = pd.read_parquet("/mnt/d/ML/Kaggle/CAFA6-new/data_packet1/seq_2_terms.parquet")
# Bare expression so the notebook renders a preview of the frame.
seq_2_terms

Unnamed: 0,qseqid,terms_predicted,terms_true
0,A0A023FBW4,"[GO:0019958, GO:0005576, GO:0043230, GO:001995...",[GO:0019958]
1,A0A023FBW7,"[GO:0019957, GO:0005576, GO:0035716, GO:000560...",[GO:0019957]
2,A0A023FDY8,"[GO:0019957, GO:0005576, GO:0035716, GO:000560...",[GO:0019957]
3,A0A023FF81,"[GO:0019958, GO:0005576, GO:0043230, GO:001995...",[GO:0019958]
4,A0A023FFB5,"[GO:0047387, GO:0005674, GO:0008989, GO:000583...",[GO:0019957]
...,...,...,...
82196,X2JI34,"[GO:0009625, GO:0016102, GO:0009717, GO:004708...","[GO:0106223, GO:0051762]"
82197,X4Y2L4,"[GO:0005765, GO:0009826, GO:0016798, GO:003021...","[GO:0033906, GO:0030214]"
82198,X5JA13,"[GO:0031267, GO:0000145, GO:0043066, GO:009754...","[GO:0009506, GO:0060321, GO:0000145, GO:000551..."
82199,X5JB51,"[GO:0031267, GO:0000145, GO:0043066, GO:009754...","[GO:0016020, GO:0009506, GO:0060321, GO:000014..."


In [5]:
# Load the grouped training annotations (one row per entryID, tab-separated).
train_terms_grouped = pd.read_csv("/mnt/d/ML/Kaggle/CAFA6-new/CAFA6_dataset_generator/train_terms_grouped.tsv", sep="\t")
# `terms` arrives as a ';'-joined string; turn it into a set of term ids so
# duplicates collapse and later membership tests are cheap.
train_terms_grouped['terms'] = train_terms_grouped['terms'].str.split(';').apply(set)
train_lens = train_terms_grouped['terms'].apply(len)
print(train_terms_grouped.shape)
print(f"Average number of terms per protein: {train_lens.mean():.2f}")
# The max was previously printed twice; report the min as well, matching the
# Average / Min / Max summary used by the per-aspect analysis further down.
print(f"Min number of terms for a protein: {train_lens.min()}")
print(f"Max number of terms for a protein: {train_lens.max()}")
train_terms_grouped.head()

(82404, 2)
Average number of terms per protein: 41.10
Max number of terms for a protein: 761
Max number of terms for a protein: 761


Unnamed: 0,entryID,terms
0,Q5W0B1,"{GO:0044238, GO:0051052, GO:0070647, GO:000548..."
1,Q3EC77,"{GO:0031985, GO:0005794, GO:0012505, GO:003198..."
2,Q8IZR5,"{GO:0005488, GO:0005515}"
3,Q8R2Z3,"{GO:1902025, GO:0015701, GO:0022600, GO:000659..."
4,P63027,"{GO:0030139, GO:0043307, GO:0030136, GO:004690..."


## Hierarchy propagation

In [6]:

# ==========================================
# GO HIERARCHY UTILITIES
# ==========================================
from collections import defaultdict

def parse_obo_parents(go_obo_path):
    """Parse a GO ontology OBO file and extract direct parent relationships.

    Only ``[Term]`` stanzas are read. Inside a term, every ``is_a`` target and
    every ``relationship: part_of`` target is recorded as a direct parent.
    Other stanza types (e.g. ``[Typedef]``, which also carry ``id:`` and
    ``is_a:`` lines) are skipped so relationship names never end up in the
    result as if they were GO terms.

    Args:
        go_obo_path: Path to the OBO file.

    Returns:
        Tuple ``(term_parents, roots)``: ``term_parents`` is a
        ``defaultdict(set)`` mapping a term id to the set of its direct parent
        ids (terms without parents get no key); ``roots`` is the set of the
        three GO root ids.
    """
    print("[1/3] Parsing OBO Ontology...")
    term_parents = defaultdict(set)
    roots = set(['GO:0003674', 'GO:0008150', 'GO:0005575'])  # MF, BP, CC

    # OBO files are UTF-8; do not depend on the platform default encoding.
    with open(go_obo_path, "r", encoding="utf-8") as f:
        cur_id = None
        in_term = False
        for line in f:
            line = line.strip()
            if line.startswith("[") and line.endswith("]"):
                # Stanza header: forget the previous id and remember whether
                # the stanza that starts here is a term at all.
                in_term = (line == "[Term]")
                cur_id = None
            elif not in_term:
                # File header or a non-term stanza: nothing to record.
                continue
            elif line.startswith("id: "):
                cur_id = line.split("id: ")[1].strip()
            elif line.startswith("is_a: "):
                # "is_a: GO:0000001 ! name" -> the second token is the parent id.
                pid = line.split()[1].strip()
                if cur_id:
                    term_parents[cur_id].add(pid)
            elif line.startswith("relationship: part_of "):
                # "relationship: part_of GO:0000001 ! name" -> third token.
                parts = line.split()
                if len(parts) >= 3:
                    pid = parts[2].strip()
                    if cur_id:
                        term_parents[cur_id].add(pid)
    return term_parents, roots


def get_ancestors_map(term_parents):
    """Compute, for every term, the full set of its ancestors.

    Args:
        term_parents: Mapping of term id -> set of direct parent ids.

    Returns:
        dict mapping each visited term id to the set of all ids reachable by
        following parent links. Parents that never appear as keys of
        ``term_parents`` are visited as well and map to an empty set, so the
        result can hold more entries than ``term_parents``.
    """
    print("[2/3] Building Ancestor Map...")
    closure = {}

    def resolve(go_id):
        # Memoised depth-first walk: each term is expanded only once.
        if go_id not in closure:
            direct = term_parents.get(go_id, set())
            reachable = set(direct)
            for parent_id in direct:
                reachable.update(resolve(parent_id))
            closure[go_id] = reachable
        return closure[go_id]

    # Snapshot the keys up front so the progress bar has a fixed total.
    for go_id in tqdm(list(term_parents.keys()), desc="Building ancestors"):
        resolve(go_id)
    return closure


# Load GO ontology
# term_parents: direct parents per term; roots: the three root ids returned by
# parse_obo_parents. Both names, and ancestors_map, are reused by later cells.
GO_OBO_PATH = '/mnt/d/ML/Kaggle/CAFA6/cafa-6-protein-function-prediction/Train/go-basic.obo'
term_parents, roots = parse_obo_parents(GO_OBO_PATH)
ancestors_map = get_ancestors_map(term_parents)
# This counts the keys of ancestors_map, which also holds parents that never
# appear as keys of term_parents, so it can exceed the progress-bar total.
print(f"Loaded {len(ancestors_map)} GO terms from ontology")


[1/3] Parsing OBO Ontology...
[2/3] Building Ancestor Map...


Building ancestors:   0%|          | 0/40121 [00:00<?, ?it/s]

Loaded 40125 GO terms from ontology


In [7]:

# ==========================================
# APPLY GO HIERARCHY CONSTRAINT
# ==========================================
# Progress banner for step 3 of 3; it prints before the helper below is defined
# and before the actual propagation runs in the apply() call later in this cell.
print("[3/3] Applying GO hierarchy constraint to terms_predicted...")

def propagate_go_hierarchy(terms_set, ancestors_map, roots):
    """Propagate GO hierarchy: add all ancestors of the given terms.

    Args:
        terms_set: Iterable of term ids (set, list, tuple, numpy array, ...)
            or ``None``. It is never modified.
        ancestors_map: Mapping of term id -> set of all its ancestor ids.
            Terms missing from the mapping are kept but contribute no ancestors.
        roots: Iterable of root term ids, added whenever the result is
            non-empty.

    Returns:
        A new set holding the input terms, all their ancestors and, if at
        least one term was given, the roots. Empty input or ``None`` yields
        an empty set.
    """
    # Test for None explicitly instead of relying on truthiness: the truth
    # value of a multi-element numpy array (the usual type of a list-valued
    # parquet cell) is ambiguous and raises ValueError.
    if terms_set is None:
        seed_terms = set()
    elif isinstance(terms_set, set):
        seed_terms = terms_set
    else:
        seed_terms = set(terms_set)

    # Copy so the caller's collection is left untouched.
    final_terms = set(seed_terms)

    # Add all ancestors for each predicted term.
    for term in seed_terms:
        final_terms.update(ancestors_map.get(term, ()))

    # Roots are implied by any annotation, so add them once something is present.
    if final_terms:
        final_terms.update(roots)

    return final_terms


# Replace every protein's predicted terms with their ancestor-closed version.
propagated_terms = seq_2_terms['terms_predicted'].map(
    lambda predicted: propagate_go_hierarchy(predicted, ancestors_map, roots)
)
seq_2_terms['terms_predicted'] = propagated_terms

n_proteins = len(seq_2_terms)
print(f"✓ GO hierarchy constraint applied to {n_proteins} proteins")

# Size of each propagated term set, summarised below.
terms_len_after = seq_2_terms['terms_predicted'].map(len)
mean_terms = terms_len_after.mean()
max_terms = terms_len_after.max()
min_terms = terms_len_after.min()
print("\nAfter hierarchy propagation:")
print(f"Average number of predicted terms per protein: {mean_terms:.2f}")
print(f"Max number of predicted terms for a protein: {max_terms}")
print(f"Min number of predicted terms for a protein: {min_terms}")


[3/3] Applying GO hierarchy constraint to terms_predicted...
✓ GO hierarchy constraint applied to 82201 proteins

After hierarchy propagation:
Average number of predicted terms per protein: 854.73
Max number of predicted terms for a protein: 1735
Min number of predicted terms for a protein: 347


## Check coverage

In [8]:
# Attach each protein's full set of training terms to its predictions. The
# inner join keeps only proteins present in both tables; the duplicate key
# column is dropped and `terms` is renamed to say where it came from.
merged_df = (
    seq_2_terms
    .merge(train_terms_grouped, how='inner', left_on='qseqid', right_on='entryID')
    .drop(columns=['entryID'])
    .rename(columns={'terms': 'train_terms_all'})
)
merged_df.head()

Unnamed: 0,qseqid,terms_predicted,terms_true,train_terms_all
0,A0A023FBW4,"{GO:0009966, GO:0030029, GO:0052005, GO:004689...",[GO:0019958],"{GO:0019958, GO:0019956, GO:0005515, GO:001995..."
1,A0A023FBW7,"{GO:0150005, GO:0009966, GO:0042716, GO:000683...",[GO:0019957],"{GO:0019957, GO:0019956, GO:0005515, GO:001995..."
2,A0A023FDY8,"{GO:0009966, GO:0160110, GO:0042127, GO:004271...",[GO:0019957],"{GO:0019957, GO:0019956, GO:0005515, GO:001995..."
3,A0A023FF81,"{GO:0042692, GO:0102158, GO:0009966, GO:004275...",[GO:0019958],"{GO:0019958, GO:0019956, GO:0005515, GO:001995..."
4,A0A023FFB5,"{GO:0001997, GO:0009966, GO:0000271, GO:004689...",[GO:0019957],"{GO:0019957, GO:0019956, GO:0005515, GO:001995..."


In [9]:
# Score every merged row, then expand the per-row result dicts into columns.
coverage_results = merged_df.apply(check_coverage, axis=1)
coverage_df = pd.DataFrame(coverage_results.tolist())

# Keep the per-row coverage columns side by side with the original data.
analysis_df = pd.concat([merged_df, coverage_df], axis=1)

# Pull out the pieces that several summary lines share.
coverage_pct = coverage_df['coverage_pct']
n_rows = len(coverage_df)
n_full = (coverage_pct == 100).sum()
n_none = (coverage_pct == 0).sum()

print("=== Coverage Analysis ===")
print("\nOverall Statistics:")
print(f"Average coverage: {coverage_pct.mean():.2f}%")
print(f"Median coverage: {coverage_pct.median():.2f}%")
print(f"Min coverage: {coverage_pct.min():.2f}%")
print(f"Max coverage: {coverage_pct.max():.2f}%")
print(f"\nRows with 100% coverage: {n_full} / {n_rows}")
print(f"Rows with 0% coverage: {n_none} / {n_rows}")
print(f"\nAverage missing terms per row: {coverage_df['missing_terms'].mean():.2f}")
print(f"Total train terms checked: {coverage_df['total_train_terms'].sum()}")
print(f"Total covered terms: {coverage_df['covered_terms'].sum()}")
print(f"Total missing terms: {coverage_df['missing_terms'].sum()}")

=== Coverage Analysis ===

Overall Statistics:
Average coverage: 97.20%
Median coverage: 100.00%
Min coverage: 0.00%
Max coverage: 100.00%

Rows with 100% coverage: 72159 / 82201
Rows with 0% coverage: 11 / 82201

Average missing terms per row: 1.07
Total train terms checked: 3384649
Total covered terms: 3296384
Total missing terms: 88265


In [None]:
# Convert each training-term collection to a list, in place on merged_df.
# check_coverage's "nothing to check" guard only recognises None and empty lists.
merged_df['train_terms_all']  = merged_df['train_terms_all'].apply(list)

# SECURITY: allow_pickle=True unpickles arbitrary Python objects and can execute
# code embedded in the file, so only load this from a trusted source.
# .item() unwraps the single stored object; filter_by_aspect calls .get(term) on
# it and compares the result with 'P' / 'C' / 'F', so it is presumably a dict
# mapping GO term id -> aspect letter (TODO confirm).
term_to_aspect = np.load( "/mnt/d/ML/Kaggle/CAFA6-new/data_packet1/go_term_to_aspect.npy", allow_pickle=True).item()

: 

In [None]:
#filter by aspect 
def filter_by_aspect(terms, aspect):
    """Return the subset of ``terms`` that the global ``term_to_aspect`` maps to ``aspect``.

    Args:
        terms: Iterable of term ids.
        aspect: Aspect value to keep (compared with ``==``).

    Returns:
        A new set with the matching terms; terms missing from
        ``term_to_aspect`` are dropped.
    """
    kept = set()
    for go_id in terms:
        if term_to_aspect.get(go_id) == aspect:
            kept.add(go_id)
    return kept


# Repeat the coverage analysis separately for each aspect code found in
# term_to_aspect.
aspects = ['P', 'C', 'F']

for aspect in aspects:

    print("############################################################")
    print(f"Filtering by aspect {aspect}")

    # Work on a copy so merged_df keeps the unfiltered term collections.
    merged_df_copy = merged_df.copy()
    merged_df_copy['terms_predicted'] = merged_df_copy['terms_predicted'].apply(lambda x: filter_by_aspect(x, aspect))
    merged_df_copy['train_terms_all'] = merged_df_copy['train_terms_all'].apply(lambda x: filter_by_aspect(x, aspect))

    # Both length series are taken BEFORE rows are dropped, so the statistics
    # printed below describe every protein, including those without any
    # training term in this aspect.
    true_len = merged_df_copy['train_terms_all'].apply(len)
    pred_len = merged_df_copy['terms_predicted'].apply(len)

    # Coverage is undefined without training terms, so drop those rows before scoring.
    zero_len_mask = (true_len == 0)
    merged_df_copy = merged_df_copy[~zero_len_mask]

    # Training-term statistics.
    print(f"Average number of terms per protein: {true_len.mean():.2f}")
    print(f"Min number of terms for a protein: {true_len.min()}")
    print(f"Max number of terms for a protein: {true_len.max()}") 
    print(f"No. of proteins with zero terms: {(true_len==0).sum()} / {len(true_len)} \n")

    # Predicted-term statistics.
    print(f"Average number of terms per protein: {pred_len.mean():.2f}")
    print(f"Min number of terms for a protein: {pred_len.min()}")
    print(f"Max number of terms for a protein: {pred_len.max()}") 

    # Apply the check
    coverage_results = merged_df_copy.apply(check_coverage, axis=1)
    # Reuse the filtered frame's index. After the rows above were dropped that
    # index has gaps, and a fresh 0..n-1 index here would make the axis=1
    # concat below pair coverage values with the wrong proteins and add
    # NaN-filled rows.
    coverage_df = pd.DataFrame(coverage_results.tolist(), index=merged_df_copy.index)

    # Combine with original data
    analysis_df = pd.concat([merged_df_copy, coverage_df], axis=1)

    print("=== Coverage Analysis ===")
    print("\nOverall Statistics:")
    print(f"Average coverage: {coverage_df['coverage_pct'].mean():.2f}%")
    print(f"Median coverage: {coverage_df['coverage_pct'].median():.2f}%")
    print(f"Min coverage: {coverage_df['coverage_pct'].min():.2f}%")
    print(f"Max coverage: {coverage_df['coverage_pct'].max():.2f}%")
    print(f"\nRows with 100% coverage: {(coverage_df['coverage_pct'] == 100).sum()} / {len(coverage_df)}")
    print(f"Rows with 0% coverage: {(coverage_df['coverage_pct'] == 0).sum()} / {len(coverage_df)}")
    print(f"\nAverage missing terms per row: {coverage_df['missing_terms'].mean():.2f}")
    print(f"Total train terms checked: {coverage_df['total_train_terms'].sum()}")
    print(f"Total covered terms: {coverage_df['covered_terms'].sum()}")
    print(f"Total missing terms: {coverage_df['missing_terms'].sum()} \n\n")

    # Release the per-aspect frames before the next iteration makes new copies.
    del coverage_df
    del merged_df_copy

############################################################
Filtering by aspect P
Average number of terms per protein: 25.97
Min number of terms for a protein: 0
Max number of terms for a protein: 653
No. of proteins with zero terms: 22340 / 82201 

Average number of terms per protein: 576.08
Min number of terms for a protein: 9
Max number of terms for a protein: 1378
=== Coverage Analysis ===

Overall Statistics:
Average coverage: 96.75%
Median coverage: 100.00%
Min coverage: 0.00%
Max coverage: 100.00%

Rows with 100% coverage: 52533 / 59861
Rows with 0% coverage: 14 / 59861

Average missing terms per row: 1.03
Total train terms checked: 2135163
Total covered terms: 2073482
Total missing terms: 61681 


############################################################
Filtering by aspect C


In [None]:
# Same coverage summary as the cell under "Check coverage", run again on merged_df.
# NOTE(review): the output saved with this cell does not match the output of
# that identical cell, so it was presumably produced with a different kernel
# state - re-run to confirm.
coverage_results = merged_df.apply(check_coverage, axis=1)
coverage_df = pd.DataFrame(coverage_results.tolist())

# Keep the per-row coverage columns side by side with the original data.
analysis_df = pd.concat([merged_df, coverage_df], axis=1)

# Pull out the pieces that several summary lines share.
coverage_pct = coverage_df['coverage_pct']
n_rows = len(coverage_df)
n_full = (coverage_pct == 100).sum()
n_none = (coverage_pct == 0).sum()

print("=== Coverage Analysis ===")
print("\nOverall Statistics:")
print(f"Average coverage: {coverage_pct.mean():.2f}%")
print(f"Median coverage: {coverage_pct.median():.2f}%")
print(f"Min coverage: {coverage_pct.min():.2f}%")
print(f"Max coverage: {coverage_pct.max():.2f}%")
print(f"\nRows with 100% coverage: {n_full} / {n_rows}")
print(f"Rows with 0% coverage: {n_none} / {n_rows}")
print(f"\nAverage missing terms per row: {coverage_df['missing_terms'].mean():.2f}")
print(f"Total train terms checked: {coverage_df['total_train_terms'].sum()}")
print(f"Total covered terms: {coverage_df['covered_terms'].sum()}")
print(f"Total missing terms: {coverage_df['missing_terms'].sum()}")

=== Coverage Analysis ===

Overall Statistics:
Average coverage: 41.31%
Median coverage: 50.00%
Min coverage: 0.00%
Max coverage: 100.00%

Rows with 100% coverage: 7349 / 82201
Rows with 0% coverage: 28145 / 82201

Average missing terms per row: 2.55
Total train terms checked: 467707
Total covered terms: 258290
Total missing terms: 209417


## Helper functions (run these cells first — `check_coverage` is used by the coverage cells above)

In [1]:
import sys
def print_size(data):
    """Print the size reported by ``sys.getsizeof(data)`` in MB, with two decimals."""
    bytes_per_mb = 1024 * 1024
    size_mb = sys.getsizeof(data) / bytes_per_mb
    print(f'{size_mb:.2f} MB')

In [2]:
# Check if all true predictions from train_terms_all are present in terms_predicted
def check_coverage(row):
    """Measure how much of a row's ``train_terms_all`` appears in ``terms_predicted``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'train_terms_all'`` and ``'terms_predicted'``.

    Returns:
        ``None`` when ``train_terms_all`` is ``None`` or an empty list.
        Otherwise a dict with ``total_train_terms``, ``covered_terms``,
        ``missing_terms``, ``coverage_pct`` (0-100) and ``missing_term_list``
        (the training terms absent from the predictions, in arbitrary order).
    """
    true_key = 'train_terms_all'
    reference = row[true_key]

    # Nothing to check: missing value or an empty list.
    if reference is None:
        return None
    if isinstance(reference, list) and not reference:
        return None

    train_set = set(reference)
    pred_set = set(row['terms_predicted'])

    # Training terms that the predictions also contain.
    covered_terms = train_set & pred_set
    n_train = len(train_set)
    n_covered = len(covered_terms)

    # Empty non-list inputs (e.g. an empty set) get here; report 0 rather than dividing by zero.
    if n_train > 0:
        coverage = n_covered / n_train * 100
    else:
        coverage = 0

    return {
        'total_train_terms': n_train,
        'covered_terms': n_covered,
        'missing_terms': n_train - n_covered,
        'coverage_pct': coverage,
        'missing_term_list': list(train_set - covered_terms)
    }

