In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

import cuml
import cudf
from cuml.linear_model import LogisticRegression as cuCML_LogisticRegression
import cupy as cp
from sklearn.multioutput import MultiOutputClassifier
from cuml.model_selection import train_test_split
# Optional: for evaluation
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from cuml.preprocessing import StandardScaler

In [2]:
# Report the versions of the GPU/dataframe libraries this notebook relies on.
for library_label, library_module in (("Polars", pl), ("CuPy", cp)):
    print(f"{library_label} version: {library_module.__version__}")

# cuML is imported in the first cell; here we only confirm it is usable.
try:
    print(f"cuML version: {cuml.__version__}")
    print("cuML version imported successfully.")
except ImportError as import_error:
    print(f"cuML could not be imported. Ensure RAPIDS is installed correctly. Error: {import_error}")
    # Nothing below can run without cuML, so stop execution here
    # by re-raising instead of continuing.
    raise

Polars version: 1.25.0
CuPy version: 13.6.0
cuML version: 25.02.01
cuML version imported successfully.


In [None]:
# Allocate a tiny array on the GPU to confirm that CuPy can reach a CUDA device.
try:
    gpu_array = cp.arange(10)
except cp.cuda.runtime.CUDARuntimeError as cuda_error:
    print(f"CuPy device can't be started: {cuda_error}")
    print("Make sure your CUDA drivers and cuPy installation are correct.")
else:
    print(f"CuPy array on this device: {gpu_array.device}")

## Set up the test data for which predictions will be generated

In [None]:
# Load the test-set features: one row per protein, with its accession id and embedding.
df_test = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/test_protein_features_esm2_480.parquet')
print("Shape of test features", df_test.shape)
print(df_test.head(5))

# Accession ids, kept in row order so they can be re-attached to the predictions later.
prots_for_submission = np.array(df_test.get_column('protein_accession_id').to_list())
print("prots_for_submission -- ", prots_for_submission.shape)

# Host copy of the embeddings as float32.
submission_embed_np_array = df_test.get_column('embedding_arrays').to_numpy().astype(np.float32)
for attribute in ("shape", "dtype"):
    print(f"NumPy array {attribute}: {getattr(submission_embed_np_array, attribute)}")

# Device copy of the same embeddings.
submission_embeds_cp_array = cp.array(submission_embed_np_array)
for attribute in ("shape", "dtype", "device"):
    print(f"CuPy array {attribute}: {getattr(submission_embeds_cp_array, attribute)}")

## Run the training pipeline

In [None]:
# --- Input locations -------------------------------------------------------
weights_path = '/kaggle/input/cafa-6-protein-function-prediction/IA.tsv'
labels_path = '/kaggle/input/cafa6-protein-labels-features-depth-based-suman/train_protein_labels_cc_6.parquet'
features_path = '/kaggle/input/cafa6-protein-labels-features-depth-based-suman/train_protein_features_cc_6.parquet'

# --- Model hyper-parameters ------------------------------------------------
c_value = 0.01                # passed as `C` to the logistic regression
training_iterations = 2500    # passed as `max_iter` to the logistic regression

# --- Output ----------------------------------------------------------------
proba_threshold = 0.75        # only predictions strictly above this are written
submission_filename = 'submission_df_cc_6.tsv'

def training_pipeline(features_path, labels_path, weights_path, training_iterations, c_value, proba_threshold, submission_filename):
    """Train one logistic-regression model per GO term and write thresholded test predictions.

    Steps: load training embeddings and the binary label matrix, drop proteins
    without any positive label, standardise the embeddings, derive per-sample
    weights from the information-accretion (IA) table, fit a
    ``MultiOutputClassifier`` of cuML logistic regressions, score the test
    embeddings and write every (protein, GO term, probability) triple whose
    probability exceeds ``proba_threshold`` to a header-less TSV file.

    Besides its parameters, the function reads two notebook-level globals that
    must already exist: ``submission_embeds_cp_array`` (test embeddings on the
    GPU) and ``prots_for_submission`` (accession ids in the same row order).

    Parameters
    ----------
    features_path : str
        Parquet file with a ``protein_embedding`` column.
    labels_path : str
        Parquet file with a ``protein_accession_id`` column; every other
        column is treated as one GO-term label.
    weights_path : str
        Tab-separated file with two columns: GO term and IA weight.
    training_iterations : int
        ``max_iter`` for each logistic regression.
    c_value : float
        ``C`` for each logistic regression.
    proba_threshold : float
        Predictions with probability <= this value are discarded.
    submission_filename : str
        Destination of the tab-separated output (written without a header).

    Raises
    ------
    ValueError
        If the feature and label tables do not have the same number of rows.
    """
    df_train_features = pl.read_parquet(features_path)
    print("Shape of training features", df_train_features.shape)
    print(df_train_features.head(5))

    df_train_labels = pl.read_parquet(labels_path)
    print("Shape of training labels", df_train_labels.shape)
    print(df_train_labels.head(5))

    label_cols = [col for col in df_train_labels.columns if col != 'protein_accession_id']
    print("length of label_cols -- ", len(label_cols))

    # Read the IA table as header-less and name the columns explicitly.
    # Reading with the default header handling and renaming afterwards would
    # silently drop the first term of a header-less file. Rows whose weight
    # does not parse as a number (e.g. a header line, should one exist) are
    # discarded instead.
    df_weights = (
        pl.read_csv(weights_path, separator="\t", has_header=False, new_columns=['go_term', 'ia'])
        .with_columns(pl.col('ia').cast(pl.Float64, strict=False))
        .filter(pl.col('ia').is_not_null())
    )
    print("Shape of IA data", df_weights.shape)

    df_weights_filtered = df_weights.filter(pl.col('go_term').is_in(label_cols))
    print("shape of df_weights_filtered -- ", df_weights_filtered.shape)

    # GO term -> IA weight (rounded to 5 decimals) for the terms we train on.
    class_wt_dict = {
        go_term: round(ia, 5)
        for go_term, ia in df_weights_filtered.select('go_term', 'ia').iter_rows()
    }

    embed_np_array = df_train_features['protein_embedding'].to_numpy().astype(np.float32)
    print(f"NumPy array shape: {embed_np_array.shape}")
    print(f"NumPy array dtype: {embed_np_array.dtype}")

    embeds_cp_array = cp.array(embed_np_array)
    print(f"CuPy array shape: {embeds_cp_array.shape}")
    print(f"CuPy array dtype: {embeds_cp_array.dtype}")
    print(f"CuPy array device: {embeds_cp_array.device}")

    go_terms_cp_array = cp.array(df_train_labels.select(label_cols).to_numpy())

    # The mask computed from the labels is applied to the embeddings below, so
    # both tables must at least have the same number of rows.
    # NOTE(review): this also assumes both files list proteins in the same
    # order -- confirm against how the parquet files were produced.
    if embeds_cp_array.shape[0] != go_terms_cp_array.shape[0]:
        raise ValueError(
            f"Feature rows ({embeds_cp_array.shape[0]}) and label rows "
            f"({go_terms_cp_array.shape[0]}) differ; cannot align samples."
        )

    # Keep only proteins that have at least one non-zero label.
    valid_rows_mask = (go_terms_cp_array != 0).any(axis=1)
    go_terms_cp_array_final = go_terms_cp_array[valid_rows_mask]
    print(f"CuPy array shape: {go_terms_cp_array_final.shape}")
    print(f"CuPy array dtype: {go_terms_cp_array_final.dtype}")
    print(f"CuPy array device: {go_terms_cp_array_final.device}")

    embeds_cp_array_final = embeds_cp_array[valid_rows_mask]
    print(f"CuPy array shape: {embeds_cp_array_final.shape}")
    print(f"CuPy array dtype: {embeds_cp_array_final.dtype}")
    print(f"CuPy array device: {embeds_cp_array_final.device}")

    scaler = StandardScaler()

    # Fit the scaler on the training data only, then apply the SAME mean/std to
    # the test embeddings. Re-fitting on the test set would put train and test
    # features on different scales.
    embeds_cp_array_final_scaled = scaler.fit_transform(embeds_cp_array_final)
    submission_embeds_cp_array_scaled = scaler.transform(submission_embeds_cp_array)

    print("standard scaling of embeddings completed ....")

    # 1. One weight per label column; terms without an IA entry get weight 0.
    label_weight_vector = cp.array([class_wt_dict.get(goterm, 0.0) for goterm in label_cols])

    # 2. Multiply the binary label matrix by the weight vector.
    # This gives a 1D weight for every sample based on its positive GO terms.
    sample_weights = go_terms_cp_array_final.dot(label_weight_vector)

    # 3. Normalise so that the average sample weight is 1.0.
    sample_weights = sample_weights / cp.mean(sample_weights)

    print("normalisation of weights completed .... ", sample_weights.shape)

    base_model = cuCML_LogisticRegression(solver='qn', max_iter=training_iterations, output_type='numpy', C=c_value, fit_intercept=False)

    multilabel_model = MultiOutputClassifier(base_model)

    print("Training model ...")
    # The scikit-learn wrapper is given host (NumPy) arrays throughout,
    # including the sample weights, so all three inputs live in the same place.
    multilabel_model.fit(
        embeds_cp_array_final_scaled.get(),
        go_terms_cp_array_final.get(),
        sample_weight=cp.asnumpy(sample_weights.astype('float32')),
    )
    print("Training complete ...")

    # predict_proba returns one (n_samples, n_classes) array per label; column 1
    # is taken as the positive-class probability.
    # NOTE(review): this presumes every label has both classes in the training
    # rows -- a label with a single class may not yield a second column.
    per_label_probas = multilabel_model.predict_proba(submission_embeds_cp_array_scaled.get())
    prob_positive = np.transpose([p[:, 1] for p in per_label_probas])
    print("predictions complete ... ", prob_positive.shape)

    # Rows are proteins, columns are GO terms; state the orientation explicitly
    # so Polars never has to infer it from the schema length.
    predictions_df_pl = pl.DataFrame(
        prob_positive,
        schema=label_cols,
        orient="row",
    )

    predictions_df_pl = predictions_df_pl.with_columns(
        pl.Series(name="protein_accession_id", values=prots_for_submission)
    )
    print("finished building the predictions_df_pl ... ", predictions_df_pl.shape)

    # Wide -> long: one row per (protein, GO term). `unpivot` is the Polars 1.x
    # replacement for the deprecated `melt`.
    long_format_df = predictions_df_pl.unpivot(
        on=label_cols,                             # Columns to turn into rows
        index=["protein_accession_id"],            # Column to keep as identifier
        variable_name="go_term",                   # Name for the column containing GO terms
        value_name="probability"                   # Name for the column containing scores
    )
    long_format_df = long_format_df.filter(pl.col("probability") > proba_threshold)
    print("finished building the submission format ... ", long_format_df.shape)
    # DuckDB resolves `long_format_df` by name from the local variables.
    print(dd.sql("select count(distinct(protein_accession_id)), count(distinct(go_term)) from long_format_df").pl())

    long_format_df.write_csv(submission_filename, separator="\t", include_header=False)

## Combining all the submission datasets into one

In [None]:
SUBMISSION_INPUT_DIR = '/kaggle/input/suman-cafa6-submission-df-all'
SUBMISSION_COLUMNS = ['protein', 'GO_Term', 'Probability']
ROOT_TERM_PROBABILITY = 0.999999  # score assigned to each ontology's root term


def load_submission_tsv(file_name):
    """Read one partial submission (protein, GO term, probability) and print its shape.

    The training pipeline writes these files without a header row, so they are
    read with ``has_header=False``. With the default header handling the first
    prediction of every file would be consumed as column names and lost. Rows
    whose probability does not parse as a number (e.g. a header line, should a
    file contain one) are dropped.
    """
    frame = (
        pl.read_csv(
            f"{SUBMISSION_INPUT_DIR}/{file_name}",
            separator="\t",
            has_header=False,
            new_columns=SUBMISSION_COLUMNS,
        )
        .with_columns(pl.col('Probability').cast(pl.Float64, strict=False))
        .filter(pl.col('Probability').is_not_null())
    )
    print("Shape - ", frame.shape)
    return frame


def root_term_rows(submission, root_go_term):
    """Build one row per distinct protein in ``submission`` for the ontology root term."""
    return submission.select("protein").unique().with_columns(
        GO_Term=pl.lit(root_go_term),
        Probability=pl.lit(ROOT_TERM_PROBABILITY),
    )


# --- Cellular component (CC): several partial files plus the root term ------
submission_df_cc_concat = load_submission_tsv('submission_df_cc_concat.tsv')
submission_df_cc_2 = load_submission_tsv('submission_df_cc_2.tsv')
submission_df_cc_3 = load_submission_tsv('submission_df_cc_3.tsv')
submission_df_cc_5 = load_submission_tsv('submission_df_cc_5.tsv')
submission_df_cc_4 = load_submission_tsv('submission_df_cc_4.tsv')
submission_df_cc_6 = load_submission_tsv('submission_df_cc_6.tsv')

submission_df_cc = pl.concat([submission_df_cc_concat, submission_df_cc_6
                          , submission_df_cc_4, submission_df_cc_5, submission_df_cc_3, submission_df_cc_2])
print(submission_df_cc.shape)

new_rows_cc = root_term_rows(submission_df_cc, "GO:0005575")
submission_df_cc_w_root = pl.concat([submission_df_cc, new_rows_cc])
print(submission_df_cc_w_root.shape)

# --- Molecular function (MF) -------------------------------------------------
submission_df_mf = load_submission_tsv('submission_df_mf.tsv')
new_rows_mf = root_term_rows(submission_df_mf, "GO:0003674")
submission_df_mf_w_root = pl.concat([submission_df_mf, new_rows_mf])
print(submission_df_mf_w_root.shape)

# --- Biological process (BP) -------------------------------------------------
submission_df_bp = load_submission_tsv('submission_df_bp.tsv')
new_rows_bp = root_term_rows(submission_df_bp, "GO:0008150")
submission_df_bp_w_root = pl.concat([submission_df_bp, new_rows_bp])
print(submission_df_bp_w_root.shape)

# --- Final submission: all three ontologies stacked --------------------------
submission_df = pl.concat([submission_df_mf_w_root, submission_df_cc_w_root, submission_df_bp_w_root])
print(submission_df.shape)

print(dd.sql("select count(distinct(protein)) as proteins, count(distinct(GO_Term)) as GO_Terms, from submission_df").pl())
submission_df.write_csv("submission.tsv", separator="\t", include_header=False)

## Miscellaneous operations

In [None]:
# The 35 GO terms with the largest IA weight, as a plain Python list.
# NOTE(review): `df_weights_filtered` is only assigned inside
# `training_pipeline` in the visible notebook; this cell needs it at
# notebook level to run on a fresh kernel.
terms_with_max_wts = df_weights_filtered.top_k(35, by="ia")["go_term"].to_list()

In [None]:
# Take the first five label columns as a small working subset and show its size.
label_cols_subset = label_cols[:5]
print(len(label_cols_subset))

In [None]:
# Sum the label matrix over the sample axis to get one total per label column.
counts = go_terms_cp_array_final.sum(axis=0)

# Only this small summary vector is copied back to the host for printing.
print("Positive counts per label:", cp.asnumpy(counts).astype(int))

In [None]:
# True when at least one label column sums to zero over the retained samples.
empty_check = cp.any(go_terms_cp_array_final.sum(axis=0) == 0)
if empty_check:
    print("Warning: One of the 5 selected labels has no positive samples in this subset.")

In [None]:
# Rows of the label matrix are samples and columns are labels, so per-label
# totals come from summing over axis 0. (Summing over axis 1 yields per-sample
# totals instead, which are never zero once empty rows have been filtered out.)
column_sums = cp.sum(go_terms_cp_array_final, axis=0)

# Identify indices of empty labels (columns whose sum is zero)
empty_indices = cp.where(column_sums == 0)[0]
len(empty_indices)

In [None]:
# Hold out 10% of the samples with a fixed seed so the split is repeatable.
X_train_gpu, X_test_gpu, y_train_gpu, y_test_gpu = train_test_split(
    embeds_cp_array,
    go_terms_cp_array,
    test_size=0.1,
    random_state=42,
)

print("\nShapes of train and test sets(GPU):")
for split_name, split_array in (
    ("X_train_gpu", X_train_gpu),
    ("X_test_gpu", X_test_gpu),
    ("y_train_gpu", y_train_gpu),
    ("y_test_gpu", y_test_gpu),
):
    print(f"{split_name}: {split_array.shape}")

In [None]:
# Hyper-parameter search over the per-label logistic regressions.
# NOTE(review): `multilabel_model`, `embeds_cp_array_final` and
# `go_terms_cp_array_final` are only assigned inside `training_pipeline` in the
# visible notebook; this cell needs them at notebook level to run.
from sklearn.metrics import make_scorer, f1_score

# Keys are prefixed with `estimator__` so they are routed to the model wrapped
# by the multi-output classifier. 3 x 2 x 2 = 12 parameter combinations.
# NOTE(review): confirm that the installed cuML version accepts the string
# 'none' as a penalty value.
param_grid = {
    'estimator__C': [0.1, 1.0, 10.0],           # Regularization strength
    'estimator__penalty': ['l2', 'none'],      # Penalty type
    'estimator__tol': [1e-4, 1e-3]             # Tolerance for stopping
}

# Create a scorer that suppresses the warning by setting f1 to 0.0 for zero-division cases
weighted_f1 = make_scorer(f1_score, average='weighted', zero_division=0)

# Initialize GridSearchCV: 5-fold cross-validation scored with the weighted F1
# defined above, run in a single job.
grid_search = GridSearchCV(
    multilabel_model, 
    param_grid, 
    cv=5, 
    scoring=weighted_f1, 
    n_jobs=1
)

print("Training models (Grid Search)...")
# Fit the grid search on host copies of the (unscaled) embeddings and labels.

grid_search.fit(embeds_cp_array_final.get(), go_terms_cp_array_final.get())

print("Training complete (Grid Search).")
# Retrieve the best parameters
print("Optimal Hyperparameters:", grid_search.best_params_)

# Score the (unscaled) test embeddings with the best estimator found.
predictions_gpu = grid_search.predict_proba(submission_embeds_cp_array.get())

In [100]:
# Persist the long-format predictions as parquet.
# NOTE(review): `long_format_df` is only assigned inside `training_pipeline`
# in the visible notebook; this cell needs it at notebook level to run.
long_format_df.write_parquet("submission_df_cc_4.parquet")

In [96]:
import gzip

file_path = 'submission_df_cc_4.tsv.gz'

# Stream the header-less, tab-separated predictions through a gzip writer
# opened in binary write mode.
# NOTE(review): the recorded run of this cell ended with an OSError whose
# message was not captured; the cause cannot be determined from the notebook.
with gzip.open(file_path, mode='wb') as gzip_handle:
    long_format_df.write_csv(gzip_handle, separator="\t", include_header=False)

OSError: 