In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

import cuml
import cudf
from cuml.linear_model import LogisticRegression as cuCML_LogisticRegression
import cupy as cp
from sklearn.multioutput import MultiOutputClassifier
from cuml.model_selection import train_test_split
# Optional: for evaluation
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
print(f"Polars version: {pl.__version__}")
print(f"CuPy version: {cp.__version__}")

try:
    #import cuml
    print(f"cuML version: {cuml.__version__}")
    print(f"cuML version imported successfully.")
    
except ImportError as e:
    print(f"cuML could not be imported. Ensure RAPIDS is installed correctly. Error: {e}")
    # If Cuml cannot be imported, the rest of the notebook will not work.
    # In this case, it may make sense to stop Execution.
    raise

Polars version: 1.25.0
CuPy version: 13.6.0
cuML version: 25.02.01
cuML version imported successfully.


In [21]:
df_train_features = pl.read_parquet('/kaggle/input/cafa6-protein-labels-features-depth-based-suman/train_protein_features_cc_6.parquet')
print("Shape of training features", df_train_features.shape)
print(df_train_features.head(5))

df_train_labels = pl.read_parquet('/kaggle/input/cafa6-protein-labels-features-depth-based-suman/train_protein_labels_cc_6.parquet')
print("Shape of training labels", df_train_labels.shape)
print(df_train_labels.head(5))

label_cols = list(filter(lambda x: x != 'protein_accession_id', df_train_labels.columns))
print("length of label_cols -- ", len(label_cols))

df_weights = pl.read_csv("/kaggle/input/cafa-6-protein-function-prediction/IA.tsv", separator="\t")
df_weights.columns = ['go_term', 'ia']
print("Shape of IA data", df_weights.shape)

df_weights_filtered = df_weights.filter(pl.col('go_term').is_in(label_cols))
print("shape of df_weights_filtered -- ", df_weights_filtered.shape)

labels = df_weights_filtered.shape[0]
class_wt_dict = {}

for i in range(labels):
    class_wt_dict[df_weights_filtered.item(i,0)] = round(df_weights_filtered.item(i,1),5)

Shape of training features (3470, 2)
shape: (5, 2)
┌──────────────────────┬─────────────────────────────────┐
│ protein_accession_id ┆ protein_embedding               │
│ ---                  ┆ ---                             │
│ str                  ┆ array[f32, 480]                 │
╞══════════════════════╪═════════════════════════════════╡
│ P42528               ┆ [-0.203857, -0.016296, … 0.094… │
│ Q8TBZ6               ┆ [-0.162231, 0.156372, … -0.163… │
│ O43166               ┆ [-0.025772, -0.051117, … 0.108… │
│ E9PVX6               ┆ [-0.033752, -0.024628, … -0.10… │
│ O01479               ┆ [-0.146973, 0.11853, … -0.0066… │
└──────────────────────┴─────────────────────────────────┘
Shape of training labels (3470, 134)
shape: (5, 134)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:001563 ┆ GO:007268 ┆ GO:000587 ┆ … ┆ GO:003012 ┆ GO:003643 ┆ GO:003311 ┆ GO:00466 │
│ ccession_ ┆ 0         ┆ 6         ┆ 4      

In [21]:
terms_with_max_wts = (
    df_weights_filtered.top_k(35, by="ia")
    .get_column("go_term")
    .to_list()
)

In [4]:
try:
    gpu_array = cp.arange(10)
    print(f"CuPy array on this device: {gpu_array.device}")
except cp.cuda.runtime.CUDARuntimeError as e:
    print(f"CuPy device can't be started: {e}")
    print("Make sure your CUDA drivers and cuPy installation are correct.")

CuPy array on this device: <CUDA Device 0>


## Setup the test data on which predictions need to be generated

In [5]:
df_test = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/test_protein_features_esm2_480.parquet')
print("Shape of test features", df_test.shape)
print(df_test.head(5))

prots_for_submission = np.array(pl.Series(df_test.select(pl.col('protein_accession_id'))).to_list())
print("prots_for_submission -- ", prots_for_submission.shape)

submission_embed_np_array = df_test['embedding_arrays'].to_numpy().astype(np.float32)
print(f"NumPy array shape: {submission_embed_np_array.shape}")
print(f"NumPy array dtype: {submission_embed_np_array.dtype}")

submission_embeds_cp_array = cp.array(submission_embed_np_array)
print(f"CuPy array shape: {submission_embeds_cp_array.shape}")
print(f"CuPy array dtype: {submission_embeds_cp_array.dtype}")
print(f"CuPy array device: {submission_embeds_cp_array.device}")

Shape of test features (224309, 2)
shape: (5, 2)
┌──────────────────────┬─────────────────────────────────┐
│ protein_accession_id ┆ embedding_arrays                │
│ ---                  ┆ ---                             │
│ str                  ┆ array[f32, 480]                 │
╞══════════════════════╪═════════════════════════════════╡
│ Q8LX40               ┆ [-0.194824, -0.071106, … -0.33… │
│ Q8MHW5               ┆ [-0.164673, 0.078003, … -0.001… │
│ O00257               ┆ [-0.026779, -0.084534, … -0.02… │
│ A4FUD9               ┆ [-0.067444, -0.020065, … 0.046… │
│ Q9H1K1               ┆ [-0.268066, -0.078613, … -0.04… │
└──────────────────────┴─────────────────────────────────┘
prots_for_submission --  (224309,)
NumPy array shape: (224309, 480)
NumPy array dtype: float32
CuPy array shape: (224309, 480)
CuPy array dtype: float32
CuPy array device: <CUDA Device 0>


## Setup the training pipeline

In [22]:
embed_np_array = df_train_features['protein_embedding'].to_numpy().astype(np.float32)
print(f"NumPy array shape: {embed_np_array.shape}")
print(f"NumPy array dtype: {embed_np_array.dtype}")

NumPy array shape: (3470, 480)
NumPy array dtype: float32


In [23]:
embeds_cp_array = cp.array(embed_np_array)
print(f"CuPy array shape: {embeds_cp_array.shape}")
print(f"CuPy array dtype: {embeds_cp_array.dtype}")
print(f"CuPy array device: {embeds_cp_array.device}")

CuPy array shape: (3470, 480)
CuPy array dtype: float32
CuPy array device: <CUDA Device 0>


### Divide the GO Terms further to reduce the number of classes to predict

In [8]:
label_cols_subset = label_cols[0:5]
print(len(label_cols_subset))

5


In [24]:
go_terms_cp_array = cp.array(df_train_labels.select(label_cols).to_numpy())
valid_rows_mask = (go_terms_cp_array != 0).any(axis=1)
go_terms_cp_array_final = go_terms_cp_array[valid_rows_mask]
print(f"CuPy array shape: {go_terms_cp_array_final.shape}")
print(f"CuPy array dtype: {go_terms_cp_array_final.dtype}")
print(f"CuPy array device: {go_terms_cp_array_final.device}")

CuPy array shape: (3470, 133)
CuPy array dtype: float64
CuPy array device: <CUDA Device 0>


In [33]:
counts = cp.sum(go_terms_cp_array_final, axis=0)

# Transfer only the small summary back to CPU for printing
print("Positive counts per label:", counts.get().astype(int))

Positive counts per label: [1222   15    1    7   66  195   16    7   23  111   10    2    2   28
    9    7   41   40    2    2    1    2    6    2    5    1   10    3
    6   11   22   87   24    3   22    1    2   10    4    9    2   13
    7    2    2    3    4    3    4    2    6    7    4    1    2    3
    4    3    1    1    1    2    2    1    1    2    2    1    1    1]


In [34]:
empty_check = (go_terms_cp_array_final.sum(axis=0) == 0).any()
if empty_check:
    print("Warning: One of the 5 selected labels has no positive samples in this subset.")

In [25]:
embeds_cp_array_final = embeds_cp_array[valid_rows_mask]
print(f"CuPy array shape: {embeds_cp_array_final.shape}")
print(f"CuPy array dtype: {embeds_cp_array_final.dtype}")
print(f"CuPy array device: {embeds_cp_array_final.device}")

CuPy array shape: (3470, 480)
CuPy array dtype: float32
CuPy array device: <CUDA Device 0>


In [21]:
column_sums = np.sum(go_terms_cp_array_final, axis=1)

# Identify indices of empty labels (where sum is zero)
empty_indices = np.where(column_sums == 0)[0]
len(empty_indices)

0

In [14]:
empty_indices[702]

array(2011)

In [15]:
go_terms_cp_array[2011]

array([0., 0., 0., 0., 0.])

In [38]:
df_train_labels.row(index=35)

('O01427',
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0)

In [58]:
X_train_gpu, X_test_gpu, y_train_gpu, y_test_gpu = train_test_split(
    embeds_cp_array, go_terms_cp_array, test_size=0.1, random_state=42
)

print("\nShapes of train and test sets(GPU):")
print(f"X_train_gpu: {X_train_gpu.shape}")
print(f"X_test_gpu: {X_test_gpu.shape}")
print(f"y_train_gpu: {y_train_gpu.shape}")
print(f"y_test_gpu: {y_test_gpu.shape}")


Shapes of train and test sets(GPU):
X_train_gpu: (29487, 480)
X_test_gpu: (3276, 480)
y_train_gpu: (29487, 100)
y_test_gpu: (3276, 100)


In [26]:
# 1. Create a weight vector for the labels in the correct column order
# (Assuming Y_train columns match mlb.classes_)
label_weight_vector = cp.array([class_wt_dict.get(goterm, 0) for goterm in label_cols])

# 2. Multiply the binary label matrix by the weight vector
# This gives a 1D weight for every sample based on its positive GO terms
sample_weights = go_terms_cp_array_final.dot(label_weight_vector)

# 3. Normalize (Best practice: ensures the average weight is 1.0)
sample_weights = sample_weights / np.mean(sample_weights)

In [27]:
sample_weights.shape

(3470,)

In [28]:
base_model = cuCML_LogisticRegression(solver='qn', max_iter=2500, output_type='numpy')

multilabel_model = MultiOutputClassifier(base_model)

print("Training model ...")
multilabel_model.fit(embeds_cp_array_final.get(), go_terms_cp_array_final.get(), sample_weight=sample_weights.astype('float32'))
print("Training complete ...")

predictions_gpu = multilabel_model.predict_proba(submission_embeds_cp_array.get())
print("predictions complete ...")

Training model ...
Training complete ...
predictions complete ...


In [26]:
from sklearn.metrics import make_scorer, f1_score

param_grid = {
    'estimator__C': [0.1, 1.0, 10.0],           # Regularization strength
    'estimator__penalty': ['l2', 'none'],      # Penalty type
    'estimator__tol': [1e-4, 1e-3]             # Tolerance for stopping
}

# Create a scorer that suppresses the warning by setting f1 to 0.0 for zero-division cases
weighted_f1 = make_scorer(f1_score, average='weighted', zero_division=0)

# 3. Initialize GridSearchCV
# Use 'f1_macro' or 'f1_weighted' as GO terms are often highly imbalanced
grid_search = GridSearchCV(
    multilabel_model, 
    param_grid, 
    cv=5, 
    scoring=weighted_f1, 
    n_jobs=1
)

print("Training models (Grid Search)...")
# 4. Fit the grid search

grid_search.fit(embeds_cp_array_final.get(), go_terms_cp_array_final.get())

print("Training complete (Grid Search).")
# 5. Retrieve the best parameters
print("Optimal Hyperparameters:", grid_search.best_params_)

predictions_gpu = grid_search.predict_proba(submission_embeds_cp_array.get())

Training models (Grid Search)...


TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'

In [41]:
type(predictions_gpu[0])

numpy.ndarray

In [29]:
prob_positive = np.transpose([p[:, 1] for p in predictions_gpu])

In [30]:
prob_positive.shape

(224309, 133)

In [31]:
predictions_df_pl = pl.DataFrame(
    prob_positive, # Ensure data is a numpy array
    schema=label_cols # Assign the column names
)

In [32]:
predictions_df_pl.head(5)

GO:0015630,GO:0072686,GO:0005874,GO:0005918,GO:0043073,GO:0000329,GO:0045111,GO:0035267,GO:0005923,GO:0097513,GO:0097013,GO:0005765,GO:0005884,GO:0015629,GO:0000228,GO:0000803,GO:0046540,GO:0034705,GO:1903143,GO:1990467,GO:0005876,GO:0031472,GO:0005700,GO:0014704,GO:0032010,GO:0030670,GO:0072687,GO:0030669,GO:0097729,GO:0034704,GO:1990468,GO:0062040,GO:0017146,GO:0005701,GO:0032281,GO:0012511,GO:0005882,…,GO:0072517,GO:0097471,GO:0036437,GO:0005880,GO:0071914,GO:0020036,GO:0032176,GO:0005917,GO:0098998,GO:0097457,GO:0071751,GO:0071750,GO:0016008,GO:0072493,GO:0035062,GO:0048555,GO:0044164,GO:0097233,GO:0005633,GO:0099012,GO:0019034,GO:0032173,GO:0097234,GO:0097728,GO:0097691,GO:1990357,GO:0005767,GO:0106033,GO:0044308,GO:0044174,GO:0150017,GO:0038038,GO:0048353,GO:0030128,GO:0036436,GO:0033115,GO:0046611
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.005728,0.028648,0.000854,0.00786,0.008582,1.5274e-09,0.063453,0.006694,0.00134,0.001867,0.001142,0.001336,0.009393,0.592394,0.000897,0.002687,0.000844,0.004955,1.6e-05,0.000111,0.001262,0.001773,0.011408,0.019702,0.001266,0.006367,0.00805,0.000915,0.00892,0.005958,0.000166,9.4021e-10,0.014999,0.000938,0.007719,0.004408,0.003878,…,1.1353e-08,1.1e-05,0.00233,0.000219,0.003234,0.000326,0.00056,0.000605,1.1258e-09,0.07561,0.000182,0.000303,0.001919,0.00024,0.000776,0.000526,0.002178,0.000654,0.000601,0.005551,1.2545e-08,0.002216,1.1281e-09,0.003321,0.004309,0.001146,0.001912,0.000375,0.000851,0.00045,0.000433,0.000653,0.003831,5.0465e-09,0.001552,1.7318e-07,0.002167
0.028278,0.04296,0.017862,0.00569,0.102892,1.5274e-09,0.037893,0.000812,0.021326,0.000296,0.001359,0.001532,0.012642,0.158843,0.023605,0.002626,0.000118,0.004297,0.000107,0.000388,0.017321,0.001682,0.036506,0.064122,0.000656,0.002205,0.072054,0.000181,0.006565,0.008522,0.000398,9.4017e-10,0.004146,0.001351,0.024842,0.002283,0.010073,…,1.135e-08,6.5e-05,0.000267,0.006836,0.001905,0.003006,0.000315,0.000958,1.1259e-09,0.000638,0.000336,0.000272,8.8e-05,0.000294,0.000207,0.104093,0.002225,0.000528,5.2e-05,0.000885,1.2542e-08,0.000184,1.1282e-09,0.002353,0.000415,0.013099,7.1e-05,0.000153,0.001236,0.00016,0.000106,0.000997,0.000135,5.0461e-09,0.000223,1.7283e-07,7.2e-05
0.023011,0.127236,0.017261,0.020811,0.002402,1.5275e-09,0.009838,0.000733,0.03114,0.00037,0.001282,0.001782,0.014211,0.182234,0.031039,0.002631,6e-05,0.005484,0.000184,0.000522,0.01716,0.001741,0.057817,0.012132,0.00401,0.001315,0.073434,0.000238,0.003338,0.027259,0.00052,9.4037e-10,0.006232,2.9e-05,0.005723,0.04057,0.015088,…,1.1352e-08,0.000103,0.000853,0.006338,0.000133,0.000337,0.001015,0.000101,1.1258e-09,0.006177,0.000343,0.000148,4.5e-05,0.000329,0.002702,0.000763,0.00034,0.000304,0.000138,4e-06,1.2544e-08,0.000181,1.1282e-09,0.000101,0.00436,0.000914,2.2e-05,0.000373,0.001554,0.004389,8.4e-05,0.00028,0.000196,5.0465e-09,0.000559,1.7345e-07,7e-06
0.092973,0.124867,0.01819,0.010311,0.010722,1.5275e-09,0.073476,0.000635,0.012531,0.000421,0.000928,0.002909,0.011382,0.099619,0.028372,0.000431,0.0001,0.004965,0.000149,0.000441,0.048412,0.001048,0.064341,0.014727,0.004597,0.004122,0.022663,0.000119,0.009175,0.008892,0.000444,9.4033e-10,0.010895,0.000735,0.012932,0.016649,0.006277,…,1.1353e-08,0.006329,0.000443,0.002483,0.000863,0.001843,0.000338,0.000211,1.1258e-09,8.3e-05,0.000343,0.000231,9.1e-05,0.000238,0.001842,0.000871,0.001759,0.000319,4.1e-05,3.5e-05,1.2545e-08,0.000172,1.1282e-09,0.004579,0.002559,0.01562,4e-05,0.00252,0.003479,0.000445,0.000107,0.000607,0.000598,5.0463e-09,0.00033,1.7312e-07,3e-05
0.025881,0.064498,0.013481,0.003462,0.014724,1.5274e-09,0.018489,0.001447,0.015242,0.003027,0.000479,0.005564,0.060663,0.301688,0.06713,0.000177,0.000943,0.009154,1.3e-05,0.000152,0.005334,0.003038,0.004105,0.014698,0.04478,0.001653,0.022753,0.001332,0.003945,0.003899,0.000224,9.4017e-10,0.007799,0.000559,0.006759,0.017079,0.054745,…,1.1353e-08,1.8e-05,0.000726,0.011827,0.009933,0.000319,0.000955,0.00022,1.1258e-09,0.000556,0.000123,0.000145,0.001321,0.000529,0.000367,5.5e-05,0.000166,0.001471,0.00078,0.000626,1.2545e-08,0.001309,1.1282e-09,0.000838,0.001359,0.0033,0.001513,0.000492,0.000336,0.00079,0.000462,0.001518,0.000336,5.0463e-09,0.00046,1.7297e-07,0.002366


In [33]:
predictions_df_pl = predictions_df_pl.with_columns(
    pl.Series(name="protein_accession_id", values=prots_for_submission)
)

In [34]:
long_format_df = predictions_df_pl.melt(
    id_vars=["protein_accession_id"],          # Column to keep as identifier
    value_vars=label_cols,                  # Columns to melt into rows
    variable_name="go_term",                   # Name for the column containing GO terms
    value_name="probability"                   # Name for the column containing scores
)
long_format_df = long_format_df.filter(pl.col("probability") > 0.55)
long_format_df.shape

(8320, 3)

In [35]:
dd.sql("select count(distinct(protein_accession_id)), count(distinct(go_term)) from long_format_df").pl()

count(DISTINCT protein_accession_id),count(DISTINCT go_term)
i64,i64
7948,54


In [36]:
long_format_df.write_csv("submission_df_cc_6.tsv", separator="\t", include_header=False)

In [37]:
submission_df_cc_concat = pl.read_csv('/kaggle/input/suman-cafa6-submission-df-all/submission_df_cc_concat.tsv', separator="\t")
submission_df_cc_concat.columns = ['protein','GO_Term','Probability']
print("Shape - ", submission_df_cc_concat.shape)

submission_df_cc_2 = pl.read_csv('/kaggle/input/suman-cafa6-submission-df-all/submission_df_cc_2.tsv', separator="\t")
submission_df_cc_2.columns = ['protein','GO_Term','Probability']
print("Shape - ", submission_df_cc_2.shape)

submission_df_cc_3 = pl.read_csv('/kaggle/input/suman-cafa6-submission-df-all/submission_df_cc_3.tsv', separator="\t")
submission_df_cc_3.columns = ['protein','GO_Term','Probability']
print("Shape - ", submission_df_cc_3.shape)

submission_df_cc_5 = pl.read_csv('/kaggle/input/suman-cafa6-submission-df-all/submission_df_cc_5.tsv', separator="\t")
submission_df_cc_5.columns = ['protein','GO_Term','Probability']
print("Shape - ", submission_df_cc_5.shape)

submission_df_cc_4 = pl.read_csv('/kaggle/input/suman-cafa6-submission-df-all/submission_df_cc_4.tsv', separator="\t")
submission_df_cc_4.columns = ['protein','GO_Term','Probability']
print("Shape - ", submission_df_cc_4.shape)

submission_df_cc_6 = pl.read_csv('/kaggle/input/suman-cafa6-submission-df-all/submission_df_cc_6.tsv', separator="\t")
submission_df_cc_6.columns = ['protein','GO_Term','Probability']
print("Shape - ", submission_df_cc_6.shape)

submission_df_mf = pl.read_csv('/kaggle/input/suman-cafa6-submission-df-all/submission_df_mf.tsv', separator="\t")
submission_df_mf.columns = ['protein','GO_Term','Probability']
print("Shape - ", submission_df_mf.shape)

submission_df_bp = pl.read_csv('/kaggle/input/suman-cafa6-submission-df-all/submission_df_bp.tsv', separator="\t")
submission_df_bp.columns = ['protein','GO_Term','Probability']
print("Shape - ", submission_df_bp.shape)

Shape -  (207897, 3)
Shape -  (11524, 3)
Shape -  (3282, 3)
Shape -  (134352, 3)
Shape -  (3521, 3)
Shape -  (8319, 3)
Shape -  (144015, 3)
Shape -  (23260, 3)


In [38]:
submission_df = pl.concat([submission_df_mf, submission_df_cc_concat
                           , submission_df_bp, submission_df_cc_6
                          , submission_df_cc_4, submission_df_cc_5, submission_df_cc_3
                          , submission_df_cc_2])
submission_df.shape

(536170, 3)

In [39]:
dd.sql("select count(distinct(protein)) as proteins, count(distinct(GO_Term)) as GO_Terms, \
from submission_df").pl()

proteins,GO_Terms
i64,i64
222502,3067


In [40]:
submission_df.write_csv("submission.tsv", separator="\t", include_header=False)