In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

import cuml
import cudf
from cuml.linear_model import LogisticRegression as cuCML_LogisticRegression
import cupy as cp
from sklearn.multioutput import MultiOutputClassifier
from cuml.model_selection import train_test_split
# Optional: for evaluation
from sklearn.metrics import classification_report

In [2]:
# Report the versions of the dataframe and GPU array libraries in use.
print(f"Polars version: {pl.__version__}")
print(f"CuPy version: {cp.__version__}")

# cuml is imported in the notebook's first cell, not in this one, so the
# ImportError handler below can only fire if an import is put back into the
# try body.
try:
    cuml_version = cuml.__version__
except ImportError as e:
    print(f"cuML could not be imported. Ensure RAPIDS is installed correctly. Error: {e}")
    # Nothing downstream works without cuML, so let the failure propagate.
    raise
else:
    print(f"cuML version: {cuml_version}")
    print(f"cuML version imported successfully.")

Polars version: 1.25.0
CuPy version: 13.6.0
cuML version: 25.02.01
cuML version could be imported successfully.


In [54]:
# Load the training embeddings and the label table from their parquet files.
df_train_features = pl.read_parquet('/kaggle/input/cafa6-protein-labels-features-depth-based-suman/train_protein_features_cc_2.parquet')
df_train_labels = pl.read_parquet('/kaggle/input/cafa6-protein-labels-features-depth-based-suman/train_protein_labels_cc_2.parquet')

# The two frames are converted to arrays separately and then paired purely by
# row position, so fail fast if they do not list the same proteins in the
# same order (otherwise every model would be trained on mismatched labels).
assert df_train_features['protein_accession_id'].equals(
    df_train_labels['protein_accession_id']
), "train features and labels are not row-aligned on protein_accession_id"

print("Shape of training features", df_train_features.shape)
print(df_train_features.head(5))

print("Shape of training labels", df_train_labels.shape)
print(df_train_labels.head(5))

Shape of training features (32763, 2)
shape: (5, 2)
┌──────────────────────┬─────────────────────────────────┐
│ protein_accession_id ┆ protein_embedding               │
│ ---                  ┆ ---                             │
│ str                  ┆ array[f32, 480]                 │
╞══════════════════════╪═════════════════════════════════╡
│ Q01083               ┆ [-0.126587, 0.162964, … -0.013… │
│ Q19286               ┆ [-0.042816, 0.025696, … -0.044… │
│ Q9HE16               ┆ [-0.035828, -0.030594, … 0.007… │
│ Q61315               ┆ [0.054382, -0.115173, … 0.0388… │
│ P97298               ┆ [-0.166748, -0.071289, … 0.007… │
└──────────────────────┴─────────────────────────────────┘
Shape of training labels (32763, 101)
shape: (5, 101)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:001602 ┆ GO:000582 ┆ GO:000573 ┆ … ┆ GO:000177 ┆ GO:004517 ┆ GO:000950 ┆ GO:00324 │
│ ccession_ ┆ 0         ┆ 9         ┆ 7    

In [4]:
try:
    gpu_array = cp.arange(10)
    print(f"CuPy array on this device: {gpu_array.device}")
except cp.cuda.runtime.CUDARuntimeError as e:
    print(f"CuPy device can't be started: {e}")
    print("Make sure your CUDA drivers and cuPy installation are correct.")

CuPy array on this device: <CUDA Device 0>


In [55]:
# Pull the embedding column out of the training frame and convert it to a
# float32 NumPy array on the host.
train_embedding_series = df_train_features.get_column('protein_embedding')
embed_np_array = train_embedding_series.to_numpy().astype(np.float32)
print(f"NumPy array shape: {embed_np_array.shape}")
print(f"NumPy array dtype: {embed_np_array.dtype}")

NumPy array shape: (32763, 480)
NumPy array dtype: float32


In [56]:
# Copy the host embedding matrix into a CuPy array and report where it lives.
embeds_cp_array = cp.asarray(embed_np_array)
print(f"CuPy array shape: {embeds_cp_array.shape}")
print(f"CuPy array dtype: {embeds_cp_array.dtype}")
print(f"CuPy array device: {embeds_cp_array.device}")

CuPy array shape: (32763, 480)
CuPy array dtype: float32
CuPy array device: <CUDA Device 0>


In [57]:
# Every column of the label frame except the protein id is a label column.
label_cols = [col for col in df_train_labels.columns if col != 'protein_accession_id']

# Build the label matrix on the host first, then copy it into a CuPy array.
label_matrix_host = df_train_labels.select(label_cols).to_numpy()
go_terms_cp_array = cp.array(label_matrix_host)
print(f"CuPy array shape: {go_terms_cp_array.shape}")
print(f"CuPy array dtype: {go_terms_cp_array.dtype}")
print(f"CuPy array device: {go_terms_cp_array.device}")

CuPy array shape: (32763, 100)
CuPy array dtype: float64
CuPy array device: <CUDA Device 0>


In [58]:
# Hold out 10% of the rows with a fixed seed; embeddings and labels are split
# together so they stay paired row by row.
split_parts = train_test_split(
    embeds_cp_array,
    go_terms_cp_array,
    test_size=0.1,
    random_state=42,
)
X_train_gpu, X_test_gpu, y_train_gpu, y_test_gpu = split_parts

print("\nShapes of train and test sets(GPU):")
print(f"X_train_gpu: {X_train_gpu.shape}")
print(f"X_test_gpu: {X_test_gpu.shape}")
print(f"y_train_gpu: {y_train_gpu.shape}")
print(f"y_test_gpu: {y_test_gpu.shape}")


Shapes of train and test sets(GPU):
X_train_gpu: (29487, 480)
X_test_gpu: (3276, 480)
y_train_gpu: (29487, 100)
y_test_gpu: (3276, 100)


In [13]:
# Inspection only: show the container type of the training labels after the split.
type(y_train_gpu)

cupy.ndarray

In [16]:
# Load the embeddings of the proteins that predictions are generated for.
test_features_path = '/kaggle/input/cafa6-protein-go-terms-feat-labels/test_protein_features_esm2_480.parquet'
df_test = pl.read_parquet(test_features_path)
print("Shape of test features", df_test.shape)
print(df_test.head(5))

Shape of test features (224309, 2)
shape: (5, 2)
┌──────────────────────┬─────────────────────────────────┐
│ protein_accession_id ┆ embedding_arrays                │
│ ---                  ┆ ---                             │
│ str                  ┆ array[f32, 480]                 │
╞══════════════════════╪═════════════════════════════════╡
│ Q8LX40               ┆ [-0.194824, -0.071106, … -0.33… │
│ Q8MHW5               ┆ [-0.164673, 0.078003, … -0.001… │
│ O00257               ┆ [-0.026779, -0.084534, … -0.02… │
│ A4FUD9               ┆ [-0.067444, -0.020065, … 0.046… │
│ Q9H1K1               ┆ [-0.268066, -0.078613, … -0.04… │
└──────────────────────┴─────────────────────────────────┘


In [17]:
# Host side: convert the test embedding column to a float32 NumPy array.
submission_embedding_series = df_test.get_column('embedding_arrays')
submission_embed_np_array = submission_embedding_series.to_numpy().astype(np.float32)
print(f"NumPy array shape: {submission_embed_np_array.shape}")
print(f"NumPy array dtype: {submission_embed_np_array.dtype}")

# Device side: copy the same matrix into a CuPy array.
submission_embeds_cp_array = cp.asarray(submission_embed_np_array)
print(f"CuPy array shape: {submission_embeds_cp_array.shape}")
print(f"CuPy array dtype: {submission_embeds_cp_array.dtype}")
print(f"CuPy array device: {submission_embeds_cp_array.device}")

NumPy array shape: (224309, 480)
NumPy array dtype: float32
CuPy array shape: (224309, 480)
CuPy array dtype: float32
CuPy array device: <CUDA Device 0>


In [42]:
# Protein ids of the test set, in the same row order as df_test, as a NumPy
# array built from a plain Python list.
submission_protein_ids = df_test.get_column('protein_accession_id').to_list()
prots_for_submission = np.array(submission_protein_ids)
print("prots_for_submission -- ", prots_for_submission.shape)

prots_for_submission --  (224309,)


In [59]:
# A cuML logistic regression is used as the base estimator and wrapped in
# scikit-learn's MultiOutputClassifier to handle the multi-column target.
base_model = cuCML_LogisticRegression(solver='qn', max_iter=1500)
multilabel_model = MultiOutputClassifier(base_model)

print("Training models...")
# The split arrays are copied back to host memory with .get() before being
# handed to the scikit-learn wrapper.
train_features_host = X_train_gpu.get()
train_labels_host = y_train_gpu.get()
multilabel_model.fit(train_features_host, train_labels_host)
print("Training complete.")

# NOTE(review): the hold-out split (X_test_gpu / y_test_gpu) is not used in
# this cell; predictions are produced for the submission proteins only.
submission_features_host = submission_embeds_cp_array.get()
predictions_gpu = multilabel_model.predict_proba(submission_features_host)

Training models...
Training complete.


In [35]:
# Inspection only: check the type of the first entry of the predict_proba result.
type(predictions_gpu[0])

numpy.ndarray

In [60]:
# Take column 1 of each per-label probability array and arrange the results as
# one column per label (rows = proteins).
# NOTE(review): indexing column 1 assumes every per-label array has two
# columns -- confirm no label was constant in the training split.
positive_class_columns = [label_proba[:, 1] for label_proba in predictions_gpu]
prob_positive = np.array(positive_class_columns).T

In [61]:
# Inspection only: display the shape of the stacked probability matrix.
prob_positive.shape

(224309, 100)

In [62]:
# Read the tab-separated IA table and give its two columns working names.
# NOTE(review): read_csv is called with polars' default header handling --
# confirm IA.tsv really starts with a header row, otherwise its first record
# is consumed as column names.
df_weights = pl.read_csv("/kaggle/input/cafa-6-protein-function-prediction/IA.tsv", separator="\t")
df_weights.columns = ['protein_accession_id', 'ia']
print("Shape of IA data", df_weights.shape)

# Label columns are all columns of the training label frame except the id.
label_cols = [col for col in df_train_labels.columns if col != 'protein_accession_id']
print("length of label_cols -- ", len(label_cols))

# Keep only the IA rows whose first column matches one of the label columns
# (that column is named 'protein_accession_id' here but is compared against
# the label column names).
is_label_row = pl.col('protein_accession_id').is_in(label_cols)
df_weights_filtered = df_weights.filter(is_label_row)
print("shape of df_weights_filtered -- ", df_weights_filtered.shape)

labels = df_weights_filtered.height

# Map each retained first-column value to its IA value rounded to 5 decimals.
class_wt_dict = {
    term_id: round(ia_value, 5)
    for term_id, ia_value in df_weights_filtered.iter_rows()
}

Shape of IA data (40121, 2)
length of label_cols --  100
shape of df_weights_filtered --  (100, 2)


In [63]:
# Wrap the probability matrix in a polars frame with one column per label.
# The column count must match the label names; check it up front so a mismatch
# is reported clearly instead of surfacing inside the constructor.
assert prob_positive.shape[1] == len(label_cols), (
    f"expected {len(label_cols)} probability columns, got {prob_positive.shape[1]}"
)
predictions_df_pl = pl.DataFrame(
    prob_positive,      # 2-D NumPy array: one row per protein
    schema=label_cols,  # column names, in the same order as the label matrix
    orient="row",       # be explicit instead of relying on orientation inference
)

In [44]:
# Inspection only: preview the first five rows of the predictions frame.
# NOTE(review): this cell's execution count is lower than that of the cell
# building predictions_df_pl, so the recorded output may come from an earlier
# run -- re-run in order before relying on it.
predictions_df_pl.head(5)

GO:0032991,GO:0036126,GO:1990498,GO:0001673,GO:0005828,GO:0001939,GO:0097556,GO:0062157,GO:0000307,GO:0072534,GO:0000794,GO:0042025,GO:0001674,GO:0120281,GO:0097124,GO:0061202,GO:0101004,GO:0097133,GO:0045095,GO:0097128,GO:0120157,GO:0044754,GO:0097559,GO:0061474,GO:0036186,GO:0042585,GO:0097130,GO:0097129,GO:0002944,GO:0150056,GO:0001940,GO:0060053,GO:0005920,GO:0097134,GO:0045098,GO:0071752,GO:0020020,GO:0001741,GO:0150057,GO:0002945,GO:0097560,GO:0016533,GO:0000806,GO:0150058,GO:0097131,GO:0000805,GO:0097135,GO:0043663,GO:0097558,GO:1990860,GO:0043078,GO:0001740,GO:0097557,GO:0005919,GO:0098875,GO:0097451,GO:0044647,GO:0097426,GO:0097561,GO:0098577,GO:0097125,GO:0097554,GO:0097122,GO:0097123,GO:0032165,GO:0044302,GO:1990722,GO:0097132,GO:1990941,GO:0097555,protein_accession_id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.613784,0.360966,0.006189,0.011017,0.030286,0.004858,0.000759,0.003597,0.026268,0.005308,0.006381,0.001205,0.005451,0.003988,0.005054,0.001229,0.001423,3.9e-05,0.034002,0.002065,0.003126,0.00154,0.000759,0.00112,0.000577,0.010735,0.000158,0.00103,0.000721,0.000724,0.004446,0.002602,0.002141,0.003158,0.002285,0.000931,0.005616,0.003758,0.000724,0.001271,0.000759,0.007263,0.001242,0.004393,0.000724,0.005993,0.000562,0.002072,0.000759,0.004237,0.001533,0.001694,0.000759,0.000335,1.2e-05,0.003687,0.000286,0.0015,0.000759,0.000375,0.000716,0.000759,0.001233,0.001061,0.001181,0.001283,0.000222,0.000251,0.001266,0.000759,"""Q8LX40"""
0.754502,0.067325,0.022163,0.041932,0.012225,0.003869,0.000693,0.005577,0.007717,0.004932,0.038165,0.004811,0.022676,0.003428,0.001131,0.004537,0.004931,0.000157,0.0087,0.002318,0.003117,0.014798,0.000693,0.00497,0.002878,0.017379,0.00102,0.001802,0.000392,0.000364,0.003078,0.000453,0.003602,0.00038,0.002688,0.002756,0.004062,0.00699,0.000364,0.000912,0.000693,0.000377,0.001006,0.000575,0.00032,0.006457,0.001249,0.001596,0.000693,0.000362,0.002036,0.001162,0.000693,0.002089,0.000177,0.000918,0.001122,0.000802,0.000693,0.00059,0.001052,0.000693,0.001038,0.001021,0.000186,0.000423,0.000619,0.000218,0.000159,0.000693,"""Q8MHW5"""
0.536018,0.122169,0.035372,0.036434,0.023584,0.008513,0.000313,0.000424,0.008072,0.004361,0.036454,0.006353,0.005971,0.000982,0.00137,0.002026,0.000484,0.000994,0.018407,0.001081,0.000973,0.022065,0.000313,0.00557,0.009084,0.004356,0.001404,0.001241,0.000198,5.6e-05,0.005491,0.000643,0.001453,0.002023,0.010003,0.000989,0.004647,0.015539,5.6e-05,0.000163,0.000313,0.00058,0.001325,4.4e-05,0.000726,0.011604,0.00302,0.000864,0.000313,8.5e-05,0.001455,0.001854,0.000313,0.00096,0.000747,0.002038,0.000249,0.000245,0.000313,0.003693,0.002056,0.000313,0.0004,0.000991,0.000171,0.000276,0.000342,0.000877,0.000244,0.000313,"""O00257"""
0.567297,0.1105,0.012063,0.035549,0.009052,0.004526,0.000672,0.000414,0.034958,0.001445,0.039457,0.014395,0.00942,0.001683,0.001216,0.00916,0.001292,0.000829,0.028426,0.002882,0.002318,0.008368,0.000672,0.004103,0.001815,0.00697,0.000979,0.001564,0.000425,4.3e-05,0.005996,0.000652,0.001703,0.001969,0.005819,0.000887,0.010744,0.026226,4.3e-05,0.000383,0.000672,0.000458,0.00083,8.3e-05,0.00099,0.010284,0.000602,0.001959,0.000672,0.000241,0.003789,0.004578,0.000672,0.003351,0.00031,0.000336,0.000259,0.000272,0.000672,0.000665,0.00092,0.000672,0.001311,0.00059,0.00021,0.000327,0.000965,0.000545,0.000178,0.000672,"""A4FUD9"""
0.505738,0.068612,0.00802,0.166682,0.026662,0.005181,0.00094,0.002807,0.005288,0.000885,0.062298,0.018665,0.00495,0.000889,0.001123,0.001662,0.000967,5.8e-05,0.045623,0.001611,0.00531,0.002907,0.00094,0.002999,0.002903,0.007513,0.000203,0.00072,0.001263,0.000517,0.004328,0.004222,0.005116,0.001861,0.004391,0.00103,0.002837,0.01504,0.000517,0.000695,0.00094,0.004714,0.000857,0.002039,0.000329,0.013097,0.001232,0.002381,0.00094,0.002861,0.000978,0.001526,0.00094,0.000276,3.1e-05,0.002148,0.000151,0.000751,0.00094,0.000191,0.000534,0.00094,0.000322,0.00071,0.001813,0.000695,0.000184,0.000197,0.001441,0.00094,"""Q9H1K1"""


In [64]:
# Add the submission protein ids as a column next to the per-label
# probabilities; rows are matched by position.
protein_id_column = pl.Series("protein_accession_id", prots_for_submission)
predictions_df_pl = predictions_df_pl.with_columns(protein_id_column)

In [69]:
# Reshape the wide predictions frame (one column per GO term) into long format:
# one row per (protein, GO term, probability).
# DataFrame.melt is deprecated in polars 1.x in favour of unpivot; the
# deprecation warning is hidden here because warnings are globally silenced.
min_probability = 0.35  # predictions at or below this score are dropped

long_format_df_2 = predictions_df_pl.unpivot(
    index=["protein_accession_id"],   # column kept as the row identifier
    on=label_cols,                    # columns turned into rows
    variable_name="go_term",          # holds the former column name (GO term)
    value_name="probability",         # holds the predicted score
)
long_format_df_2 = long_format_df_2.filter(pl.col("probability") > min_probability)
long_format_df_2.shape

(201166, 3)

In [70]:
# Count how many distinct proteins and GO terms survive the probability
# filter; the query reads the long_format_df_2 frame by name.
distinct_counts_query = "select count(distinct(protein_accession_id)), count(distinct(go_term)) from long_format_df_2"
dd.sql(distinct_counts_query).pl()

count(DISTINCT protein_accession_id),count(DISTINCT go_term)
i64,i64
183379,9


In [53]:
# Write the filtered long-format predictions as a header-less TSV
# (protein id, GO term, probability).
# The frame built above is long_format_df_2; `long_format_df` is not defined
# anywhere in the visible notebook, so the original call only worked with a
# leftover kernel variable and fails on Restart & Run All.
long_format_df_2.write_csv("submission_df_cc_concat.tsv", separator="\t", include_header=False)