In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [2]:
import cuml
import cudf
from cuml.linear_model import LogisticRegression as cuCML_LogisticRegression
import cupy as cp
from sklearn.multioutput import MultiOutputClassifier
from cuml.model_selection import train_test_split
# Optional: for evaluation
from sklearn.metrics import classification_report

In [3]:
df_train_features_cc_concat = pl.read_parquet('/kaggle/input/cafa6-protein-labels-features-depth-based-suman/cafa6_protein_labels_features_depth_based/train_protein_features_cc_concat.parquet')
print("Shape of CC concat training features", df_train_features_cc_concat.shape)
print(df_train_features_cc_concat.head(5))

df_train_labels_cc_concat = pl.read_parquet('/kaggle/input/cafa6-protein-labels-features-depth-based-suman/cafa6_protein_labels_features_depth_based/train_protein_labels_cc_concat.parquet')
print("Shape of CC concat training labels", df_train_labels_cc_concat.shape)
print(df_train_labels_cc_concat.head(5))

Shape of CC concat training features (1800, 2)
shape: (5, 2)
┌──────────────────────┬─────────────────────────────────┐
│ protein_accession_id ┆ protein_embedding               │
│ ---                  ┆ ---                             │
│ str                  ┆ array[f32, 480]                 │
╞══════════════════════╪═════════════════════════════════╡
│ F4HRV8               ┆ [-0.079529, 0.089905, … -0.061… │
│ Q9JIL8               ┆ [0.015381, -0.083191, … -0.033… │
│ Q8IYW5               ┆ [-0.179932, 0.081177, … -0.133… │
│ Q9JIX5               ┆ [-0.013557, -0.078125, … -0.03… │
│ P43601               ┆ [-0.094849, 0.158936, … 0.0582… │
└──────────────────────┴─────────────────────────────────┘
Shape of CC concat training labels (1800, 11)
shape: (5, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:003299 ┆ GO:003612 ┆ GO:000167 ┆ … ┆ GO:000079 ┆ GO:000167 ┆ GO:004509 ┆ GO:00200 │
│ ccession_ ┆ 1         ┆ 6

In [10]:
label_cols = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_cc_concat.columns))
y = df_train_labels_cc_concat[label_cols]

In [11]:
type(y)

polars.dataframe.frame.DataFrame

In [5]:
embeds = df_train_features_cc_concat['protein_embedding']

# 1. Get a flattened view of all embedding elements (zero-copy)
# The result is a 1D cudf Series containing all individual float values
flattened_embeddings = embeds.list.leaves 

# 2. Convert the flattened series to a 1D CuPy array
# This Series *should* implement the cuda array interface correctly
flattened_cupy_array = flattened_embeddings.to_cupy() 

# 3. Reshape the 1D array into the desired 2D matrix [n_samples, n_features]
n_samples = len(df_train_features_cc_concat)
# We calculate the number of features (embedding dimension) automatically
n_features = flattened_cupy_array.shape[0] // n_samples 

# The fixed X data, ready for cuML
X_fixed_cupy = flattened_cupy_array.reshape(n_samples, n_features)

print(f"Reshaped X shape: {X_fixed_cupy.shape}")

Reshaped X shape: (1800, 480)


In [6]:
# Define the base cuML Logistic Regression model
# Use 'saga' or 'qn' solver as they are supported in cuML
base_model = cuCML_LogisticRegression(solver='qn', max_iter=1000)

# Wrap the cuML model in the scikit-learn MultiOutputClassifier
multilabel_model = MultiOutputClassifier(base_model)

In [9]:
y_data = cp.asarray(y.values) 

print(f"X data type: {type(X_fixed_cupy)}")
print(f"y data type: {type(y_data)}")

from sklearn.model_selection import train_test_split as sk_train_test_split

RANDOM_STATE = 42
TEST_SIZE = 0.2 

# 1. Explicitly move data to CPU (NumPy arrays) using .get()
# This bypasses the "implicit conversion not allowed" check
X_cpu = X_fixed_cupy.get() 
y_cpu = y_data.get()

print(f"X data type on CPU: {type(X_cpu)}") # Should be numpy.ndarray
print(f"y data type on CPU: {type(y_cpu)}") # Should be numpy.ndarray

# 2. Perform the split using the CPU-based Scikit-learn function
X_train_cpu, X_test_cpu, y_train_cpu, y_test_cpu = sk_train_test_split(
    X_cpu, 
    y_cpu, 
    test_size=TEST_SIZE, 
    random_state=RANDOM_STATE
)

# 3. Move the resulting split data back to the GPU immediately
X_train = cp.asarray(X_train_cpu)
X_test = cp.asarray(X_test_cpu)
y_train = cp.asarray(y_train_cpu)
y_test = cp.asarray(y_test_cpu)

print("Shapes after CPU split and GPU transfer:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X data type: <class 'cupy.ndarray'>
y data type: <class 'cupy.ndarray'>
X data type on CPU: <class 'numpy.ndarray'>
y data type on CPU: <class 'numpy.ndarray'>
Shapes after CPU split and GPU transfer:
X_train shape: (1440, 480)
y_train shape: (1440, 10)
X_test shape: (360, 480)
y_test shape: (360, 10)


In [None]:
type(y_train)

In [12]:
print("Training models...")
multilabel_model.fit(X_fixed_cupy, y)
print("Training complete.")

Training models...
Training complete.


In [13]:
df_test = cudf.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/test_protein_features_esm2_480.parquet')
print("Shape of test features", df_test.shape)
print(df_test.head(5))

Shape of test features (224309, 2)
  protein_accession_id                                   embedding_arrays
0               Q8LX40  [-0.19482421875, -0.07110595703125, 0.11425781...
1               Q8MHW5  [-0.1646728515625, 0.0780029296875, 0.14001464...
2               O00257  [-0.0267791748046875, -0.08453369140625, 0.147...
3               A4FUD9  [-0.06744384765625, -0.0200653076171875, 0.071...
4               Q9H1K1  [-0.26806640625, -0.07861328125, 0.07849121093...


In [14]:
test_embeds = df_test['embedding_arrays']

# 1. Get a flattened view of all embedding elements (zero-copy)
# The result is a 1D cudf Series containing all individual float values
test_flattened_embeddings = test_embeds.list.leaves 

# 2. Convert the flattened series to a 1D CuPy array
# This Series *should* implement the cuda array interface correctly
test_flattened_cupy_array = test_flattened_embeddings.to_cupy() 

# 3. Reshape the 1D array into the desired 2D matrix [n_samples, n_features]
n_samples_test = len(df_test)
# We calculate the number of features (embedding dimension) automatically
n_features_test = test_flattened_cupy_array.shape[0] // n_samples_test 

# The fixed X data, ready for cuML
X_test_fixed_cupy = test_flattened_cupy_array.reshape(n_samples_test, n_features_test)

print(f"Reshaped X shape: {X_test_fixed_cupy.shape}")

Reshaped X shape: (224309, 480)


In [15]:
predictions_gpu = multilabel_model.predict(X_test_fixed_cupy)

TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.

In [4]:
pl.Config(fmt_str_lengths=1000)
pl.Config.set_tbl_rows(1000)

polars.config.Config

In [None]:
df_train_features_cc = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_features_cc.parquet')
print("Shape of CC training features", df_train_features_cc.shape)
print(df_train_features_cc.head(5))

df_train_labels_cc = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_labels_cc.parquet')
print("Shape of CC training labels", df_train_labels_cc.shape)
print(df_train_labels_cc.head(5))

df_train_features_mf = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_features_mf.parquet')
print("Shape of MF training features", df_train_features_mf.shape)
print(df_train_features_mf.head(5))

df_train_labels_mf = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_labels_mf.parquet')
print("Shape of MF training labels", df_train_labels_mf.shape)
print(df_train_labels_mf.head(5))

df_train_features_bp = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_features_bp.parquet')
print("Shape of BP training features", df_train_features_bp.shape)
print(df_train_features_bp.head(5))

df_train_labels_bp = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_labels_bp.parquet')
print("Shape of BP training labels", df_train_labels_bp.shape)
print(df_train_labels_bp.head(5))

In [5]:
df_test_features = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/test_protein_features_esm2_480.parquet')
print("Shape of test data", df_test_features.shape)

Shape of test data (224309, 2)


In [6]:
df_weights = pl.read_csv("/kaggle/input/cafa-6-protein-function-prediction/IA.tsv", separator="\t")
df_weights.columns = ['protein_accession_id', 'ia']
print("Shape of IA data", df_weights.shape)

label_cols_cc_concat = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_cc_concat.columns))
print(len(label_cols_cc_concat))

df_weights_cc_concat = df_weights.filter(pl.col('protein_accession_id').is_in(label_cols_cc_concat))
print(df_weights_cc_concat.shape)

Shape of IA data (40121, 2)
10
(10, 2)


In [7]:
labels = df_weights_cc_concat.shape[0]
class_wt_dict = {}

for i in range(labels):
    class_wt_dict[df_weights_cc_concat.item(i,0)] = round(df_weights_cc_concat.item(i,1),5)

In [8]:
class_wt_dict

{'GO:0000307': 1.09672,
 'GO:0000794': 0.12951,
 'GO:0001673': 0.59585,
 'GO:0001674': 1.80735,
 'GO:0005828': 2.10753,
 'GO:0020020': 1.21412,
 'GO:0032991': 2.16578,
 'GO:0036126': 0.24211,
 'GO:0045095': 1.65208,
 'GO:1990498': 1.08246}

In [10]:
def reorder_df(input_wt_df: pl.DataFrame, input_label_df: pl.DataFrame):
    label_wt_cols = pl.Series(input_wt_df.select(pl.col('protein_accession_id'))).to_list()
    label_wts = pl.Series(input_wt_df.select(pl.col('ia'))).to_list()
    all_label_cols = ['protein_accession_id'] + label_wt_cols
    input_label_df_reordered = input_label_df.select(all_label_cols)
    print(input_label_df_reordered.head(5))
    return input_label_df_reordered, label_wts

In [11]:
df_train_labels_cc_concat_reordered, label_wts_cc_concat = reorder_df(df_weights_cc_concat, df_train_labels_cc_concat)

shape: (5, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:000030 ┆ GO:000079 ┆ GO:000167 ┆ … ┆ GO:003299 ┆ GO:003612 ┆ GO:004509 ┆ GO:19904 │
│ ccession_ ┆ 7         ┆ 4         ┆ 3         ┆   ┆ 1         ┆ 6         ┆ 5         ┆ 98       │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ O00411    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 1.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ O43707    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 1.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ Q9C093    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 1.0       

In [12]:
label_cols_cc_concat = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_cc_concat_reordered.columns))
print("label_cols_cc_concat length -> ", len(label_cols_cc_concat))

label_cols_cc_concat length ->  10


In [13]:
print(dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_train_features_cc_concat ").pl())

print(dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_train_labels_cc_concat_reordered ").pl())

print(" --------------- ")

shape: (1, 2)
┌────────────────────────────┬───────────────┐
│ uniq_protein_accession_ids ┆ total_records │
│ ---                        ┆ ---           │
│ i64                        ┆ i64           │
╞════════════════════════════╪═══════════════╡
│ 1800                       ┆ 1800          │
└────────────────────────────┴───────────────┘
shape: (1, 2)
┌────────────────────────────┬───────────────┐
│ uniq_protein_accession_ids ┆ total_records │
│ ---                        ┆ ---           │
│ i64                        ┆ i64           │
╞════════════════════════════╪═══════════════╡
│ 1800                       ┆ 1800          │
└────────────────────────────┴───────────────┘
 --------------- 


In [14]:
dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_test_features ").pl()

uniq_protein_accession_ids,total_records
i64,i64
224309,224309


In [15]:
x_train_main_cc = dd.sql("select dl.*, df.protein_embedding \
from df_train_features_cc_concat df \
join df_train_labels_cc_concat_reordered dl \
on df.protein_accession_id = dl.protein_accession_id ").pl()
print(x_train_main_cc.shape)
print(x_train_main_cc.head(5))

print(" ************************* ")

(1800, 12)
shape: (5, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:000030 ┆ GO:000079 ┆ GO:000167 ┆ … ┆ GO:003612 ┆ GO:004509 ┆ GO:199049 ┆ protein_ │
│ ccession_ ┆ 7         ┆ 4         ┆ 3         ┆   ┆ 6         ┆ 5         ┆ 8         ┆ embeddin │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ g        │
│ ---       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ array[f3 │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ 2, 480]  │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ O00411    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ [-0.0927 │
│           ┆           ┆           ┆           ┆   ┆           ┆

In [16]:
## X
embeds = np.array(pl.Series(x_train_main_cc.select(pl.col('protein_embedding'))).to_list())

In [17]:
## Redundant
num_features = 480
embeds = embeds.reshape(-1, num_features)

In [18]:
embeds.shape

(1800, 480)

In [19]:
## y
go_terms = pl.Series(x_train_main_cc.select(label_cols_cc_concat)).to_numpy()

In [20]:
go_terms.shape

(1800, 10)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(embeds, go_terms, test_size=0.25, random_state=42)
print("shape of X_train --> ",X_train.shape)
print("shape of y_train --> ",y_train.shape)
print("shape of X_test --> ",X_test.shape)
print("shape of y_test --> ",y_test.shape)

shape of X_train -->  (1350, 480)
shape of y_train -->  (1350, 10)
shape of X_test -->  (450, 480)
shape of y_test -->  (450, 10)


In [None]:
base_lr = LogisticRegression(solver='saga', max_iter=500, random_state=42, class_weight='balanced')

# Use n_jobs=-1 for parallel training of the 1500 models
multilabel_classifier = OneVsRestClassifier(base_lr, n_jobs=-1)

print("Starting training...")

multilabel_classifier.fit(X_train, y_train)

print("Training complete.")

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Create a pipeline to scale features and then apply OVR Logistic Regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ovr_logreg', OneVsRestClassifier(
        LogisticRegression(solver='saga', max_iter=1500, random_state=42, class_weight='balanced'),
        n_jobs=-1
    ))
])

# Fit the pipeline on the training data
print("Starting training...")

pipeline.fit(X_train, y_train)

print("Training complete.")

Starting training...




Training complete.


In [24]:
y_proba = pipeline.predict_proba(X_test)

In [25]:
y_proba.shape

(450, 10)

In [27]:
df_test_features.columns

['protein_accession_id', 'embedding_arrays']

In [28]:
## X
embeds_for_submission = np.array(pl.Series(df_test_features.select(pl.col('embedding_arrays'))).to_list())

In [39]:
prots_for_submission = np.array(pl.Series(df_test_features.select(pl.col('protein_accession_id'))).to_list())
prots_for_submission[:100].shape

(100,)

In [42]:
prots_for_submission[:100]

array(['Q8LX40', 'Q8MHW5', 'O00257', 'A4FUD9', 'Q9H1K1', 'Q6IRU7',
       'P56402', 'Q9LIR9', 'Q575S9', 'A5DH19', 'Q8R3E3', 'O54818',
       'P84847', 'C5MBA7', 'Q6DCT2', 'Q2UA42', 'P22508', 'Q9SIZ4',
       'A0A482APN3', 'Q55BZ5', 'P60177', 'O46080', 'Q4V645', 'Q8R2R5',
       'Q9TDK1', 'Q8TGJ7', 'A7L6A2', 'O35433', 'Q5PYH7', 'Q6AXX9',
       'P27840', 'P17809', 'Q5PRF0', 'Q16401', 'Q9LYU0', 'P10123',
       'Q8TGN1', 'Q8R1Z4', 'P82768', 'A7TPS8', 'Q08118', 'Q5ZMI4',
       'Q57572', 'C0HK41', 'P0DTT1', 'A7YY46', 'Q28915', 'P38763',
       'Q96FZ7', 'Q6NQ99', 'O94806', 'Q6CAP3', 'M3WHG5', 'Q6ZV60',
       'Q7YJW1', 'Q55EZ6', 'A1YL79', 'P09693', 'P54644', 'D3ZE85',
       'P41330', 'O46310', 'Q9ESD6', 'Q29RS0', 'C0HL12', 'Q68CJ9',
       'A1XQX0', 'Q7S7G7', 'Q68DN1', 'Q99041', 'Q9QZH6', 'A4QKK0',
       'A0A644F0Y1', 'Q0CUP6', 'Q9FK25', 'Q56A27', 'Q3SZC4', 'O34835',
       'O96008', 'Q91ZC5', 'Q7CQ05', 'Q9BRJ6', 'Q9VIH9', 'P27679',
       'P38356', 'Q62981', 'Q21751', 'A0A1D8QMG4', 'P9

In [31]:
embeds_for_submission[:100].shape

(100, 480)

In [32]:
y_test_final = pipeline.predict_proba(embeds_for_submission[:100])

In [37]:
type(y_test_final)

numpy.ndarray

In [36]:
label_cols_cc_concat

['GO:0000307',
 'GO:0000794',
 'GO:0001673',
 'GO:0001674',
 'GO:0005828',
 'GO:0020020',
 'GO:0032991',
 'GO:0036126',
 'GO:0045095',
 'GO:1990498']

In [43]:
predictions_df_pl = pl.DataFrame(
    y_test_final, # Ensure data is a numpy array
    schema=label_cols_cc_concat # Assign the column names
)

predictions_df_pl = predictions_df_pl.with_columns(
    pl.Series(name="protein_accession_id", values=prots_for_submission[:100])
)

print("Polars Predictions DataFrame:")
print(predictions_df_pl.head())
print(predictions_df_pl.schema)

Polars Predictions DataFrame:
shape: (5, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ GO:000030 ┆ GO:000079 ┆ GO:000167 ┆ GO:000167 ┆ … ┆ GO:003612 ┆ GO:004509 ┆ GO:199049 ┆ protein_ │
│ 7         ┆ 4         ┆ 3         ┆ 4         ┆   ┆ 6         ┆ 5         ┆ 8         ┆ accessio │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ n_id     │
│ f64       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 1.5791e-9 ┆ 1.2858e-1 ┆ 0.002281  ┆ 0.000001  ┆ … ┆ 0.001066  ┆ 0.000169  ┆ 2.2099e-7 ┆ Q8LX40   │
│           ┆ 4         ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ 0.000082  ┆ 0.000065  ┆ 0.000051  ┆ 0.000215

In [46]:
long_format_df = predictions_df_pl.melt(
    id_vars=["protein_accession_id"],          # Column to keep as identifier
    value_vars=label_cols_cc_concat,                  # Columns to melt into rows
    variable_name="go_term",                   # Name for the column containing GO terms
    value_name="probability"                   # Name for the column containing scores
)

long_format_df.shape

(1000, 3)

In [47]:
long_format_df = long_format_df.filter(pl.col("probability") > 0.30)
long_format_df.shape

(138, 3)

In [49]:
dd.sql("select count(distinct(protein_accession_id)), count(distinct(go_term)) from long_format_df").pl()

count(DISTINCT protein_accession_id),count(DISTINCT go_term)
i64,i64
97,10
