In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

import cuml
import cudf
from cuml.linear_model import LogisticRegression as cuCML_LogisticRegression
import cupy as cp
from sklearn.multioutput import MultiOutputClassifier
from cuml.model_selection import train_test_split
# Optional: for evaluation
from sklearn.metrics import classification_report

In [3]:
print(f"Polars version: {pl.__version__}")
print(f"CuPy version: {cp.__version__}")

try:
    #import cuml
    print(f"cuML version: {cuml.__version__}")
    print(f"cuML version could be imported successfully.")
    
except ImportError as e:
    print(f"cuML could not be imported. Ensure RAPIDS is installed correctly. Error: {e}")
    # If Cuml cannot be imported, the rest of the notebook will not work.
    # In this case, it may make sense to stop Execution.
    raise

Polars version: 1.25.0
CuPy version: 13.6.0
cuML version: 25.02.01
cuML version could be imported successfully.


In [7]:
df_train_features = pl.read_parquet('/kaggle/input/cafa6-protein-labels-features-depth-based-suman/cafa6_protein_labels_features_depth_based/train_protein_features_cc_4.parquet')
print("Shape of training features", df_train_features.shape)
print(df_train_features.head(5))

df_train_labels = pl.read_parquet('/kaggle/input/cafa6-protein-labels-features-depth-based-suman/cafa6_protein_labels_features_depth_based/train_protein_labels_cc_4.parquet')
print("Shape of training labels", df_train_labels.shape)
print(df_train_labels.head(5))

Shape of training features (17036, 2)
shape: (5, 2)
┌──────────────────────┬─────────────────────────────────┐
│ protein_accession_id ┆ protein_embedding               │
│ ---                  ┆ ---                             │
│ str                  ┆ array[f32, 480]                 │
╞══════════════════════╪═════════════════════════════════╡
│ Q9H6T3               ┆ [-0.117188, 0.017776, … 0.0286… │
│ Q9Z1X2               ┆ [-0.002453, -0.030136, … -0.01… │
│ O96019               ┆ [-0.163452, 0.013245, … 0.1025… │
│ P70579               ┆ [-0.06366, 0.054779, … 0.03128… │
│ P41241               ┆ [-0.12854, -0.039978, … -0.027… │
└──────────────────────┴─────────────────────────────────┘
Shape of training labels (17036, 101)
shape: (5, 101)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:009897 ┆ GO:000032 ┆ GO:000013 ┆ … ┆ GO:000577 ┆ GO:004818 ┆ GO:000566 ┆ GO:00426 │
│ ccession_ ┆ 8         ┆ 5         ┆ 9    

In [5]:
try:
    gpu_array = cp.arange(10)
    print(f"CuPy array on this device: {gpu_array.device}")
except cp.cuda.runtime.CUDARuntimeError as e:
    print(f"CuPy device can't be started: {e}")
    print("Make sure your CUDA drivers and cuPy installation are correct.")

CuPy array on this device: <CUDA Device 0>


In [19]:
embed_np_array = df_train_features['protein_embedding'].to_numpy().astype(np.float32)
print(f"NumPy array shape: {embed_np_array.shape}")
print(f"NumPy array dtype: {embed_np_array.dtype}")

NumPy array shape: (17036, 480)
NumPy array dtype: float32


In [20]:
embeds_cp_array = cp.array(embed_np_array)
print(f"CuPy array shape: {embeds_cp_array.shape}")
print(f"CuPy array dtype: {embeds_cp_array.dtype}")
print(f"CuPy array device: {embeds_cp_array.device}")

CuPy array shape: (17036, 480)
CuPy array dtype: float32
CuPy array device: <CUDA Device 0>


In [26]:
label_cols = list(filter(lambda x: x != 'protein_accession_id', df_train_labels.columns))
go_terms_cp_array = cp.array(df_train_labels.select(label_cols).to_numpy().flatten())
print(f"CuPy array shape: {go_terms_cp_array.shape}")
print(f"CuPy array dtype: {go_terms_cp_array.dtype}")
print(f"CuPy array device: {go_terms_cp_array.device}")

CuPy array shape: (17036, 100)
CuPy array dtype: float64
CuPy array device: <CUDA Device 0>


In [27]:
X_train_gpu, X_test_gpu, y_train_gpu, y_test_gpu = train_test_split(
    embeds_cp_array, go_terms_cp_array, test_size=0.1, random_state=42, stratify=go_terms_cp_array # Protect class distribution
)

print("\nShapes of train and test sets(GPU):")
print(f"X_train_gpu: {X_train_gpu.shape}")
print(f"X_test_gpu: {X_test_gpu.shape}")
print(f"y_train_gpu: {y_train_gpu.shape}")
print(f"y_test_gpu: {y_test_gpu.shape}")

TypeError: not all arguments converted during string formatting

In [None]:
embeds = df_train_features_cc_concat['protein_embedding']

# 1. Get a flattened view of all embedding elements (zero-copy)
# The result is a 1D cudf Series containing all individual float values
flattened_embeddings = embeds.list.leaves 

# 2. Convert the flattened series to a 1D CuPy array
# This Series *should* implement the cuda array interface correctly
flattened_cupy_array = flattened_embeddings.to_cupy() 

# 3. Reshape the 1D array into the desired 2D matrix [n_samples, n_features]
n_samples = len(df_train_features_cc_concat)
# We calculate the number of features (embedding dimension) automatically
n_features = flattened_cupy_array.shape[0] // n_samples 

# The fixed X data, ready for cuML
X_fixed_cupy = flattened_cupy_array.reshape(n_samples, n_features)

print(f"Reshaped X shape: {X_fixed_cupy.shape}")

In [None]:
# Define the base cuML Logistic Regression model
# Use 'saga' or 'qn' solver as they are supported in cuML
base_model = cuCML_LogisticRegression(solver='qn', max_iter=1000)

# Wrap the cuML model in the scikit-learn MultiOutputClassifier
multilabel_model = MultiOutputClassifier(base_model)

In [None]:
y_data = cp.asarray(y.values) 

print(f"X data type: {type(X_fixed_cupy)}")
print(f"y data type: {type(y_data)}")

from sklearn.model_selection import train_test_split as sk_train_test_split

RANDOM_STATE = 42
TEST_SIZE = 0.2 

# 1. Explicitly move data to CPU (NumPy arrays) using .get()
# This bypasses the "implicit conversion not allowed" check
X_cpu = X_fixed_cupy.get() 
y_cpu = y_data.get()

print(f"X data type on CPU: {type(X_cpu)}") # Should be numpy.ndarray
print(f"y data type on CPU: {type(y_cpu)}") # Should be numpy.ndarray

# 2. Perform the split using the CPU-based Scikit-learn function
X_train_cpu, X_test_cpu, y_train_cpu, y_test_cpu = sk_train_test_split(
    X_cpu, 
    y_cpu, 
    test_size=TEST_SIZE, 
    random_state=RANDOM_STATE
)

# 3. Move the resulting split data back to the GPU immediately
X_train = cp.asarray(X_train_cpu)
X_test = cp.asarray(X_test_cpu)
y_train = cp.asarray(y_train_cpu)
y_test = cp.asarray(y_test_cpu)

print("Shapes after CPU split and GPU transfer:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
print("Training models...")
multilabel_model.fit(X_fixed_cupy, y)
print("Training complete.")

In [None]:
df_test = cudf.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/test_protein_features_esm2_480.parquet')
print("Shape of test features", df_test.shape)
print(df_test.head(5))

In [None]:
test_embeds = df_test['embedding_arrays']

# 1. Get a flattened view of all embedding elements (zero-copy)
# The result is a 1D cudf Series containing all individual float values
test_flattened_embeddings = test_embeds.list.leaves 

# 2. Convert the flattened series to a 1D CuPy array
# This Series *should* implement the cuda array interface correctly
test_flattened_cupy_array = test_flattened_embeddings.to_cupy() 

# 3. Reshape the 1D array into the desired 2D matrix [n_samples, n_features]
n_samples_test = len(df_test)
# We calculate the number of features (embedding dimension) automatically
n_features_test = test_flattened_cupy_array.shape[0] // n_samples_test 

# The fixed X data, ready for cuML
X_test_fixed_cupy = test_flattened_cupy_array.reshape(n_samples_test, n_features_test)

print(f"Reshaped X shape: {X_test_fixed_cupy.shape}")

In [None]:
predictions_gpu = multilabel_model.predict(X_test_fixed_cupy)