In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# Widen Polars' display limits so long strings and up to 1000 rows are printed in full.
pl.Config.set_fmt_str_lengths(1000)
pl.Config.set_tbl_rows(1000)

polars.config.Config

In [3]:
# Locations of the training inputs (embeddings and multi-hot GO-term labels).
train_features_path = '/kaggle/input/cafa6-protein-labels-features-depth-based-suman/cafa6_protein_labels_features_depth_based/train_protein_features_cc_4.parquet'
train_labels_path = '/kaggle/input/cafa6-protein-labels-features-depth-based-suman/cafa6_protein_labels_features_depth_based/train_protein_labels_cc_4.parquet'

# Features: one row per protein with its embedding column.
df_train_features = pl.read_parquet(train_features_path)
print("Shape of training features", df_train_features.shape)
print(df_train_features.head())

# Labels: one row per protein, one column per GO term.
df_train_labels = pl.read_parquet(train_labels_path)
print("Shape of training labels", df_train_labels.shape)
print(df_train_labels.head())

Shape of training features (17036, 2)
shape: (5, 2)
┌──────────────────────┬─────────────────────────────────────┐
│ protein_accession_id ┆ protein_embedding                   │
│ ---                  ┆ ---                                 │
│ str                  ┆ array[f32, 480]                     │
╞══════════════════════╪═════════════════════════════════════╡
│ Q9H6T3               ┆ [-0.117188, 0.017776, … 0.028687]   │
│ Q9Z1X2               ┆ [-0.002453, -0.030136, … -0.010384] │
│ O96019               ┆ [-0.163452, 0.013245, … 0.102539]   │
│ P70579               ┆ [-0.06366, 0.054779, … 0.031281]    │
│ P41241               ┆ [-0.12854, -0.039978, … -0.027496]  │
└──────────────────────┴─────────────────────────────────────┘
Shape of training labels (17036, 101)
shape: (5, 101)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:009897 ┆ GO:000032 ┆ GO:000013 ┆ … ┆ GO:000577 ┆ GO:004818 ┆ GO:000566 ┆ GO:00426 │

In [4]:
# Test-set embeddings used to produce the submission.
test_features_path = '/kaggle/input/cafa6-protein-go-terms-feat-labels/test_protein_features_esm2_480.parquet'
df_test_features = pl.read_parquet(test_features_path)
print("Shape of test data", df_test_features.shape)

## X: stack the per-protein embedding lists into one NumPy array
test_embedding_lists = df_test_features.get_column('embedding_arrays').to_list()
embeds_for_submission = np.array(test_embedding_lists)
print("embeds_for_submission -- ", embeds_for_submission.shape)

# Protein ids, in the same row order as the embeddings above.
test_protein_ids = df_test_features.get_column('protein_accession_id').to_list()
prots_for_submission = np.array(test_protein_ids)
print("prots_for_submission -- ", prots_for_submission.shape)

Shape of test data (224309, 2)
embeds_for_submission --  (224309, 480)
prots_for_submission --  (224309,)


In [5]:
# Information-accretion (IA) weights, one row per GO term.
# NOTE(review): read_csv treats the first line as a header by default; if IA.tsv has no
# header row, that first term is silently dropped (pass has_header=False) — confirm.
df_weights = pl.read_csv("/kaggle/input/cafa-6-protein-function-prediction/IA.tsv", separator="\t")
# The first column is named 'protein_accession_id' but actually holds GO term ids.
df_weights.columns = ['protein_accession_id', 'ia']
print("Shape of IA data", df_weights.shape)

# Every label column except the protein id is a GO term.
label_cols = [col for col in df_train_labels.columns if col != 'protein_accession_id']
print("length of label_cols -- ", len(label_cols))

# Keep only the IA rows for GO terms that appear as training labels.
is_training_label = pl.col('protein_accession_id').is_in(label_cols)
df_weights_filtered = df_weights.filter(is_training_label)
print("shape of df_weights_filtered -- ", df_weights_filtered.shape)

labels = df_weights_filtered.height
# GO term -> IA weight rounded to 5 decimals.
class_wt_dict = {
    go_term: round(ia_weight, 5)
    for go_term, ia_weight in df_weights_filtered.iter_rows()
}

Shape of IA data (40121, 2)
length of label_cols --  100
shape of df_weights_filtered --  (100, 2)


In [6]:
def reorder_df(input_wt_df: pl.DataFrame, input_label_df: pl.DataFrame):
    """Reorder the label columns to follow the row order of the weight frame.

    Args:
        input_wt_df: frame with a 'protein_accession_id' column (holding the
            label column names) and an 'ia' column (their weights).
        input_label_df: frame with a 'protein_accession_id' column plus one
            column per name listed in ``input_wt_df``.

    Returns:
        A tuple ``(reordered_label_df, label_wts)``: the label frame with
        'protein_accession_id' first followed by the label columns in weight-row
        order, and the list of 'ia' values in that same order.
    """
    ordered_label_names = input_wt_df.get_column('protein_accession_id').to_list()
    label_wts = input_wt_df.get_column('ia').to_list()

    # Keep the id column first, then the labels in weight order.
    input_label_df_reordered = input_label_df.select(
        ['protein_accession_id', *ordered_label_names]
    )
    print(input_label_df_reordered.head(5))
    return input_label_df_reordered, label_wts

In [7]:
# Put the label columns in the same order as the rows of df_weights_filtered;
# label_wts holds the matching 'ia' values in that order.
df_train_labels_reordered, label_wts = reorder_df(df_weights_filtered, df_train_labels)

shape: (5, 101)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:000012 ┆ GO:000013 ┆ GO:000032 ┆ … ┆ GO:009950 ┆ GO:009963 ┆ GO:011008 ┆ GO:01402 │
│ ccession_ ┆ 4         ┆ 9         ┆ 5         ┆   ┆ 3         ┆ 4         ┆ 5         ┆ 20       │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ P62138    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ Q9FEG2    ┆ 0.0       ┆ 0.0       ┆ 1.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ Q9MAP8    ┆ 0.0       ┆ 1.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0      

In [8]:
# Label column names in their reordered sequence (everything but the protein id).
label_cols_reordered = [
    col for col in df_train_labels_reordered.columns if col != 'protein_accession_id'
]
print("label_cols_reordered length -> ", len(label_cols_reordered))

label_cols_reordered length ->  100


In [9]:
# Sanity check: distinct protein ids vs. total rows in each training frame.
# duckdb resolves the frame names inside the SQL from the notebook's variables.
train_features_counts_sql = (
    " select count(distinct(protein_accession_id)) as uniq_protein_accession_ids "
    ", count(1) as total_records from df_train_features "
)
train_labels_counts_sql = (
    " select count(distinct(protein_accession_id)) as uniq_protein_accession_ids "
    ", count(1) as total_records from df_train_labels_reordered "
)

print(dd.sql(train_features_counts_sql).pl())

print(dd.sql(train_labels_counts_sql).pl())

print(" --------------- ")

shape: (1, 2)
┌────────────────────────────┬───────────────┐
│ uniq_protein_accession_ids ┆ total_records │
│ ---                        ┆ ---           │
│ i64                        ┆ i64           │
╞════════════════════════════╪═══════════════╡
│ 17036                      ┆ 17036         │
└────────────────────────────┴───────────────┘
shape: (1, 2)
┌────────────────────────────┬───────────────┐
│ uniq_protein_accession_ids ┆ total_records │
│ ---                        ┆ ---           │
│ i64                        ┆ i64           │
╞════════════════════════════╪═══════════════╡
│ 17036                      ┆ 17036         │
└────────────────────────────┴───────────────┘
 --------------- 


In [10]:
# Same sanity check for the test features: distinct protein ids vs. total rows.
test_features_counts_sql = (
    " select count(distinct(protein_accession_id)) as uniq_protein_accession_ids "
    ", count(1) as total_records from df_test_features "
)
dd.sql(test_features_counts_sql).pl()

uniq_protein_accession_ids,total_records
i64,i64
224309,224309


In [11]:
# Inner-join labels and embeddings on the protein id so each row carries both.
join_features_labels_sql = (
    "select dl.*, df.protein_embedding "
    "from df_train_features df "
    "join df_train_labels_reordered dl "
    "on df.protein_accession_id = dl.protein_accession_id "
)
x_train_main = dd.sql(join_features_labels_sql).pl()
print("x_train_main shape --", x_train_main.shape)
print(x_train_main.head(5))

print(" ************************* ")

x_train_main shape -- (17036, 102)
shape: (5, 102)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:000012 ┆ GO:000013 ┆ GO:000032 ┆ … ┆ GO:009963 ┆ GO:011008 ┆ GO:014022 ┆ protein_ │
│ ccession_ ┆ 4         ┆ 9         ┆ 5         ┆   ┆ 4         ┆ 5         ┆ 0         ┆ embeddin │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ g        │
│ ---       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ array[f3 │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ 2, 480]  │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ P62138    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ [-0.2043 │
│           ┆           ┆           ┆   

## Create the training pipeline and fit the labels in manageable batches

In [None]:
# Train one-vs-rest logistic regressions on batches of GO-term labels and write
# one submission file per batch.
#
# Fixes relative to the previous version of this cell:
#   * melt() used `label_cols_cc_concat`, which no visible cell assigns; it now
#     uses the batch's own `label_cols_filtered` (the same names given to `schema=`).
#   * the filename was built with `str + int`, which raises TypeError.
#   * `del` listed `x_train_main_filtered`, which is never defined (NameError).
#   * the embedding matrix is loop-invariant and is now built once, up front.
num_features = 480
submission_filename_prefix = "submission_df_cc_4_"
label_batch_size = 10          # number of GO terms fitted per pipeline run
probability_threshold = 0.75   # keep only predictions scored above this value

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ovr_logreg', OneVsRestClassifier(
        LogisticRegression(solver='saga', max_iter=2500, random_state=42, class_weight='balanced'),
        n_jobs=-1
    ))
])

# X: identical for every batch, so compute it once outside the loop.
embeds = np.array(pl.Series(x_train_main.select(pl.col('protein_embedding'))).to_list())
embeds = embeds.reshape(-1, num_features)
print("embeds.shape -- ", embeds.shape)

for i in range(0, len(label_cols_reordered), label_batch_size):
    label_cols_filtered = label_cols_reordered[i:i + label_batch_size]
    print("Run training for ", label_cols_filtered)

    # y: (n_proteins, n_labels_in_batch) indicator matrix for this batch.
    go_terms = x_train_main.select(label_cols_filtered).to_numpy()
    print("go_terms.shape -- ", go_terms.shape)

    print("Starting training...")
    # fit() refits the scaler and every per-label classifier from scratch,
    # so reusing the same pipeline object across batches is safe.
    pipeline.fit(embeds, go_terms)
    print("Training complete.")

    y_test_final = pipeline.predict_proba(embeds_for_submission)
    print("y_test_final.shape -- ", y_test_final.shape)

    # One probability column per GO term, plus the protein id for each row.
    predictions_df_pl = pl.DataFrame(
        y_test_final,
        schema=label_cols_filtered
    ).with_columns(
        pl.Series(name="protein_accession_id", values=prots_for_submission)
    )

    print("Polars Predictions DataFrame:")
    print(predictions_df_pl.head())
    print(predictions_df_pl.schema)

    # Wide -> long: one (protein, go_term, probability) row per prediction.
    long_format_df = predictions_df_pl.melt(
        id_vars=["protein_accession_id"],          # Column to keep as identifier
        value_vars=label_cols_filtered,            # Columns of THIS batch to melt into rows
        variable_name="go_term",                   # Name for the column containing GO terms
        value_name="probability"                   # Name for the column containing scores
    )
    long_format_df = long_format_df.filter(pl.col("probability") > probability_threshold)
    print("long_format_df.shape -- ", long_format_df.shape)

    # `i` is the batch's starting label index; format it explicitly into the name.
    filename = f"{submission_filename_prefix}{i}.tsv"
    long_format_df.write_csv(filename, separator="\t", include_header=False)

    # Release the large per-batch intermediates before the next fit.
    del long_format_df, predictions_df_pl, go_terms, y_test_final
    gc.collect()

Run training for  ['GO:0000124', 'GO:0000139', 'GO:0000325', 'GO:0000779', 'GO:0000812', 'GO:0000932', 'GO:0001533', 'GO:0001669', 'GO:0005604', 'GO:0005637']
embeds.shape --  (17036, 480)
go_terms.shape --  (17036, 10)
Starting training...


In [None]:
# Scratch cell: recomputes the IA rows that match the training label columns.
df_weights_filtered = df_weights.filter(pl.col('protein_accession_id').is_in(label_cols))
print("shape of df_weights_filtered -- ", df_weights_filtered.shape)

def training_pipeline(train_df, labels_df):
    """Placeholder for a reusable training routine (not implemented yet).

    The definition previously had no body, which made the whole cell a
    SyntaxError. It now fails loudly if called instead.

    Args:
        train_df: training features frame (intended use — TODO confirm).
        labels_df: training labels frame (intended use — TODO confirm).

    Raises:
        NotImplementedError: always, until the routine is written.
    """
    raise NotImplementedError("training_pipeline has not been implemented yet")

In [123]:
## X: stack the per-protein training embeddings into one NumPy array
train_embedding_lists = x_train_main.get_column('protein_embedding').to_list()
embeds = np.array(train_embedding_lists)

In [124]:
## Redundant
# Forces embeds into a 2-D (n_rows, num_features) layout.
# NOTE(review): flagged as redundant by the author; presumably np.array(...) above
# already yields this shape — confirm before removing.
num_features = 480
embeds = embeds.reshape(-1, num_features)

In [125]:
# Inspect the feature matrix dimensions.
embeds.shape

(17036, 480)

In [126]:
## y
# Label matrix built from the columns named in label_cols_cc_concat.
# NOTE(review): label_cols_cc_concat is not assigned in any visible cell, so this
# relies on leftover kernel state and would fail on a fresh "Restart & Run All" — confirm
# where it is defined (label_cols_reordered looks like the intended list).
go_terms = pl.Series(x_train_main.select(label_cols_cc_concat)).to_numpy()

In [127]:
# Inspect the label matrix dimensions.
go_terms.shape

(17036, 100)

In [128]:
# Hold out 10% of the proteins for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    embeds,
    go_terms,
    test_size=0.1,
    random_state=42,
)

# Report the shape of each split.
for split_name, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"shape of {split_name} --> ", split_array.shape)

shape of X_train -->  (15332, 480)
shape of y_train -->  (15332, 100)
shape of X_test -->  (1704, 480)
shape of y_test -->  (1704, 100)


In [129]:
# Standardise the features, then fit one balanced logistic regression per label
# (one-vs-rest), running the per-label fits in parallel on all cores.
base_logreg = LogisticRegression(
    solver='saga',
    max_iter=2500,
    random_state=42,
    class_weight='balanced',
)
ovr_logreg = OneVsRestClassifier(base_logreg, n_jobs=-1)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ovr_logreg', ovr_logreg),
])

# Train on the training split only.
print("Starting training...")

pipeline.fit(X_train, y_train)

print("Training complete.")

Starting training...




KeyboardInterrupt: 

In [105]:
# Per-label probabilities for the held-out split.
# NOTE(review): this cell's execution count (105) predates the split/fit cells above
# (128/129), so any saved output below it comes from an earlier run.
y_proba = pipeline.predict_proba(X_test)

In [106]:
# Inspect the held-out prediction matrix dimensions.
y_proba.shape

(6432, 20)

In [27]:
# List the column names of the test feature frame.
df_test_features.columns

['protein_accession_id', 'embedding_arrays']

In [80]:
# Peek at the first 100 test protein ids.
prots_for_submission[:100]

array(['Q8LX40', 'Q8MHW5', 'O00257', 'A4FUD9', 'Q9H1K1', 'Q6IRU7',
       'P56402', 'Q9LIR9', 'Q575S9', 'A5DH19', 'Q8R3E3', 'O54818',
       'P84847', 'C5MBA7', 'Q6DCT2', 'Q2UA42', 'P22508', 'Q9SIZ4',
       'A0A482APN3', 'Q55BZ5', 'P60177', 'O46080', 'Q4V645', 'Q8R2R5',
       'Q9TDK1', 'Q8TGJ7', 'A7L6A2', 'O35433', 'Q5PYH7', 'Q6AXX9',
       'P27840', 'P17809', 'Q5PRF0', 'Q16401', 'Q9LYU0', 'P10123',
       'Q8TGN1', 'Q8R1Z4', 'P82768', 'A7TPS8', 'Q08118', 'Q5ZMI4',
       'Q57572', 'C0HK41', 'P0DTT1', 'A7YY46', 'Q28915', 'P38763',
       'Q96FZ7', 'Q6NQ99', 'O94806', 'Q6CAP3', 'M3WHG5', 'Q6ZV60',
       'Q7YJW1', 'Q55EZ6', 'A1YL79', 'P09693', 'P54644', 'D3ZE85',
       'P41330', 'O46310', 'Q9ESD6', 'Q29RS0', 'C0HL12', 'Q68CJ9',
       'A1XQX0', 'Q7S7G7', 'Q68DN1', 'Q99041', 'Q9QZH6', 'A4QKK0',
       'A0A644F0Y1', 'Q0CUP6', 'Q9FK25', 'Q56A27', 'Q3SZC4', 'O34835',
       'O96008', 'Q91ZC5', 'Q7CQ05', 'Q9BRJ6', 'Q9VIH9', 'P27679',
       'P38356', 'Q62981', 'Q21751', 'A0A1D8QMG4', 'P9

In [31]:
# Shape of a 100-row slice of the test embeddings.
embeds_for_submission[:100].shape

(100, 480)

In [107]:
# Per-label probabilities for every test protein, using the currently fitted pipeline.
y_test_final = pipeline.predict_proba(embeds_for_submission)

In [108]:
# Inspect the test prediction matrix dimensions.
y_test_final.shape

(224309, 20)

In [109]:
# Display the label names used as prediction column names below.
# NOTE(review): label_cols_cc_concat is not assigned in any visible cell — confirm its origin.
label_cols_cc_concat

['GO:0005635',
 'GO:0005739',
 'GO:0005768',
 'GO:0005783',
 'GO:0005794',
 'GO:0005813',
 'GO:0005886',
 'GO:0005938',
 'GO:0009506',
 'GO:0009536',
 'GO:0009570',
 'GO:0009897',
 'GO:0016324',
 'GO:0016604',
 'GO:0022626',
 'GO:0031012',
 'GO:0036064',
 'GO:0043025',
 'GO:0045202',
 'GO:0070062']

In [110]:
# Wrap the probability matrix in a frame (one column per GO term) and attach
# the protein id of each row.
protein_id_column = pl.Series(name="protein_accession_id", values=prots_for_submission)
predictions_df_pl = pl.DataFrame(
    y_test_final,
    schema=label_cols_cc_concat,
).with_columns(protein_id_column)

print("Polars Predictions DataFrame:")
print(predictions_df_pl.head())
print(predictions_df_pl.schema)

Polars Predictions DataFrame:
shape: (5, 21)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ GO:000563 ┆ GO:000573 ┆ GO:000576 ┆ GO:000578 ┆ … ┆ GO:004302 ┆ GO:004520 ┆ GO:007006 ┆ protein_ │
│ 5         ┆ 9         ┆ 8         ┆ 3         ┆   ┆ 5         ┆ 2         ┆ 2         ┆ accessio │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ n_id     │
│ f64       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0.000027  ┆ 0.709754  ┆ 0.022139  ┆ 0.032261  ┆ … ┆ 0.00124   ┆ 1.9631e-1 ┆ 0.000145  ┆ Q8LX40   │
│           ┆           ┆           ┆           ┆   ┆           ┆ 5         ┆           ┆          │
│ 0.226153  ┆ 0.041259  ┆ 0.828655  ┆ 0.717441

In [111]:
# Wide -> long: one (protein, go_term, probability) row per prediction.
melt_options = {
    "id_vars": ["protein_accession_id"],   # identifier column kept on every row
    "value_vars": label_cols_cc_concat,    # GO-term columns turned into rows
    "variable_name": "go_term",            # column receiving the GO term name
    "value_name": "probability",           # column receiving the score
}
long_format_df = predictions_df_pl.melt(**melt_options)

long_format_df.shape

(4486180, 3)

In [112]:
# Keep only predictions whose probability is strictly above 0.75.
is_confident = pl.col("probability") > 0.75
long_format_df = long_format_df.filter(is_confident)
long_format_df.shape

(490034, 3)

In [113]:
# Count how many distinct proteins and GO terms remain after the threshold filter.
dd.sql("select count(distinct(protein_accession_id)), count(distinct(go_term)) from long_format_df").pl()

count(DISTINCT protein_accession_id),count(DISTINCT go_term)
i64,i64
206344,20


In [114]:
# Write the filtered predictions as a header-less, tab-separated file.
# NOTE(review): the filename says cc_3 while the training data loaded above is cc_4 — confirm.
long_format_df.write_csv("submission_df_cc_3.tsv", separator="\t", include_header=False)