In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import polars as pl
import duckdb as dd
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import pickle
import gc
import ctypes
from pathlib import Path
import logging
import json
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
import datetime
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [2]:
pl.Config(fmt_str_lengths=1000)
pl.Config.set_tbl_rows(1000)

polars.config.Config

In [3]:
df_train_features_cc = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_features_cc.parquet')
print("Shape of CC training features", df_train_features_cc.shape)
print(df_train_features_cc.head(5))

df_train_labels_cc = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_labels_cc.parquet')
print("Shape of CC training labels", df_train_labels_cc.shape)
print(df_train_labels_cc.head(5))

df_train_features_mf = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_features_mf.parquet')
print("Shape of MF training features", df_train_features_mf.shape)
print(df_train_features_mf.head(5))

df_train_labels_mf = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_labels_mf.parquet')
print("Shape of MF training labels", df_train_labels_mf.shape)
print(df_train_labels_mf.head(5))

df_train_features_bp = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_features_bp.parquet')
print("Shape of BP training features", df_train_features_bp.shape)
print(df_train_features_bp.head(5))

df_train_labels_bp = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/train_protein_labels_bp.parquet')
print("Shape of BP training labels", df_train_labels_bp.shape)
print(df_train_labels_bp.head(5))

df_test_features = pl.read_parquet('/kaggle/input/cafa6-protein-go-terms-feat-labels/test_protein_features_esm2_320.parquet')
print("Shape of test data", df_test_features.shape)

Shape of CC training features (60030, 2)
shape: (5, 2)
┌──────────────────────┬─────────────────────────────────────┐
│ protein_accession_id ┆ protein_embedding                   │
│ ---                  ┆ ---                                 │
│ str                  ┆ array[f32, 320]                     │
╞══════════════════════╪═════════════════════════════════════╡
│ Q8CJ40               ┆ [-0.203247, 0.004642, … -0.187378]  │
│ P50149               ┆ [0.000173, -0.111328, … 0.034119]   │
│ P09956               ┆ [-0.240723, -0.078369, … -0.03595]  │
│ Q60664               ┆ [-0.057404, -0.216431, … -0.043365] │
│ P25443               ┆ [-0.137085, -0.125977, … -0.002178] │
└──────────────────────┴─────────────────────────────────────┘
Shape of CC training labels (60030, 1501)
shape: (5, 1_501)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:000565 ┆ GO:190255 ┆ GO:001659 ┆ … ┆ GO:007069 ┆ GO:000595 ┆ GO:003201 ┆ G

In [5]:
df_weights = pl.read_csv("/kaggle/input/cafa-6-protein-function-prediction/IA.tsv", separator="\t")
df_weights.columns = ['protein_accession_id', 'ia']
print("Shape of IA data", df_weights.shape)

label_cols_cc = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_cc.columns))
print(len(label_cols_cc))

df_weights_cc = df_weights.filter(pl.col('protein_accession_id').is_in(label_cols_cc))
print(df_weights_cc.shape)

label_cols_mf = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_mf.columns))
print(len(label_cols_mf))

df_weights_mf = df_weights.filter(pl.col('protein_accession_id').is_in(label_cols_mf))
print(df_weights_mf.shape)

label_cols_bp = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_bp.columns))
print(len(label_cols_bp))

df_weights_bp = df_weights.filter(pl.col('protein_accession_id').is_in(label_cols_bp))
print(df_weights_bp.shape)

Shape of IA data (40121, 2)
1500
(1500, 2)
1500
(1500, 2)
1500
(1500, 2)


In [6]:
def reorder_df(input_wt_df: pl.DataFrame, input_label_df: pl.DataFrame):
    label_wt_cols = pl.Series(input_wt_df.select(pl.col('protein_accession_id'))).to_list()
    label_wts = pl.Series(input_wt_df.select(pl.col('ia'))).to_list()
    all_label_cols = ['protein_accession_id'] + label_wt_cols
    input_label_df_reordered = input_label_df.select(all_label_cols)
    print(input_label_df_reordered.head(5))
    return input_label_df_reordered, label_wts

In [7]:
df_train_labels_cc_reordered, label_wts_cc = reorder_df(df_weights_cc, df_train_labels_cc)
df_train_labels_mf_reordered, label_wts_mf = reorder_df(df_weights_mf, df_train_labels_mf)
df_train_labels_bp_reordered, label_wts_bp = reorder_df(df_weights_bp, df_train_labels_bp)

shape: (5, 1_501)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:000001 ┆ GO:000010 ┆ GO:000011 ┆ … ┆ GO:199090 ┆ GO:199091 ┆ GO:199091 ┆ GO:19909 │
│ ccession_ ┆ 5         ┆ 9         ┆ 0         ┆   ┆ 9         ┆ 3         ┆ 7         ┆ 23       │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ ---       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ P28159    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ Q7RTN6    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ Q6JPI3    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0    

In [8]:
label_cols_cc = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_cc_reordered.columns))
print("label_cols_cc length -> ", len(label_cols_cc))

label_cols_mf = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_mf_reordered.columns))
print("label_cols_mf length -> ", len(label_cols_mf))

label_cols_bp = list(filter(lambda x: x != 'protein_accession_id', df_train_labels_bp_reordered.columns))
print("label_cols_bp length -> ", len(label_cols_bp))

label_cols_cc length ->  1500
label_cols_mf length ->  1500
label_cols_bp length ->  1500


In [9]:
print(dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_train_features_cc ").pl())

print(dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_train_labels_cc_reordered ").pl())

print(" --------------- ")

print(dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_train_features_mf ").pl())

print(dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_train_labels_mf_reordered ").pl())

print(" --------------- ")

print(dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_train_features_bp ").pl())

print(dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_train_labels_bp_reordered ").pl())

shape: (1, 2)
┌────────────────────────────┬───────────────┐
│ uniq_protein_accession_ids ┆ total_records │
│ ---                        ┆ ---           │
│ i64                        ┆ i64           │
╞════════════════════════════╪═══════════════╡
│ 60030                      ┆ 60030         │
└────────────────────────────┴───────────────┘
shape: (1, 2)
┌────────────────────────────┬───────────────┐
│ uniq_protein_accession_ids ┆ total_records │
│ ---                        ┆ ---           │
│ i64                        ┆ i64           │
╞════════════════════════════╪═══════════════╡
│ 60030                      ┆ 60030         │
└────────────────────────────┴───────────────┘
 --------------- 
shape: (1, 2)
┌────────────────────────────┬───────────────┐
│ uniq_protein_accession_ids ┆ total_records │
│ ---                        ┆ ---           │
│ i64                        ┆ i64           │
╞════════════════════════════╪═══════════════╡
│ 53503                      ┆ 53503         │


In [10]:
dd.sql(" select count(distinct(protein_accession_id)) as uniq_protein_accession_ids \
, count(1) as total_records from df_test_features ").pl()

uniq_protein_accession_ids,total_records
i64,i64
224309,224309


In [11]:
x_train_main_cc = dd.sql("select dl.*, df.protein_embedding \
from df_train_features_cc df \
join df_train_labels_cc_reordered dl \
on df.protein_accession_id = dl.protein_accession_id ").pl()
print(x_train_main_cc.shape)
print(x_train_main_cc.head(5))

print(" ************************* ")

x_train_main_mf = dd.sql("select dl.*, df.protein_embedding \
from df_train_features_mf df \
join df_train_labels_mf_reordered dl \
on df.protein_accession_id = dl.protein_accession_id ").pl()
print(x_train_main_mf.shape)
print(x_train_main_mf.head(5))

print(" ************************* ")

x_train_main_bp = dd.sql("select dl.*, df.protein_embedding \
from df_train_features_bp df \
join df_train_labels_bp_reordered dl \
on df.protein_accession_id = dl.protein_accession_id ").pl()
print(x_train_main_bp.shape)
print(x_train_main_bp.head(5))

(60030, 1502)
shape: (5, 1_502)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ protein_a ┆ GO:000001 ┆ GO:000010 ┆ GO:000011 ┆ … ┆ GO:199091 ┆ GO:199091 ┆ GO:199092 ┆ protein_ │
│ ccession_ ┆ 5         ┆ 9         ┆ 0         ┆   ┆ 3         ┆ 7         ┆ 3         ┆ embeddin │
│ id        ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ g        │
│ ---       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ array[f3 │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ 2, 320]  │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ P28159    ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ [0.01866 │
│           ┆           ┆           ┆           ┆   ┆      

In [19]:
## X
embeds = np.array(pl.Series(x_train_main_cc.select(pl.col('protein_embedding'))).to_list())

In [21]:
## Redundant
num_features = 320
embeds = embeds.reshape(-1, num_features)

In [22]:
embeds.shape

(60030, 320)

In [23]:
## y
go_terms = pl.Series(x_train_main_cc.select(label_cols_cc)).to_numpy()

In [24]:
go_terms.shape

(60030, 1500)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(embeds, go_terms, test_size=0.3, random_state=42)
print("shape of X_train --> ",X_train.shape)
print("shape of y_train --> ",y_train.shape)
print("shape of X_test --> ",X_test.shape)
print("shape of y_test --> ",y_test.shape)

shape of X_train -->  (42021, 320)
shape of y_train -->  (42021, 1500)
shape of X_test -->  (18009, 320)
shape of y_test -->  (18009, 1500)
