In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [5]:
base_metadata_pth = Path('../../metadata')

basic_metadata_file = 'metadata.repository.2024-11-05.json'
clinical_cohort_file = 'clinical.cohort.2024-11-07.json'
biospecimen_file = 'biospecimen.cohort.2024-11-07.json'
gene_expr_file = 'Human__TCGA_OV__UNC__RNAseq__GA_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct'
# gene_expr_file = "Human__TCGA_OV__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct"

gene_mutation_file = 'Human__TCGA_OV__WUSM__Mutation__GAIIx__01_28_2016__BI__Gene__Firehose_MutSig2CV.cbt'

In [6]:
gene_names = pd.read_csv(base_metadata_pth / 'GeneNames.tsv', header=None)

In [7]:
metadata_df = pd.read_json(base_metadata_pth / basic_metadata_file)

clinical_cohort_df = pd.read_json(base_metadata_pth / clinical_cohort_file)

biospecimen_df = pd.read_json(base_metadata_pth / biospecimen_file)
gene_expr_df = pd.read_csv(base_metadata_pth / gene_expr_file, delimiter='\t')

gene_mutation_df = pd.read_csv(base_metadata_pth / gene_mutation_file,
                               delimiter='\t')

In [8]:
gene_mutations_relevant = pd.merge(
    gene_names, gene_mutation_df, left_on=0,
    right_on='attrib_name').iloc[:, 2:].T  #.reset_index()

In [9]:
latent_vec_pth = Path(
    '/Users/tsakalis/ntua/nestor/nestor_celvia/src/vae_embeddings/latent_vectors'
)

In [10]:
import torch

ModuleNotFoundError: No module named 'torch'

In [12]:
import h5py
import pandas as pd


def load_h5(path, key='mean'):
    """
    Load data from an HDF5 file and return the specified dataset as a DataFrame.

    Parameters:
    - path (str): Path to the HDF5 file.
    - key (str): Key of the dataset to retrieve.

    Returns:
    - pd.DataFrame: DataFrame containing the data from the specified key.
    """
    with h5py.File(path, "r") as f:
        if key not in f:
            raise KeyError(f"Key '{key}' not found in the HDF5 file.")

        ds_arr = f[key][()]  # Get the numpy array from the specified key

    # Return the data as a DataFrame
    return pd.DataFrame(ds_arr)


In [32]:
latent_vec_pth = Path(
    '../../vae_embeddings/averaged_down'
)
latent_vecs = [(pth.stem, load_h5(pth), load_h5(pth, key='max'),
                load_h5(pth, key='min'), load_h5(pth, key='std'))
               for pth in latent_vec_pth.glob('*.h5')]

# latent_vecs = [np.load(pth) for pth in latent_vec_pth.glob('*.npy')]

In [33]:
# latent_vec_pth = Path(
#     '/Users/tsakalis/ntua/nestor/nestor_celvia/src/vae_embeddings/embeddings_adco'
# )
# latent_vecs = [(pth.stem, torch.load(pth,
#                                      map_location=torch.device('cpu')).numpy())
#                for pth in latent_vec_pth.glob('*.pt')]

# # latent_vecs = [np.load(pth) for pth in latent_vec_pth.glob('*.npy')]

In [34]:
test_id = "TCGA-WR-A838-01A-01-TS1.E8CA96CD-A253-4090-86BA-60E7192B72FD"

In [35]:
all_latent_df = []

for latent in latent_vecs:
    latent_df_mean = pd.DataFrame(latent[1]).transpose()
    latent_df_max = pd.DataFrame(latent[2]).transpose()

    latent_df_min = pd.DataFrame(latent[3]).transpose()
    latent_df_std = pd.DataFrame(latent[4]).transpose()
    latent_df = pd.concat(
        [latent_df_mean, latent_df_max, latent_df_min, latent_df_std], axis=1)
    # break
    latent_df.columns = [f"emb_{i}" for i in range(4096)]
    latent_df['file_name'] = latent[0] + ".svs"  #.split('_')[0]

    all_latent_df.append(latent_df)

In [36]:
metadata_df['submitter_id'] = metadata_df.submitter_id.apply(
    lambda x: '.'.join(x.split('-')[:3]))

In [37]:
merged_genes = pd.merge(gene_expr_df.set_index('attrib_name'),
                        gene_names,
                        left_index=True,
                        right_on=0)


In [38]:
clinical_cohort_df['submitter_id'] = clinical_cohort_df['submitter_id'].apply(
    lambda x: x.replace('-', '.'))

In [39]:
gene_patient = pd.merge(metadata_df,
                        merged_genes.T,
                        left_on='submitter_id',
                        right_index=True)

In [40]:
gene_mutated_patient = pd.merge(metadata_df,
                                gene_mutations_relevant,
                                left_on='submitter_id',
                                right_index=True)

In [41]:
gene_mutated_patient


Unnamed: 0,data_format,access,associated_entities,file_name,submitter_id,data_category,annotations,file_size,md5sum,file_id,...,30,31,32,33,34,35,36,37,38,39
0,SVS,open,[{'entity_submitter_id': 'TCGA-61-1903-01A-01-...,TCGA-61-1903-01A-01-BS1.77116a06-9e30-4bf6-885...,TCGA.61.1903,Biospecimen,"[{'entity_submitter_id': 'TCGA-61-1903', 'note...",200210513,05da084e2d65c34aa87bf865483f8b6d,13c2fa97-02ed-4442-aad8-9c4e6b365adc,...,0,0,0,0,0,0,0,0,0,0
2,SVS,open,[{'entity_submitter_id': 'TCGA-42-2587-01A-01-...,TCGA-42-2587-01A-01-TS1.f9c60f94-e626-4e40-849...,TCGA.42.2587,Biospecimen,"[{'entity_submitter_id': 'TCGA-42-2587', 'note...",138771709,c9d89dc3808a0df5bcfb7fe39994b8ca,d93b71fa-bfe0-4402-876a-b51edef5ef86,...,0,0,0,0,0,0,0,0,0,0
5,SVS,open,[{'entity_submitter_id': 'TCGA-42-2588-01A-01-...,TCGA-42-2588-01A-01-TS1.cc3b36dc-1ce9-4db3-998...,TCGA.42.2588,Biospecimen,"[{'entity_submitter_id': 'TCGA-42-2588', 'note...",116102537,9f04fbc5ce72fabcb0a1009372df12c5,525e99fd-2d3f-49b9-bf2f-bbacce843c16,...,0,0,0,0,0,0,0,0,0,0
6,SVS,open,[{'entity_submitter_id': 'TCGA-24-1416-01A-01-...,TCGA-24-1416-01A-01-BS1.9af08ca1-7925-4689-9ad...,TCGA.24.1416,Biospecimen,,160023955,d3d86ff3007ce6ac4adf69bd91dee64d,3170f418-ead5-48b7-9e49-6bc256f36e75,...,0,0,0,0,0,0,0,0,0,0
7,SVS,open,[{'entity_submitter_id': 'TCGA-25-1635-01A-01-...,TCGA-25-1635-01A-01-TS1.e3fb13c5-3313-4116-9af...,TCGA.25.1635,Biospecimen,,438392073,c3ab37ec807838c15042be1d5822c4da,4c90835a-125c-41a1-a2db-42d5a578c4f7,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,SVS,open,[{'entity_submitter_id': 'TCGA-13-1484-01A-01-...,TCGA-13-1484-01A-01-BS1.dfebf9da-d2d8-42cd-a94...,TCGA.13.1484,Biospecimen,,271158307,17bad5ae203854c3fa27beeaeb1d51ce,a20a6cb2-2da0-491d-a22a-3bd74fb3062e,...,0,0,0,0,0,0,0,0,0,0
1365,SVS,open,[{'entity_submitter_id': 'TCGA-13-0906-01A-01-...,TCGA-13-0906-01A-01-BS1.e8aa0544-cc48-4941-9d7...,TCGA.13.0906,Biospecimen,,175722371,dff143da3e0fb16bf4b0559a2a92af77,81a8c4e8-f40d-4e3e-b18c-f869917c50cb,...,0,0,0,0,0,0,0,0,0,0
1366,SVS,open,[{'entity_submitter_id': 'TCGA-13-0906-01A-01-...,TCGA-13-0906-01A-01-TS1.7be7e649-9db2-4a60-b12...,TCGA.13.0906,Biospecimen,,315044077,2e9988fa5644ccd92fcc669d9d369e24,d1217464-aa0f-4f9e-aeb0-9513886939c5,...,0,0,0,0,0,0,0,0,0,0
1367,SVS,open,[{'entity_submitter_id': 'TCGA-04-1348-01A-01-...,TCGA-04-1348-01A-01-TS1.ffb07f65-72b7-494c-abf...,TCGA.04.1348,Biospecimen,,109640757,1d974d73037217e9a9d97e08023d6eeb,019607b4-e183-46ee-b062-9abcbe54ceb5,...,0,0,0,0,0,0,0,0,0,0


In [42]:
metadata_latent = pd.merge(metadata_df,
                           pd.concat(all_latent_df),
                           on='file_name')

In [43]:
all_data = pd.merge(metadata_latent,
                    gene_patient.reset_index(drop=True),
                    on='file_id',
                    how='inner')

In [44]:
all_data_mutated = pd.merge(metadata_latent,
                            gene_mutated_patient.reset_index(drop=True),
                            on='file_id',
                            how='inner')

In [45]:
all_data_mutated

Unnamed: 0,data_format_x,access_x,associated_entities_x,file_name_x,submitter_id_x,data_category_x,annotations_x,file_size_x,md5sum_x,file_id,...,30,31,32,33,34,35,36,37,38,39
0,SVS,open,[{'entity_submitter_id': 'TCGA-61-1903-01A-01-...,TCGA-61-1903-01A-01-BS1.77116a06-9e30-4bf6-885...,TCGA.61.1903,Biospecimen,"[{'entity_submitter_id': 'TCGA-61-1903', 'note...",200210513,05da084e2d65c34aa87bf865483f8b6d,13c2fa97-02ed-4442-aad8-9c4e6b365adc,...,0,0,0,0,0,0,0,0,0,0
1,SVS,open,[{'entity_submitter_id': 'TCGA-42-2587-01A-01-...,TCGA-42-2587-01A-01-TS1.f9c60f94-e626-4e40-849...,TCGA.42.2587,Biospecimen,"[{'entity_submitter_id': 'TCGA-42-2587', 'note...",138771709,c9d89dc3808a0df5bcfb7fe39994b8ca,d93b71fa-bfe0-4402-876a-b51edef5ef86,...,0,0,0,0,0,0,0,0,0,0
2,SVS,open,[{'entity_submitter_id': 'TCGA-42-2588-01A-01-...,TCGA-42-2588-01A-01-TS1.cc3b36dc-1ce9-4db3-998...,TCGA.42.2588,Biospecimen,"[{'entity_submitter_id': 'TCGA-42-2588', 'note...",116102537,9f04fbc5ce72fabcb0a1009372df12c5,525e99fd-2d3f-49b9-bf2f-bbacce843c16,...,0,0,0,0,0,0,0,0,0,0
3,SVS,open,[{'entity_submitter_id': 'TCGA-24-1416-01A-01-...,TCGA-24-1416-01A-01-BS1.9af08ca1-7925-4689-9ad...,TCGA.24.1416,Biospecimen,,160023955,d3d86ff3007ce6ac4adf69bd91dee64d,3170f418-ead5-48b7-9e49-6bc256f36e75,...,0,0,0,0,0,0,0,0,0,0
4,SVS,open,[{'entity_submitter_id': 'TCGA-25-1635-01A-01-...,TCGA-25-1635-01A-01-TS1.e3fb13c5-3313-4116-9af...,TCGA.25.1635,Biospecimen,,438392073,c3ab37ec807838c15042be1d5822c4da,4c90835a-125c-41a1-a2db-42d5a578c4f7,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,SVS,open,[{'entity_submitter_id': 'TCGA-13-1484-01A-01-...,TCGA-13-1484-01A-01-BS1.dfebf9da-d2d8-42cd-a94...,TCGA.13.1484,Biospecimen,,271158307,17bad5ae203854c3fa27beeaeb1d51ce,a20a6cb2-2da0-491d-a22a-3bd74fb3062e,...,0,0,0,0,0,0,0,0,0,0
1067,SVS,open,[{'entity_submitter_id': 'TCGA-13-0906-01A-01-...,TCGA-13-0906-01A-01-BS1.e8aa0544-cc48-4941-9d7...,TCGA.13.0906,Biospecimen,,175722371,dff143da3e0fb16bf4b0559a2a92af77,81a8c4e8-f40d-4e3e-b18c-f869917c50cb,...,0,0,0,0,0,0,0,0,0,0
1068,SVS,open,[{'entity_submitter_id': 'TCGA-13-0906-01A-01-...,TCGA-13-0906-01A-01-TS1.7be7e649-9db2-4a60-b12...,TCGA.13.0906,Biospecimen,,315044077,2e9988fa5644ccd92fcc669d9d369e24,d1217464-aa0f-4f9e-aeb0-9513886939c5,...,0,0,0,0,0,0,0,0,0,0
1069,SVS,open,[{'entity_submitter_id': 'TCGA-04-1348-01A-01-...,TCGA-04-1348-01A-01-TS1.ffb07f65-72b7-494c-abf...,TCGA.04.1348,Biospecimen,,109640757,1d974d73037217e9a9d97e08023d6eeb,019607b4-e183-46ee-b062-9abcbe54ceb5,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Get unique values of the stratify column
from sklearn.model_selection import train_test_split

unique_values = all_data['submitter_id_x'].unique()

# Split unique values into train and test
train_ids, test_ids = train_test_split(all_data_mutated.index,
                                       test_size=0.2,
                                       random_state=42)

In [49]:
target_labels = all_data.columns[-35:]

In [50]:
genes_ovarian_cancer = [
    "BRCA1", "BRCA2", "TP53", "RAD51C", "RAD51D", "PALB2", "ATM", "CHEK2",
    "PTEN", "ARID1A"
]


In [51]:
input_labels = [f"emb_{i}" for i in range(1024 * 4)]

In [52]:
gene_mutations_relevant.shape

(465, 40)

In [72]:
(gene_mutations_relevant == 0).sum().values


array([ 80, 457, 459, 462, 460, 461, 460, 461, 456, 460, 447, 461, 457,
       462, 459, 460, 452, 454, 462, 462, 461, 462, 462, 462, 460, 461,
       462, 459, 462, 460, 458, 458, 462, 462, 441, 462, 462, 460, 459,
       459])

In [54]:
classification_labels = gene_mutations_relevant.columns

In [55]:
from sklearn.preprocessing import LabelEncoder

encode_p = LabelEncoder().fit_transform(all_data['submitter_id_x'])

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, RandomForestClassifier
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, accuracy_score, f1_score, classification_report, r2_score
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

In [59]:
# pca = PCA(n_components=700)

# pcaed = pca.fit_transform(all_data[input_labels])

# all_data_copy = all_data.copy()


In [60]:
train_data = all_data.loc[train_ids]
test_data = all_data.loc[test_ids]

In [61]:
train_data_class = all_data_mutated.loc[train_ids]
test_data_class = all_data_mutated.loc[test_ids]

In [62]:
X_train, X_test, y_train, y_test = train_data[input_labels], test_data[
    input_labels], train_data[target_labels], test_data[target_labels]

In [63]:
X_train, X_test = train_data_class[input_labels], test_data_class[input_labels]

In [64]:
y_class_train = train_data_class[classification_labels]
y_class_test = test_data_class[classification_labels]

In [74]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Example features and multilabel target
# X = [[0, 1], [1, 0], [0, 0], [1, 1]]  # Replace with your features
# y = [[1, 0, 1], [0, 1, 0], [1, 1, 1], [0, 0, 0]]  # Binary-encoded target

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Wrap the classifier
model = OneVsRestClassifier(LGBMClassifier(n_jobs=-1))
model.fit(scaler.fit_transform(X_train), y_class_train)

# Predict
y_pred = model.predict(scaler.transform(X_test))




[LightGBM] [Info] Number of positive: 704, number of negative: 152
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.822430 -> initscore=1.532898
[LightGBM] [Info] Start training from score 1.532898




[LightGBM] [Info] Number of positive: 12, number of negative: 844
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064952 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014019 -> initscore=-4.253246
[LightGBM] [Info] Start training from score -4.253246
[LightGBM] [Info] Number of positive: 10, number of negative: 846




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061003 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011682 -> initscore=-4.437934
[LightGBM] [Info] Start training from score -4.437934
[LightGBM] [Info] Number of positive: 4, number of negative: 852
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057396 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004673 -> initscore=-5.361292
[LightGBM] [Info] Start training from score -5.361292
[LightGBM] [Info] Number of positive: 9, number of negative: 847




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.058951 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010514 -> initscore=-4.544476
[LightGBM] [Info] Start training from score -4.544476




[LightGBM] [Info] Number of positive: 8, number of negative: 848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059362 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009346 -> initscore=-4.663439
[LightGBM] [Info] Start training from score -4.663439
[LightGBM] [Info] Number of positive: 8, number of negative: 848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061397 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009346 -> initscore=-4.663439
[LightGBM] [Info] Start training from score -4.663439
[LightGBM] [Info] Number of positive: 8, number of negative: 848




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061709 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009346 -> initscore=-4.663439
[LightGBM] [Info] Start training from score -4.663439
[LightGBM] [Info] Number of positive: 17, number of negative: 839




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019860 -> initscore=-3.898997
[LightGBM] [Info] Start training from score -3.898997
[LightGBM] [Info] Number of positive: 7, number of negative: 849




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063870 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008178 -> initscore=-4.798149
[LightGBM] [Info] Start training from score -4.798149




[LightGBM] [Info] Number of positive: 31, number of negative: 825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060669 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036215 -> initscore=-3.281396
[LightGBM] [Info] Start training from score -3.281396
[LightGBM] [Info] Number of positive: 12, number of negative: 844




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014019 -> initscore=-4.253246
[LightGBM] [Info] Start training from score -4.253246
[LightGBM] [Info] Number of positive: 16, number of negative: 840
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064222 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018692 -> initscore=-3.960813
[LightGBM] [Info] Start training from score -3.960813




[LightGBM] [Info] Number of positive: 6, number of negative: 850
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.007009 -> initscore=-4.953477
[LightGBM] [Info] Start training from score -4.953477
[LightGBM] [Info] Number of positive: 9, number of negative: 847
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059231 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010514 -> initscore=-4.544476
[LightGBM] [Info] Start training from score -4.544476
[LightGBM] [Info] Number of positive: 10, number of negative: 846
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057007 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011682 -> initscore=-4.437934
[LightGBM] [Info] Start training from score -4.437934
[LightGBM] [Info] Number of positive: 26, number of negative: 830




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059836 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030374 -> initscore=-3.463329
[LightGBM] [Info] Start training from score -3.463329




[LightGBM] [Info] Number of positive: 19, number of negative: 837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060628 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.022196 -> initscore=-3.785385
[LightGBM] [Info] Start training from score -3.785385




[LightGBM] [Info] Number of positive: 7, number of negative: 849
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008178 -> initscore=-4.798149
[LightGBM] [Info] Start training from score -4.798149
[LightGBM] [Info] Number of positive: 4, number of negative: 852
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056164 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004673 -> initscore=-5.361292
[LightGBM] [Info] Start training from score -5.361292




[LightGBM] [Info] Number of positive: 7, number of negative: 849
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008178 -> initscore=-4.798149
[LightGBM] [Info] Start training from score -4.798149




[LightGBM] [Info] Number of positive: 7, number of negative: 849
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072363 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008178 -> initscore=-4.798149
[LightGBM] [Info] Start training from score -4.798149
[LightGBM] [Info] Number of positive: 6, number of negative: 850
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065531 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.007009 -> initscore=-4.953477
[LightGBM] [Info] Start training from score -4.953477
[LightGBM] [Info] Number of positive: 7, number of negative: 849




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008178 -> initscore=-4.798149
[LightGBM] [Info] Start training from score -4.798149
[LightGBM] [Info] Number of positive: 12, number of negative: 844
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057930 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014019 -> initscore=-4.253246
[LightGBM] [Info] Start training from score -4.253246
[LightGBM] [Info] Number of positive: 6, number of negative: 850




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057815 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.007009 -> initscore=-4.953477
[LightGBM] [Info] Start training from score -4.953477
[LightGBM] [Info] Number of positive: 5, number of negative: 851




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059283 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005841 -> initscore=-5.136974
[LightGBM] [Info] Start training from score -5.136974
[LightGBM] [Info] Number of positive: 12, number of negative: 844
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059695 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014019 -> initscore=-4.253246
[LightGBM] [Info] Start training from score -4.253246
[LightGBM] [Info] Number of positive: 2, number of negative: 854
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060760 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002336 -> initscore=-6.056784
[LightGBM] [Info] Start training from score -6.056784




[LightGBM] [Info] Number of positive: 9, number of negative: 847
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010514 -> initscore=-4.544476
[LightGBM] [Info] Start training from score -4.544476
[LightGBM] [Info] Number of positive: 13, number of negative: 843




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015187 -> initscore=-4.172018
[LightGBM] [Info] Start training from score -4.172018
[LightGBM] [Info] Number of positive: 11, number of negative: 845
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.062981 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.012850 -> initscore=-4.341441
[LightGBM] [Info] Start training from score -4.341441




[LightGBM] [Info] Number of positive: 7, number of negative: 849
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066467 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008178 -> initscore=-4.798149
[LightGBM] [Info] Start training from score -4.798149




[LightGBM] [Info] Number of positive: 6, number of negative: 850
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060987 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.007009 -> initscore=-4.953477
[LightGBM] [Info] Start training from score -4.953477
[LightGBM] [Info] Number of positive: 40, number of negative: 816
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072589 seconds.
You can set `force_col_wise=true` to remove the overhead.




[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.046729 -> initscore=-3.015535
[LightGBM] [Info] Start training from score -3.015535
[LightGBM] [Info] Number of positive: 8, number of negative: 848




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009346 -> initscore=-4.663439
[LightGBM] [Info] Start training from score -4.663439
[LightGBM] [Info] Number of positive: 5, number of negative: 851




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005841 -> initscore=-5.136974
[LightGBM] [Info] Start training from score -5.136974




[LightGBM] [Info] Number of positive: 8, number of negative: 848
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.058010 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009346 -> initscore=-4.663439
[LightGBM] [Info] Start training from score -4.663439
[LightGBM] [Info] Number of positive: 12, number of negative: 844




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065609 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014019 -> initscore=-4.253246
[LightGBM] [Info] Start training from score -4.253246




[LightGBM] [Info] Number of positive: 11, number of negative: 845
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1017285
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 4010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.012850 -> initscore=-4.341441
[LightGBM] [Info] Start training from score -4.341441




In [75]:
y_pred_proba = model.predict_proba(scaler.transform(X_test))




In [77]:
y_class_test.values.sum(axis=1)

array([1, 1, 0, 4, 4, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2,
       0, 0, 1, 1, 1, 1, 2, 2, 2, 1, 1, 0, 2, 1, 1, 0, 3, 2, 1, 3, 1, 1,
       5, 1, 1, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1,
       2, 2, 2, 0, 0, 2, 1, 0, 1, 4, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2,
       2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 2, 2, 1, 0,
       2, 2, 1, 1, 3, 1, 1, 1, 0, 2, 0, 1, 1, 3, 2, 1, 2, 2, 2, 2, 1, 1,
       0, 3, 2, 1, 1, 5, 1, 2, 1, 4, 1, 2, 0, 1, 2, 2, 1, 1, 2, 1, 2, 1,
       2, 1, 1, 1, 2, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 3, 0, 1, 1, 1, 1,
       2, 1, 2, 1, 1, 2, 2, 0, 3, 1, 1, 1, 1, 1, 2, 2, 2])

In [78]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report


y_pred = (y_pred_proba[:, :] > 1e-01).astype(int)
precision = precision_score(y_class_test.values, y_pred, average='samples')
recall = recall_score(y_class_test.values, y_pred, average='samples')
f1 = f1_score(y_class_test.values, y_pred, average='samples')

print(f"Precision (micro): {precision}")
print(f"Recall (micro): {recall}")
print(f"F1 Score (micro): {f1}")


Precision (micro): 0.8255813953488372
Recall (micro): 0.6731782945736435
F1 Score (micro): 0.7167441860465117


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [79]:
from sklearn.metrics import roc_auc_score

In [80]:
for i in range(39):
    try:
        print(roc_auc_score(y_class_test.values[:, i], y_pred_proba[:, i]))
        print(y_class_test.values[:, i].sum())
    except:
        continue

0.6944444444444444
179
0.5466666666666666
5
0.3075117370892019
2
0.3915094339622641
3
0.7065390749601276
6
nan
0
0.36163522012578614
3
0.795774647887324
2
0.6255924170616114
4
0.43838862559241704
4
0.5694444444444444
8
0.7816901408450704
2
0.48130841121495327
1
nan
0
0.789308176100629
3
0.6338028169014085
2
0.511737089201878
2
0.565390749601276
6
0.9906542056074766
1
0.6713615023474178
2
0.9112149532710281
1
0.780373831775701
1
0.11971830985915494
2
0.33098591549295775
2
0.31690140845070425
2
0.5220125786163522
3
0.8317757009345794
1
0.6431924882629108
2
0.7885714285714286
5
0.5367298578199052
4
0.11214953271028039
1
0.580952380952381
5
0.985981308411215
1
nan
0
0.5980810234541578
14
nan
0
0.9579439252336449
1
0.8215962441314554
2
0.21361502347417838
2




In [None]:
print(classification_report(y_class_test, y_pred))

In [None]:
from sklearn.metrics import hamming_loss

# Example
hamming = hamming_loss(y_class_test, y_pred)
print(f"Hamming Loss: {hamming}")


In [None]:
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np

X, y1 = make_classification(n_samples=10,
                            n_features=100,
                            n_informative=30,
                            n_classes=3,
                            random_state=1)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
Y = np.vstack((y1, y2, y3)).T
n_samples, n_features = X.shape  # 10,100
n_outputs = Y.shape[1]  # 3
n_classes = 3
forest = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=2)
multi_target_forest.fit(X, Y).predict(X)

In [None]:
# for i in range(len(target_labels)):
model = RandomForestRegressor(n_jobs=-1, n_estimators=100)
model.fit(X_train, y_train.values.astype(float))

# Make predictions
y_pred = model.predict(X_test)
# print(mean_squared_error(y_test.flatten(), y_pred.flatten()))
print(r2_score(y_test, y_pred))

In [None]:
y_test.shape

In [197]:
best_feats_idx = np.argsort(model.feature_importances_, )[-1500:]

In [198]:
num_feats = 1500

In [199]:
# from sklearn.feature_selection import RFE
# from sklearn.ensemble import ExtraTreesRegressor

# rfe_selector = RFE(estimator=RandomForestRegressor(n_jobs=-1),
#                    n_features_to_select=num_feats,
#                    step=500,
#                    verbose=5)
# rfe_selector.fit(X_train, y_train)
# rfe_support = rfe_selector.get_support()
# # rfe_feature = X[corrs.dropna().index].loc[:,rfe_support].columns.tolist()
# # print(str(len(rfe_feature)), 'selected features')

In [None]:
rfe_selector.__dict__


In [None]:
rfe_support.sum()

In [202]:
params_opt = {
    'n_estimators': 800,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    # 'max_features': 'sqrt'
    'max_depth': 15,
    'random_state': 42
}

In [None]:
# for i in range(len(target_labels)):
model2 = RandomForestRegressor(n_jobs=-1, **params_opt)
model2.fit(X_train.iloc[:, best_feats_idx], y_train.values.astype(float))

# Make predictions
y_pred = model2.predict(X_test.iloc[:, best_feats_idx])
# print(mean_squared_error(y_test.flatten(), y_pred.flatten()))
print(r2_score(y_test, y_pred))

In [None]:
RandomForestRegressor?

In [None]:
# from catboost import CatBoostRegressor

# cb_reg = CatBoostRegressor(
#     objective='MultiRMSE',
#     verbose=0,
#     learning_rate=0.1,
#     n_estimators=10,
#     #    num_boost_round=10
# )
# cb_reg.fit(X_train.values, y_train)
# cb_pred = cb_reg.predict(X_test)
# # pd.DataFrame(cb_pred, columns=['Y1', 'Y2'])

In [None]:
CatBoostRegressor?

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Define parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [100, 150, 200, 500, 800],
    'max_depth': [None, 10, 15, 20, 30, 40],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 10],
    'max_features': ['sqrt']
    # 'max_features': [None, 'sqrt', 'log2']
}

# Initialize the RandomForestRegressor
rf = RandomForestRegressor(n_jobs=-1)

# RandomizedSearchCV for lightweight optimization
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=40,  # Number of parameter settings to sample
    cv=3,  # Number of folds for cross-validation
    scoring='r2',
    verbose=1,
    random_state=42,
    n_jobs=7)

# Fit the random search to the data
random_search.fit(X_train.iloc[:, best_feats_idx],
                  y_train.values)  # Flatten y_train for regression

# Best parameters and evaluation
best_model = random_search.best_estimator_
print(f"Best Parameters: {random_search.best_params_}")

# Evaluate the best model
y_pred = best_model.predict(X_test.iloc[:, best_feats_idx])
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")


In [None]:
y_train.shape

In [None]:
y_pred_flat.shape

In [78]:
# import lightgbm as lgb
# from sklearn.metrics import r2_score
# import numpy as np

# correls = []
# # Assuming y_train and y_test are numpy arrays with shape (n_samples, n_targets)
# for i in range(y_train.shape[1]):  # Loop over each target variable
#     # Prepare the LightGBM dataset
#     train_data = lgb.Dataset(X_train, label=y_train.values[:, i])
#     test_data = lgb.Dataset(X_test,
#                             label=y_test.values[:, i],
#                             reference=train_data)

#     # Set parameters
#     params = {
#         'objective': 'regression',
#         'metric': 'rmse',
#         'boosting_type': 'gbdt',
#         'n_jobs': -1,
#         'verbosity': -1,
#         # 'num_leaves': 500,  # Large number of leaves to increase complexity
#         # 'max_depth': -1,  # No depth limit
#         # 'min_data_in_leaf': 1,  # Allow very small leaves
#         # 'min_child_samples': 1,  # Reduce the minimum data per child
#         # 'lambda_l1': 0,  # No L1 regularization
#         # 'lambda_l2': 0,  # No L2 regularization
#         # 'learning_rate': 0.1,  # Larger learning rate for faster overfitting
#         # 'feature_fraction': 1.0,  # Use all features
#         # 'bagging_fraction': 1.0,  # Use all data
#         # 'bagging_freq': 0,  # No bagging
#     }

#     # Train the model
#     model = lgb.train(
#         params,
#         train_data,
#         num_boost_round=100,
#         valid_sets=[train_data, test_data],
#         valid_names=['train', 'valid'],
#         #   early_stopping_rounds=10,
#         # verbose_eval=False
#     )

#     # Make predictions
#     y_pred = model.predict(X_test)

#     # Evaluate the model
#     r2 = r2_score(y_test.values[:, i], y_pred)
#     print(f"R2 score for target {i}: {r2}")
#     print(
#         np.corrcoef(y_test.values[:, i].flatten().astype(float),
#                     y_pred.flatten()))

#     correls.append(
#         np.corrcoef(y_test.values[:, i].flatten().astype(float),
#                     y_pred.flatten()))


In [206]:
correls = []
for i in range(len(target_labels)):
    correls.append(
        np.corrcoef(y_test.values[:, i].flatten().astype(float),
                    y_pred[:, i].flatten())[0, 1])

In [None]:
sorted(correls)[::-1]

In [208]:
# for i in range(len(target_labels)):
#     model = RandomForestRegressor()
#     model.fit(X_train, y_train.values[:, i])

#     # Make predictions
#     y_pred = model.predict(X_test)
#     # print(mean_squared_error(y_test.flatten(), y_pred.flatten()))
#     print(r2_score(y_test.values[:, i], y_pred))
#     print(
#         np.corrcoef(y_test.values[:, i].flatten().astype(float),
#                     y_pred.flatten())[0, 1])

In [None]:
y_test.values.flatten()

In [None]:
y_pred.flatten()

In [None]:
y_test.shape

In [212]:
def find_vector_duplicates(array1, array2):
    # Convert each vector to a tuple for hashable comparison
    set1 = {tuple(vec) for vec in array1}
    duplicates = [vec for vec in array2 if tuple(vec) in set1]
    return duplicates

In [None]:
gene_names.iloc[genes][0].values

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data extracted from the plot (example data, replace with actual values)
sort_idx = np.argsort(correls)  #[::-1]
genes = np.array(target_labels)[sort_idx]

# Plotting
y_pos = np.arange(len(genes))

plt.figure(figsize=(10, 8))
plt.barh(y_pos - 0.2,
         np.array(correls)[sort_idx],
         height=0.4,
         label='Pearson',
         color='blue',
         alpha=0.7)
# plt.barh(y_pos + 0.2, spearman, height=0.4, label='Spearman', color='orange', alpha=0.7)

plt.yticks(y_pos, gene_names.iloc[genes][0].values)
plt.xlabel("Correlation Coefficient")
plt.title("Correlation Pearson")
plt.axvline(0, color='black', linewidth=0.8, linestyle='--')
plt.legend()
plt.tight_layout()
plt.grid()
plt.show()


In [None]:
y_test.shape

In [None]:
np.corrcoef(y_test.T, y_pred.T)

In [None]:
pca_vecs = pca.fit_transform(latent_vecs)

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
pca.get_covariance()

In [None]:
np.corrcoef(pca_vecs[:, 0], [pca_vecs[:, 1]])

In [None]:
plt.scatter(pca_vecs[:, 0], [pca_vecs[:, 1]])
plt.xlim(-0.5, 0.5)
plt.ylim(-0.5, 0.5)