In [21]:
import pandas as pd
import numpy as np 
import pyarrow.parquet as pq

In [22]:
# file_path = '/protein-abundance-PMDA2025/all_organisms_filtered_without_M.musculus_KIDNEY.parquet'
file_path = '/Users/morgensh/Documents/UZH/Roche PMDA/protein-abundance-PMDA2025/all_organisms_filtered_without_M.musculus_KIDNEY.parquet'
#schema = pq.read_schema(file_path)
#print(schema.names)
df = pd.read_parquet(file_path)


In [23]:
df['integrated_score'] = np.where(df['is_integrated'] == True, df['quality_score'], np.nan)
df['non_integrated_score'] = np.where(df['is_integrated'] == False, df['quality_score'], np.nan)

species_breakdown_updated = df.groupby('organism_name').agg(
    total_protein_entries=('EnsemblProteinID', 'count'),
    unique_proteins_name=('UniprotEntryName', 'nunique'),
    unique_proteins_id=('UniprotAccession', 'nunique'),
    avg_quality_score=('quality_score', 'mean'),
    avg_integrated_quality=('integrated_score', 'mean'),
    avg_non_integrated_quality=('non_integrated_score', 'mean')
)

species_breakdown_updated = species_breakdown_updated.sort_values(by='total_protein_entries', ascending=False)
score_cols = ['avg_quality_score', 'avg_integrated_quality', 'avg_non_integrated_quality']
species_breakdown_updated[score_cols] = species_breakdown_updated[score_cols].round(2)



In [24]:
print(species_breakdown_updated.head(5))

               total_protein_entries  unique_proteins_name  \
organism_name                                                
H.sapiens                    1797533                 19047   
M.musculus                    489864                 19439   
A.thaliana                    426081                 21866   
Taestivum                     247308                 38372   
Mmulatta                      118524                 12146   

               unique_proteins_id  avg_quality_score  avg_integrated_quality  \
organism_name                                                                  
H.sapiens                   18673              19.97                   29.83   
M.musculus                  19117              17.39                   25.40   
A.thaliana                  21763              11.64                   16.54   
Taestivum                   38372              17.27                   22.55   
Mmulatta                    12146               7.82                   10.11   

    

In [25]:
colnames = df.columns.tolist()
colnames

['dataset_id',
 'dataset_name',
 'dataset_description',
 'organism_id',
 'organism_name',
 'sample_organ',
 'is_integrated',
 'quality_score',
 'coverage',
 'publication_link',
 'publication_year',
 'original_filename',
 'string_external_id',
 'EnsemblProteinID',
 'mapped_from',
 'UniprotEntryName',
 'UniprotAccession',
 'ProteinName',
 'GeneSymbol',
 'Sequence',
 'uniprot_status',
 'nog_id',
 'abundance',
 'EnsemblTranscript',
 'EnsemblGene',
 'integrated_score',
 'non_integrated_score']

# Protein Network Embeddings --> Abundance Prediction
- We will use the SPACE network embeddings of ortholog info and protein-protein interaction (embedding dim 512)
- We will then train a inference head (regression?) to predict abundance on whole-organism data



In [None]:
import h5py

filename = "data/protein.network.embeddings.v12.0.h5"

In [27]:
# filepath: /Users/morgensh/Documents/UZH/Roche PMDA/protein-abundance-PMDA2025/PPI_to_abundance.ipynb
with h5py.File(filename, 'r') as f:
    print("Root keys:", list(f.keys()))
    meta_keys = f['metadata'].attrs.keys()
    for key in meta_keys:
        print(key, f['metadata'].attrs[key])


Root keys: ['metadata', 'species']
embedding_dim 512
n_species 1322
precision 16
total_proteins 21212112


In [28]:
def print_all_datasets(name, obj):
    if isinstance(obj, h5py.Dataset):
        print(name)

with h5py.File(filename, 'r') as f:
    f.visititems(print_all_datasets)
    

species/1001832/embeddings
species/1001832/proteins
species/1001833/embeddings
species/1001833/proteins
species/1001938/embeddings
species/1001938/proteins
species/10020/embeddings
species/10020/proteins
species/10029/embeddings
species/10029/proteins
species/1003232/embeddings
species/1003232/proteins
species/10036/embeddings
species/10036/proteins
species/10047/embeddings
species/10047/proteins
species/100787/embeddings
species/100787/proteins
species/100816/embeddings
species/100816/proteins
species/10089/embeddings
species/10089/proteins
species/10090/embeddings
species/10090/proteins
species/10093/embeddings
species/10093/proteins
species/10096/embeddings
species/10096/proteins
species/10103/embeddings
species/10103/proteins
species/101091/embeddings
species/101091/proteins
species/101127/embeddings
species/101127/proteins
species/10116/embeddings
species/10116/proteins
species/10141/embeddings
species/10141/proteins
species/10160/embeddings
species/10160/proteins
species/1016849/

In [29]:
with h5py.File(filename, 'r') as f:
    species_group = f['species']
    for species_id in species_group:
        group = species_group[species_id]
        print(f"Species ID: {species_id}")
        # Print group attributes
        for attr in group.attrs:
            print(f"  Attribute: {attr} = {group.attrs[attr]}")
        # Print datasets in the group
        for ds in group:
            print(f"  Dataset: {ds}")
        print("-" * 40)
        

Species ID: 1001832
  Attribute: n_proteins = 11156
  Dataset: embeddings
  Dataset: proteins
----------------------------------------
Species ID: 1001833
  Attribute: n_proteins = 11701
  Dataset: embeddings
  Dataset: proteins
----------------------------------------
Species ID: 1001938
  Attribute: n_proteins = 12244
  Dataset: embeddings
  Dataset: proteins
----------------------------------------
Species ID: 10020
  Attribute: n_proteins = 16902
  Dataset: embeddings
  Dataset: proteins
----------------------------------------
Species ID: 10029
  Attribute: n_proteins = 23874
  Dataset: embeddings
  Dataset: proteins
----------------------------------------
Species ID: 1003232
  Attribute: n_proteins = 4189
  Dataset: embeddings
  Dataset: proteins
----------------------------------------
Species ID: 10036
  Attribute: n_proteins = 18250
  Dataset: embeddings
  Dataset: proteins
----------------------------------------
Species ID: 10047
  Attribute: n_proteins = 21115
  Dataset: e

In [18]:
all_protein_ids = []

with h5py.File(filename, 'r') as f:
    def collect_proteins(name, obj):
        if isinstance(obj, h5py.Dataset) and name.endswith('/proteins'):
            protein_ids = obj[:]
            # If stored as bytes, decode to strings
            protein_ids = [pid.decode('utf-8') for pid in protein_ids]
            all_protein_ids.extend(protein_ids)
    f.visititems(collect_proteins)

print(f"Total protein IDs: {len(all_protein_ids)}")
print(all_protein_ids[:10])  # Print first 10 as a check

Total protein IDs: 21212112
['1001832.A0A1Y2WP95', '1001832.A0A1Y2WPA2', '1001832.A0A1Y2WLE4', '1001832.A0A1Y2X4S7', '1001832.A0A1Y2WY14', '1001832.A0A1Y2WUY6', '1001832.A0A1Y2XC72', '1001832.A0A1Y2WT67', '1001832.A0A1Y2X149', '1001832.A0A1Y2X2R6']


Okay, indeed all the files are names after proteins or embeddings
Their structure is:
```
species/1001832/embeddings
species/1001832/proteins
species/1001833/embeddings
species/1001833/proteins
...
```

Dataset facts:
- Root keys: ['metadata', 'species']
- embedding_dim 512
- n_species 1322
- precision 16
- total_proteins 21212112

The number of proteins is confirmed, and they are listed as STRING IDs:
`Total protein IDs: 21212112
['1001832.A0A1Y2WP95', '1001832.A0A1Y2WPA2', '1001832.A0A1Y2WLE4', '1001832.A0A1Y2X4S7', '1001832.A0A1Y2WY14', '1001832.A0A1Y2WUY6', '1001832.A0A1Y2XC72', '1001832.A0A1Y2WT67', '1001832.A0A1Y2X149', '1001832.A0A1Y2X2R6']`

Each species is including this info/structure:
```
Species ID: 1001832
  Attribute: n_proteins = 11156
  Dataset: embeddings
  Dataset: proteins
----------------------------------------
```

Each of the proteins/embeddings looks like:
```
Species ID: 1001832
  Dataset: embeddings
    Shape: (11156, 512)
    Dtype: float16
    First 3 entries: [[ 0.0181   -0.01576   0.00916  ... -0.003796  0.01107  -0.01339 ]
 [-0.01518   0.0331    0.05798  ... -0.01978  -0.0138   -0.07825 ]
 [ 0.00173  -0.02208   0.01195  ...  0.000673 -0.01116  -0.002357]]
  
  Dataset: proteins
    Shape: (11156,)
    Dtype: object
    First 3 entries: [b'1001832.A0A1Y2WP95' b'1001832.A0A1Y2WPA2' b'1001832.A0A1Y2WLE4']
----------------------------------------
```

In [19]:
with h5py.File(filename, 'r') as f:
    species_group = f['species']
    first_species_id = list(species_group.keys())[0]
    group = species_group[first_species_id]
    print(f"Species ID: {first_species_id}")
    for ds_name in group:
        ds = group[ds_name]
        print(f"  Dataset: {ds_name}")
        print(f"    Shape: {ds.shape}")
        print(f"    Dtype: {ds.dtype}")
        # Print attributes of the dataset
        for attr in ds.attrs:
            print(f"    Attribute: {attr} = {ds.attrs[attr]}")
        print(f"    First 3 entries: {ds[:3]}")
    print("-" * 40)

Species ID: 1001832
  Dataset: embeddings
    Shape: (11156, 512)
    Dtype: float16
    First 3 entries: [[ 0.0181   -0.01576   0.00916  ... -0.003796  0.01107  -0.01339 ]
 [-0.01518   0.0331    0.05798  ... -0.01978  -0.0138   -0.07825 ]
 [ 0.00173  -0.02208   0.01195  ...  0.000673 -0.01116  -0.002357]]
  Dataset: proteins
    Shape: (11156,)
    Dtype: object
    First 3 entries: [b'1001832.A0A1Y2WP95' b'1001832.A0A1Y2WPA2' b'1001832.A0A1Y2WLE4']
----------------------------------------


In [20]:
with h5py.File(filename, 'r') as f:
    species_group = f['species']
    first_species_id = list(species_group.keys())[0]
    group = species_group[first_species_id]
    print(f"Species ID: {first_species_id}")
    for ds_name in group:
        ds = group[ds_name]
        print(f"  Dataset: {ds_name}")
        print(f"    Shape: {ds.shape}")
        print(f"    Dtype: {ds.dtype}")
        # Print attributes of the dataset
        for attr in ds.attrs:
            print(f"    Attribute: {attr} = {ds.attrs[attr]}")
        # Optionally, print a sample of the data
        print(f"    First 3 entries: {ds[:3]}")
    print("-" * 40)

Species ID: 1001832
  Dataset: embeddings
    Shape: (11156, 512)
    Dtype: float16
    First 3 entries: [[ 0.0181   -0.01576   0.00916  ... -0.003796  0.01107  -0.01339 ]
 [-0.01518   0.0331    0.05798  ... -0.01978  -0.0138   -0.07825 ]
 [ 0.00173  -0.02208   0.01195  ...  0.000673 -0.01116  -0.002357]]
  Dataset: proteins
    Shape: (11156,)
    Dtype: object
    First 3 entries: [b'1001832.A0A1Y2WP95' b'1001832.A0A1Y2WPA2' b'1001832.A0A1Y2WLE4']
----------------------------------------


In [None]:

# NOTE: this code collects all protein embeddings from the HDF5 file and stores them in a list. It runs extremely slow on large files, so it is not recommended for production use.
# with h5py.File(filename, 'r') as f:
#     species_group = f['species']
#     first_species_id = list(species_group.keys())[0]
#     group = species_group[first_species_id]
#     print(f"Species ID: {first_species_id}")
    
#     for species_id in f["/species"]:
#         group = f["/species"][species_id]
#         if "proteins" in group and "embeddings" in group:
#             proteins = [p.decode("utf-8") for p in group["proteins"][:]]
#             embeddings = group["embeddings"][:]
#             for protein, emb in zip(proteins, embeddings):
#                 all_protein_embeddings.append([protein] + emb.tolist())
                
#     print("-" * 40)

Species ID: 1001832


In [None]:
# get the list of STRING protein IDs from the HDF5 file and map them to the corresponding string_external_id in /data/DEF_PROCESSED_all_organisms_filtered_without_M.musculus_KIDNEY.csv

# relevant_protein_ids = "data/DEF_PROCESSED_all_organisms_filtered_without_M.musculus_KIDNEY.csv"
# all_protein_embeddings = []

# with h5py.File(filename, 'r') as f:

In [2]:
import h5py
import pandas as pd

# Load the CSV and extract the relevant STRING IDs
df_ids = pd.read_csv("data/DEF_PROCESSED_all_organisms_filtered_without_M.musculus_KIDNEY.csv")
relevant_ids = set(df_ids['string_external_id'])

print(f"Loaded {len(relevant_ids)} relevant STRING IDs")



Loaded 317251 relevant STRING IDs


In [6]:
relevant_ids

{'7955.ENSDARP00000121235',
 '7227.FBpp0082966',
 '5833.PF14_0067',
 '3847.GLYMA20G32870.2',
 '4565.Traes_4DS_D4565625E.1',
 '1286170.RORB6_21505',
 '4565.Traes_4DS_06D496233.1',
 '5691.EAN78937',
 '10090.ENSMUSP00000031091',
 '9031.ENSGALP00000007506',
 '4577.GRMZM2G112228_P01',
 '7227.FBpp0076003',
 '64091.VNG_2380H',
 '44689.DDB0214884',
 '3055.EDP04564',
 '10116.ENSRNOP00000001913',
 '10116.ENSRNOP00000063450',
 '7955.ENSDARP00000044811',
 '9913.ENSBTAP00000035821',
 '9913.ENSBTAP00000049614',
 '7955.ENSDARP00000094247',
 '10090.ENSMUSP00000047065',
 '9544.ENSMMUP00000016917',
 '9606.ENSP00000265081',
 '3847.GLYMA01G43340.2',
 '226186.BT_1313',
 '10116.ENSRNOP00000019301',
 '246196.MSMEI_3586',
 '9913.ENSBTAP00000001214',
 '511145.b0158',
 '9913.ENSBTAP00000053570',
 '5691.EAN80560',
 '4565.Traes_7AS_21C07E720.1',
 '4565.Traes_2BL_8BDCCE9F8.1',
 '257313.BP3085',
 '9031.ENSGALP00000018680',
 '4565.Traes_5DL_C381C2BE4.2',
 '10090.ENSMUSP00000042602',
 '9031.ENSGALP00000027744',
 '581

In [9]:
relevant_ids_test = list(relevant_ids)[:5]   # For testing purposes, limit to 5 IDs

In [11]:

# import h5py
# import pandas as pd

filename = 'data/protein.network.embeddings.v12.0.h5'
rows = []
relevant_ids_test = list(relevant_ids)[:5]  # Use the relevant IDs loaded from the CSV
with h5py.File(filename, 'r') as f:
    species_group = f['species']
    for species_id in species_group:
        group = species_group[species_id]
        if 'proteins' in group and 'embeddings' in group:
            protein_ids = group['proteins'][:]
            embeddings = group['embeddings'][:]
            # Decode protein IDs
            protein_ids = [pid.decode('utf-8') for pid in protein_ids]
            # Build a mapping from protein_id to embedding index for fast lookup
            pid_to_idx = {pid: idx for idx, pid in enumerate(protein_ids)}
            # Find intersection with relevant_ids
            matching_ids = set(relevant_ids_test).intersection(pid_to_idx.keys())
            for pid in matching_ids:
                idx = pid_to_idx[pid]
                emb = embeddings[idx]
                rows.append([species_id, pid] + emb.tolist())

# Build DataFrame
embedding_dim = len(rows[0]) - 2 if rows else 0
columns = ['species_id', 'protein_id'] + [f'emb_{i}' for i in range(embedding_dim)]
df = pd.DataFrame(rows, columns=columns)

print(df.head())

  species_id               protein_id     emb_0     emb_1     emb_2     emb_3  \
0       7955  7955.ENSDARP00000121235 -0.005043  0.007671  0.029221  0.019775   

      emb_4     emb_5     emb_6     emb_7  ...   emb_502   emb_503   emb_504  \
0  0.033539 -0.011253 -0.035156  0.046326  ...  0.029785 -0.014412 -0.009239   

    emb_505   emb_506   emb_507   emb_508   emb_509   emb_510   emb_511  
0 -0.014931 -0.039703  0.013649 -0.048889 -0.001273 -0.004768  0.034515  

[1 rows x 514 columns]


_______________________________________

## Combining info from SPACE and PaxDB
- There are 1000+ org in SPACE, but only 140 in PaxDB, so there's some filtering to do (on a protein level, though)
- This will be the dataset we use to train the abundance prediction model (STRING ID + 512 filtered embeddings --> abundance), SPACE-->PaxDB

In [15]:
# For relevant_ids, we should find the related information in the abundance data
# Load the abundance data
abundance_data = pd.read_csv("data/Organ_organsismDEF_PROCESSED_all_organisms_filtered_without_M.musculus_KIDNEY.csv")

In [16]:
#filter abundance_data to only include relevant_ids
abundance_data_filtered = abundance_data[abundance_data['string_external_id'].isin(relevant_ids)]
abundance_data_filtered

Unnamed: 0.1,Unnamed: 0,organism_id,organism_name,sample_organ,is_integrated,quality_score,coverage,string_external_id,EnsemblProteinID,UniprotEntryName,UniprotAccession,GeneSymbol,Sequence,nog_id,abundance,EnsemblTranscript,EnsemblGene,integrated_score,non_integrated_score,Sequence_len
0,74,7955,D.rerio,WHOLE_ORGANISM,True,31.8,57.0,7955.ENSDARP00000104955,ENSDARP00000104955,A0A024B5K5_DANRE,A0A024B5K5,"['ece2b', 'ECE2', 'si:dkey-83h2.6']",MSVALQDLRNNMSNYKRATFEEEDGTDVPVDGAISPDSVEVGFRKG...,,2.470,,,31.8,,765
1,94,9606,H.sapiens,GALLBLADDER,True,26.8,58.0,9606.ENSP00000363412,ENSP00000363412,A0A024R161_HUMAN,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,13.400,ENST00000374294,ENSG00000244115,26.8,,153
2,96,9606,H.sapiens,HEART,True,33.6,68.0,9606.ENSP00000363412,ENSP00000363412,A0A024R161_HUMAN,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,1.840,ENST00000374294,ENSG00000244115,33.6,,153
3,97,9606,H.sapiens,KIDNEY,True,31.4,59.0,9606.ENSP00000363412,ENSP00000363412,A0A024R161_HUMAN,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,3.030,ENST00000374294,ENSG00000244115,31.4,,153
4,98,9606,H.sapiens,LIVER,True,30.0,80.0,9606.ENSP00000363412,ENSP00000363412,A0A024R161_HUMAN,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,5.720,ENST00000374294,ENSG00000244115,30.0,,153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178597,1818525,9606,H.sapiens,WHOLE_ORGANISM,True,35.5,99.0,9606.ENSP00000361400,ENSP00000361400,X6REF7_HUMAN,X6REF7,['SFTPA2'],MWLCPLALTLILMAASGAACEVKDVCVGSPGIPGTPGSHGLPGRDG...,NOG033816,3.460,ENST00000372325,ENSG00000185303,35.5,,158
178598,1818526,7955,D.rerio,WHOLE_ORGANISM,True,31.8,57.0,7955.ENSDARP00000088918,ENSDARP00000088918,Z4YHZ5_DANRE,Z4YHZ5,"['si:ch211-265g22.3', 'zgc:152670']",MAMKELTLFVVCSWFVSITEQEFFSSVDQVSQLFNEEENLLDTFSL...,NOG262459,0.058,,,31.8,,490
178599,1818529,10090,M.musculus,WHOLE_ORGANISM,True,41.0,90.0,10090.ENSMUSP00000023387,ENSMUSP00000023387,Z4YJE9_MOUSE,Z4YJE9,"['Qtrt2', 'QTRT2', 'Qtrtd1']",MMKLSLIKVVNGCRLGKIQNLGKAGDCTVDIPGCLLYTRTGSAPHL...,NOG032670,21.700,ENSMUST00000023387,ENSMUSG00000022704,41.0,,345
178600,1818556,10090,M.musculus,WHOLE_ORGANISM,True,41.0,90.0,10090.ENSMUSP00000134056,ENSMUSP00000134056,Z4YNA9_MOUSE,Z4YNA9,['AB124611'],MGGEMPWTILLFASGSLAIPAPSISLVPPYPSSHEDPIYISCTAPG...,,1.450,ENSMUST00000173769,ENSMUSG00000057191,41.0,,196


In [21]:
abundance_data_filtered['string_external_id'].unique().shape[0]  # Count unique IDs in the filtered abundance data

74893

In [145]:
# Let's build an MLP regression model to predict the abundance based on the embeddings
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error  


In [23]:
file_network_embeddings = pd.read_csv("data/matched_protein_network_embeddings.csv", index_col=0)
file_seq_embeddings = pd.read_csv("data/matched_protein_sequence_embeddings.csv", index_col=0)

In [25]:
import h5py

filename = 'data/protein.network.embeddings.v12.0.h5'
# target_id = '7955.ENSDARP00000104955'
target_id = '10090.ENSMUSP00000134056'  # Example target ID to search for
embedding = None

with h5py.File(filename, 'r') as f:
    species_group = f['species']
    for species_id in species_group:
        group = species_group[species_id]
        if 'proteins' in group and 'embeddings' in group:
            protein_ids = group['proteins'][:]
            protein_ids = [pid.decode('utf-8') for pid in protein_ids]
            if target_id in protein_ids:
                idx = protein_ids.index(target_id)
                embedding = group['embeddings'][idx]
                print(f"Found in species {species_id}:")
                print(embedding)
                break

if embedding is None:
    print("Protein ID not found.")

Protein ID not found.


In [34]:
filename = '/Users/morgensh/Documents/UZH/Roche PMDA/protein-abundance-PMDA2025/test_masked_integrated_rows_M.musculus_KIDNEY.csv'
target_ids = pd.read_csv("/Users/morgensh/Documents/UZH/Roche PMDA/protein-abundance-PMDA2025/test_masked_integrated_rows_M.musculus_KIDNEY.csv", index_col=0)


In [35]:
target_ids 

Unnamed: 0_level_0,organism_name,sample_organ,is_integrated,EnsemblProteinID,UniprotEntryName,UniprotAccession,ProteinName,GeneSymbol,Sequence,nog_id,abundance,EnsemblTranscript,EnsemblGene
organism_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10090,M.musculus,KIDNEY,True,ENSMUSP00000026459,ATPB_MOUSE,P56480,ATP synthase F(1) complex catalytic subunit be...,"['Atp5f1b', 'Atp5b']",MLSLVGRVASASASGALRGLSPSAALPQAQLLLRAAPAGVHPARDY...,NOG045504,,ENSMUST00000026459,ENSMUSG00000025393
10090,M.musculus,KIDNEY,True,ENSMUSP00000015800,HSP7C_MOUSE,P63017,Heat shock cognate 71 kDa protein (EC 3.6.4.10...,"['Hspa8', 'Hsc70', 'Hsc73']",MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,NOG047319,,ENSMUST00000015800,ENSMUSG00000015656
10090,M.musculus,KIDNEY,True,ENSMUSP00000031314,ALBU_MOUSE,P07724,Albumin,"['Alb', 'Alb-1', 'Alb1']",MKWVTFLLLLFVSGSAFSRGVFRREAHKSEIAHRYNDLGEQHFKGL...,NOG009613,,ENSMUST00000031314,ENSMUSG00000029368
10090,M.musculus,KIDNEY,True,ENSMUSP00000023934,HBB1_MOUSE,P02088,Hemoglobin subunit beta-1 (Beta-1-globin) (Hem...,"['Hbb-b1', 'Hbb-bs']",MVHLTDAEKAAVSCLWGKVNSDEVGGEALGRLLVVYPWTQRYFDSF...,NOG037124,,ENSMUST00000023934,ENSMUSG00000052305
10090,M.musculus,KIDNEY,True,ENSMUSP00000029987,ALDOB_MOUSE,Q91Y97,Fructose-bisphosphate aldolase B (EC 4.1.2.13)...,"['Aldob', 'Aldo2']",MAHRFPALTPEQKKELSEIAQRIVANGKGILAADESVGTMGNRLQR...,NOG047027,,ENSMUST00000029987,ENSMUSG00000028307
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10090,M.musculus,KIDNEY,True,ENSMUSP00000046204,HYDIN_MOUSE,Q80W93,Hydrocephalus-inducing protein (Protein Hy-3),"['Hydin', 'Hy3']",MTLKIKCVANYIKEKIPNVLFLCDPEARLQQLTASVPLCEQRKYFK...,NOG008962,,ENSMUST00000043141,ENSMUSG00000059854
10090,M.musculus,KIDNEY,True,ENSMUSP00000137123,RYR1_MOUSE,E9PZQ0,Ryanodine receptor 1 (RYR-1) (RyR1) (Skeletal ...,['Ryr1'],MGDGGGEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNR...,NOG047076,,ENSMUST00000179893,ENSMUSG00000030592
10090,M.musculus,KIDNEY,True,ENSMUSP00000092571,E9Q0T8_MOUSE,E9Q0T8,Dynein axonemal heavy chain 7 (Axonemal beta d...,"['Dnah7a', 'Dnahc7a', 'Dnahc7b']",MSSKKDKLSTKGKSKTPVRFLPQLPMDKLSSKEKVKLPTTVLPQLS...,NOG040644,,ENSMUST00000094964,ENSMUSG00000096141
10090,M.musculus,KIDNEY,True,ENSMUSP00000140430,A0A087WR13_MOUSE,A0A087WR13,Dynein axonemal heavy chain 7 (Axonemal beta d...,['Dnah7c'],MSSKKDKLSTKGKSKMPARFLPQLPMDKLSSKEKAKLPTTVLPQLT...,NOG040644,,ENSMUST00000189749,ENSMUSG00000101337


In [39]:
target_ids['string_external_id'] = target_ids['UniprotAccession'].astype(str) + '.' + target_ids['EnsemblProteinID'].astype(str)

In [40]:
target_ids

Unnamed: 0_level_0,organism_name,sample_organ,is_integrated,EnsemblProteinID,UniprotEntryName,UniprotAccession,ProteinName,GeneSymbol,Sequence,nog_id,abundance,EnsemblTranscript,EnsemblGene,string_external_id
organism_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10090,M.musculus,KIDNEY,True,ENSMUSP00000026459,ATPB_MOUSE,P56480,ATP synthase F(1) complex catalytic subunit be...,"['Atp5f1b', 'Atp5b']",MLSLVGRVASASASGALRGLSPSAALPQAQLLLRAAPAGVHPARDY...,NOG045504,,ENSMUST00000026459,ENSMUSG00000025393,P56480.ENSMUSP00000026459
10090,M.musculus,KIDNEY,True,ENSMUSP00000015800,HSP7C_MOUSE,P63017,Heat shock cognate 71 kDa protein (EC 3.6.4.10...,"['Hspa8', 'Hsc70', 'Hsc73']",MSKGPAVGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...,NOG047319,,ENSMUST00000015800,ENSMUSG00000015656,P63017.ENSMUSP00000015800
10090,M.musculus,KIDNEY,True,ENSMUSP00000031314,ALBU_MOUSE,P07724,Albumin,"['Alb', 'Alb-1', 'Alb1']",MKWVTFLLLLFVSGSAFSRGVFRREAHKSEIAHRYNDLGEQHFKGL...,NOG009613,,ENSMUST00000031314,ENSMUSG00000029368,P07724.ENSMUSP00000031314
10090,M.musculus,KIDNEY,True,ENSMUSP00000023934,HBB1_MOUSE,P02088,Hemoglobin subunit beta-1 (Beta-1-globin) (Hem...,"['Hbb-b1', 'Hbb-bs']",MVHLTDAEKAAVSCLWGKVNSDEVGGEALGRLLVVYPWTQRYFDSF...,NOG037124,,ENSMUST00000023934,ENSMUSG00000052305,P02088.ENSMUSP00000023934
10090,M.musculus,KIDNEY,True,ENSMUSP00000029987,ALDOB_MOUSE,Q91Y97,Fructose-bisphosphate aldolase B (EC 4.1.2.13)...,"['Aldob', 'Aldo2']",MAHRFPALTPEQKKELSEIAQRIVANGKGILAADESVGTMGNRLQR...,NOG047027,,ENSMUST00000029987,ENSMUSG00000028307,Q91Y97.ENSMUSP00000029987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10090,M.musculus,KIDNEY,True,ENSMUSP00000046204,HYDIN_MOUSE,Q80W93,Hydrocephalus-inducing protein (Protein Hy-3),"['Hydin', 'Hy3']",MTLKIKCVANYIKEKIPNVLFLCDPEARLQQLTASVPLCEQRKYFK...,NOG008962,,ENSMUST00000043141,ENSMUSG00000059854,Q80W93.ENSMUSP00000046204
10090,M.musculus,KIDNEY,True,ENSMUSP00000137123,RYR1_MOUSE,E9PZQ0,Ryanodine receptor 1 (RYR-1) (RyR1) (Skeletal ...,['Ryr1'],MGDGGGEGEDEVQFLRTDDEVVLQCSATVLKEQLKLCLAAEGFGNR...,NOG047076,,ENSMUST00000179893,ENSMUSG00000030592,E9PZQ0.ENSMUSP00000137123
10090,M.musculus,KIDNEY,True,ENSMUSP00000092571,E9Q0T8_MOUSE,E9Q0T8,Dynein axonemal heavy chain 7 (Axonemal beta d...,"['Dnah7a', 'Dnahc7a', 'Dnahc7b']",MSSKKDKLSTKGKSKTPVRFLPQLPMDKLSSKEKVKLPTTVLPQLS...,NOG040644,,ENSMUST00000094964,ENSMUSG00000096141,E9Q0T8.ENSMUSP00000092571
10090,M.musculus,KIDNEY,True,ENSMUSP00000140430,A0A087WR13_MOUSE,A0A087WR13,Dynein axonemal heavy chain 7 (Axonemal beta d...,['Dnah7c'],MSSKKDKLSTKGKSKMPARFLPQLPMDKLSSKEKAKLPTTVLPQLT...,NOG040644,,ENSMUST00000189749,ENSMUSG00000101337,A0A087WR13.ENSMUSP00000140430


In [None]:
# see if we can find the embeddings for the target_ids in the HDF5 file
string_external_ids = target_ids['string_external_id'].tolist()
filename = 'data/protein.network.embeddings.v12.0.h5'
embeddings = {}

with h5py.File(filename, 'r') as f:
    species_group = f['species']
    for species_id in species_group:
        group = species_group[species_id]
        if 'proteins' in group and 'embeddings' in group:
            protein_ids = group['proteins'][:]
            protein_ids = [pid.decode('utf-8') for pid in protein_ids]
            embeddings_in_species = {}
            intersection = set(string_external_ids).intersection(protein_ids)
            if intersection:
                print(f"Species {species_id} has {len(intersection)} matching proteins.")
                # Now you can extract embeddings for these IDs if you want
                for pid in intersection:
                    idx = protein_ids.index(pid)
                    embedding = group['embeddings'][idx]
                    embeddings[pid] = embedding
                
        
            


In [49]:
test_data =  pd.read_csv("/Users/morgensh/Documents/UZH/Roche PMDA/protein-abundance-PMDA2025/test_masked_integrated_rows_M.musculus_KIDNEY.csv", index_col=0)
test_uniprots = test_data['UniprotAccession'] 

#save csv file of just test_uniprots
test_uniprots.to_csv("test_uniprots.csv", index=False) 

## MLP Model on Full Embeddings Feature List (+ Tissue 1-Hot Encoding)
- Concatenate full set of available features (STRING_ID + 1024 Seq Embeddings + 512 PPI Embeddings)
- Train MLP Regressor
- Maybe also train XGBoost regressor

In [50]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error


In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [59]:

# Load and align embeddings and abundance
embeddings_train = pd.read_csv("data/train_set2_embeddings.csv", index_col=0)
abundance_train = embeddings_train["abundance"] 
embeddings_train = embeddings_train.drop(columns=['abundance'])  # Remove abundance column for features



In [60]:

# Load and align embeddings and abundance
embeddings_test = pd.read_csv("data/test_set2_embeddings.csv", index_col=0)
abundance_test = embeddings_test["abundance"] 
embeddings_test = embeddings_test.drop(columns=['abundance'])  # Remove abundance column for features



In [61]:

# Load and align embeddings and abundance
embeddings_val = pd.read_csv("data/val_set2_embeddings.csv", index_col=0)
abundance_val = embeddings_val["abundance"] 
embeddings_val = embeddings_val.drop(columns=['abundance'])  # Remove abundance column for features



In [65]:
import ast
# def parse_embeddings(col):
#     return np.array([ast.literal_eval(x) if isinstance(x, str) else x for x in col])

# if 'embeddings_sequence' in df_train.columns:
#     df_train['embeddings_sequence'] = parse_embeddings(df_train['embeddings_sequence'])
#     df_val['embeddings_sequence'] = parse_embeddings(df_val['embeddings_sequence'])
#     df_test['embeddings_sequence'] = parse_embeddings(df_test['embeddings_sequence'])

embeddings_train['embeddings_sequence'] = embeddings_train['embeddings_sequence'].apply(lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x)

In [66]:
embeddings_test['embeddings_sequence'] = embeddings_test['embeddings_sequence'].apply(lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x)
embeddings_val['embeddings_sequence'] = embeddings_val['embeddings_sequence'].apply(lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else x)

In [91]:
embeddings_train_unstacked = pd.DataFrame(np.vstack(embeddings_train['embeddings_sequence'].values))
embeddings_test_unstacked = pd.DataFrame(np.vstack(embeddings_test['embeddings_sequence'].values))
embeddings_val_unstacked = pd.DataFrame(np.vstack(embeddings_val['embeddings_sequence'].values))

new_col_names = [
    f"seq_{col}" if isinstance(col, int) or (isinstance(col, str) and col.isdigit()) else col
    for col in embeddings_train_unstacked.columns
]
# embeddings_test_unstacked = [
#     f"seq_{col}" if isinstance(col, int) or (isinstance(col, str) and col.isdigit()) else col
#     for col in embeddings_train_unstacked.columns
# ]
# embeddings_val_unstacked = [
#     f"seq_{col}" if isinstance(col, int) or (isinstance(col, str) and col.isdigit()) else col
#     for col in embeddings_train_unstacked.columns
# ]


In [None]:
embeddings_train_unstacked.columns = new_col_names
embeddings_train_unstacked.head()

embeddings_train_unstacked = embeddings_train_unstacked.drop(columns=['embeddings_sequence']) +


Unnamed: 0,seq_0,seq_1,seq_2,seq_3,seq_4,seq_5,seq_6,seq_7,seq_8,seq_9,...,seq_1014,seq_1015,seq_1016,seq_1017,seq_1018,seq_1019,seq_1020,seq_1021,seq_1022,seq_1023
0,0.009796,-0.01503,-0.008621,-0.023895,-0.050934,0.074463,0.009186,-0.021667,-0.003717,0.042511,...,-0.010559,-0.048859,0.008644,-0.036377,-0.044678,-0.048096,-0.026413,0.011795,0.006401,0.083374
1,0.009796,-0.01503,-0.008621,-0.023895,-0.050934,0.074463,0.009186,-0.021667,-0.003717,0.042511,...,-0.010559,-0.048859,0.008644,-0.036377,-0.044678,-0.048096,-0.026413,0.011795,0.006401,0.083374
2,0.009796,-0.01503,-0.008621,-0.023895,-0.050934,0.074463,0.009186,-0.021667,-0.003717,0.042511,...,-0.010559,-0.048859,0.008644,-0.036377,-0.044678,-0.048096,-0.026413,0.011795,0.006401,0.083374
3,0.009796,-0.01503,-0.008621,-0.023895,-0.050934,0.074463,0.009186,-0.021667,-0.003717,0.042511,...,-0.010559,-0.048859,0.008644,-0.036377,-0.044678,-0.048096,-0.026413,0.011795,0.006401,0.083374
4,0.009796,-0.01503,-0.008621,-0.023895,-0.050934,0.074463,0.009186,-0.021667,-0.003717,0.042511,...,-0.010559,-0.048859,0.008644,-0.036377,-0.044678,-0.048096,-0.026413,0.011795,0.006401,0.083374


In [None]:
# new data already with the embeddings expanded, so we can directly concatenate
embeddings_train_final = pd.read_csv("data/train_set3_embeddings.csv", index_col=0)
embeddings_test_final = pd.read_csv("data/test_set3_embeddings.csv", index_col=0)
embeddings_val_final = pd.read_csv("data/val_set3_embeddings.csv", index_col=0)

In [135]:
# len(embeddings_train_final[,].columns)
embeddings_train_final.iloc[:,0:21].head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,organism_id,organism_name,sample_organ,is_integrated,quality_score,coverage,string_external_id,EnsemblProteinID,...,UniprotAccession,GeneSymbol,Sequence,nog_id,abundance,EnsemblTranscript,EnsemblGene,integrated_score,non_integrated_score,Sequence_len
0,1,94,9606,H.sapiens,GALLBLADDER,True,26.8,58.0,9606.ENSP00000363412,ENSP00000363412,...,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,13.4,ENST00000374294,ENSG00000244115,26.8,,153
1,2,96,9606,H.sapiens,HEART,True,33.6,68.0,9606.ENSP00000363412,ENSP00000363412,...,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,1.84,ENST00000374294,ENSG00000244115,33.6,,153
2,4,98,9606,H.sapiens,LIVER,True,30.0,80.0,9606.ENSP00000363412,ENSP00000363412,...,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,5.72,ENST00000374294,ENSG00000244115,30.0,,153
3,5,99,9606,H.sapiens,LUNG,True,30.3,67.0,9606.ENSP00000363412,ENSP00000363412,...,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,3.87,ENST00000374294,ENSG00000244115,30.3,,153
4,6,102,9606,H.sapiens,PANCREAS,True,32.0,65.0,9606.ENSP00000363412,ENSP00000363412,...,A0A024R161,['DNAJC25-GNG10'],MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,NOG038474,2.59,ENST00000374294,ENSG00000244115,32.0,,153


In [136]:
embeddings_train_final.drop(columns=embeddings_train_final.columns[0:21], axis=1, inplace=True)
embeddings_train_final.head()

Unnamed: 0,organ_GALLBLADDER,organ_HEART,organ_KIDNEY,organ_LIVER,organ_LUNG,organ_PANCREAS,organ_WHOLE_ORGANISM,organism_B.taurus,organism_D.rerio,organism_G.gallus,...,net_502,net_503,net_504,net_505,net_506,net_507,net_508,net_509,net_510,net_511
0,True,False,False,False,False,False,False,False,False,False,...,-0.028259,0.021286,0.003853,-0.025406,0.041656,0.079163,-0.065613,-0.009056,-0.018387,0.039093
1,False,True,False,False,False,False,False,False,False,False,...,-0.028259,0.021286,0.003853,-0.025406,0.041656,0.079163,-0.065613,-0.009056,-0.018387,0.039093
2,False,False,False,True,False,False,False,False,False,False,...,-0.028259,0.021286,0.003853,-0.025406,0.041656,0.079163,-0.065613,-0.009056,-0.018387,0.039093
3,False,False,False,False,True,False,False,False,False,False,...,-0.028259,0.021286,0.003853,-0.025406,0.041656,0.079163,-0.065613,-0.009056,-0.018387,0.039093
4,False,False,False,False,False,True,False,False,False,False,...,-0.028259,0.021286,0.003853,-0.025406,0.041656,0.079163,-0.065613,-0.009056,-0.018387,0.039093


In [137]:
embeddings_test_final.drop(columns=embeddings_test_final.columns[0:21], axis=1, inplace=True)
embeddings_test_final.head()

Unnamed: 0,organ_GALLBLADDER,organ_HEART,organ_KIDNEY,organ_LIVER,organ_LUNG,organ_PANCREAS,organ_WHOLE_ORGANISM,organism_B.taurus,organism_D.rerio,organism_G.gallus,...,net_502,net_503,net_504,net_505,net_506,net_507,net_508,net_509,net_510,net_511
0,False,False,False,True,False,False,False,False,False,False,...,0.051636,0.062378,-0.052032,-0.011879,0.10553,0.048096,-0.07843,-0.00304,-0.024445,0.027908
1,False,False,False,True,False,False,False,False,False,False,...,0.017151,-0.005741,0.000923,-0.024506,0.040039,0.030411,-0.081848,0.015594,-0.021957,0.033508
2,False,False,False,True,False,False,False,False,False,False,...,-0.015717,0.006123,-0.014595,-0.003389,0.081299,-0.018845,-0.00304,0.01738,-0.000601,-0.011719
3,False,False,False,True,False,False,False,False,False,False,...,0.00642,0.023483,-0.017136,-0.021912,0.053223,-0.006863,0.005585,-0.02565,0.008209,-0.027817
4,False,False,False,True,False,False,False,False,False,False,...,0.075867,0.054413,-0.063599,-0.000223,0.082458,0.053314,-0.014427,0.004932,-0.011055,0.028275


In [140]:
embeddings_val_final.drop(columns=embeddings_val_final.columns[0:21], axis=1, inplace=True)
embeddings_val_final.head()

Unnamed: 0,organ_GALLBLADDER,organ_HEART,organ_KIDNEY,organ_LIVER,organ_LUNG,organ_PANCREAS,organ_WHOLE_ORGANISM,organism_B.taurus,organism_D.rerio,organism_G.gallus,...,net_502,net_503,net_504,net_505,net_506,net_507,net_508,net_509,net_510,net_511
0,False,False,True,False,False,False,False,False,False,False,...,-0.028259,0.021286,0.003853,-0.025406,0.041656,0.079163,-0.065613,-0.009056,-0.018387,0.039093
1,False,False,True,False,False,False,False,False,False,False,...,0.019638,-0.014954,0.070679,0.010727,0.069031,-0.141479,-0.037537,0.001512,0.00481,-0.042969
2,False,False,True,False,False,False,False,False,False,False,...,0.036774,0.002348,-0.035645,0.006664,0.076782,-0.005863,-0.081299,0.008308,-0.00084,0.08075
3,False,False,True,False,False,False,False,False,False,False,...,-0.010559,0.003206,-0.052094,-0.008759,0.050079,-0.022324,-0.072144,0.002024,0.002922,0.106445
4,False,False,True,False,False,False,False,False,False,False,...,-0.002836,0.015396,-0.024399,0.003286,0.0401,0.090271,0.009529,-0.011414,-0.000173,0.032166


In [141]:
embeddings_train_final.to_csv("data/embeddings_train_final.csv")
embeddings_test_final.to_csv("data/embeddings_test_final.csv")
embeddings_val_final.to_csv("data/embeddings_val_final.csv")

In [142]:
abundance_train.to_csv("data/abundance_train_final.csv")
abundance_test.to_csv("data/abundance_test_final.csv")
abundance_val.to_csv("data/abundance_val_final.csv")

In [None]:
# Train/test split
X_train, X_test = embeddings_train_final, embeddings_test_final
y_train, y_test = np.log1p(abundance_train).values.squeeze(), np.log1p(abundance_test).values.squeeze()


In [114]:
y_train.shape, y_test.shape

((142355,), (5810,))

In [152]:
input_layer_size = len(embeddings_train_final.columns)  # Exclude 'species_id' and 'protein_id'

mlp1 = MLPRegressor(
    hidden_layer_sizes=(input_layer_size, 1024, 512), # 256, 64 if time permits
    activation='relu',
    alpha=1e-4,
    solver='adam',
    max_iter=200,
    early_stopping=True,
    random_state=42,
    verbose=True
)

mlp2 = MLPRegressor(
    hidden_layer_sizes=(input_layer_size, 1024, 512, 256), 
    activation='relu',
    alpha=1e-4,
    solver='adam',
    max_iter=200,
    early_stopping=True,
    random_state=42,
    verbose=True
)

mlp3 = MLPRegressor(
    hidden_layer_sizes=(input_layer_size, 1024, 512, 256, 64),
    activation='relu',
    alpha=1e-4,
    solver='adam',
    max_iter=200,
    early_stopping=True,
    random_state=42,
    verbose=True
)


In [None]:
mlp1.fit(X_train, y_train)
y_pred_mlp1 = mlp1.predict(X_test)

rmse_mlp1 = np.sqrt(mean_squared_error(y_test, y_pred_mlp1))
print(f"MLP1 RMSE: {rmse_mlp1:.4f}")

mlp2.fit(X_train, y_train)
y_pred_mlp2 = mlp2.predict(X_test)

rmse_mlp2 = np.sqrt(mean_squared_error(y_test, y_pred_mlp2))
print(f"MLP2 RMSE: {rmse_mlp2:.4f}")

mlp3.fit(X_train, y_train)
y_pred_mlp3 = mlp3.predict(X_test)

rmse_mlp3 = np.sqrt(mean_squared_error(y_test, y_pred_mlp3))
print(f"MLP3 RMSE: {rmse_mlp3:.4f}")



MLP1 RMSE: 3.3678
Iteration 1, loss = 1.85236065
Validation score: 0.038569
Iteration 2, loss = 1.83510981
Validation score: 0.037944
Iteration 3, loss = 1.83264249
Validation score: 0.018890
Iteration 4, loss = 1.83017877
Validation score: 0.027725
Iteration 5, loss = 1.83048859
Validation score: 0.040396
Iteration 6, loss = 1.83082204
Validation score: 0.040737
Iteration 7, loss = 1.82866842
Validation score: 0.040301
Iteration 8, loss = 1.82955620
Validation score: 0.040784
Iteration 9, loss = 1.82885653
Validation score: 0.039715
Iteration 10, loss = 1.82845029
Validation score: 0.041070
Iteration 11, loss = 1.82798866
Validation score: 0.035957
Iteration 12, loss = 1.82792199
Validation score: 0.040427
Iteration 13, loss = 1.82799512
Validation score: 0.040235
Iteration 14, loss = 1.82760245
Validation score: 0.040863
Iteration 15, loss = 1.82744316
Validation score: 0.040660
Iteration 16, loss = 1.82796680
Validation score: 0.041492
Iteration 17, loss = 1.82810381
Validation scor