In [1]:
import os

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

In [4]:
# Location of the abundance table (the file name suggests: all organisms,
# filtered, without the M. musculus kidney samples).
# The default is the original author's local path. Set the
# PMDA_ABUNDANCE_PARQUET environment variable to run this notebook on another
# machine without editing the cell.
file_path = os.environ.get(
    'PMDA_ABUNDANCE_PARQUET',
    '/Users/morgensh/Documents/UZH/Roche PMDA/protein-abundance-PMDA2025/all_organisms_filtered_without_M.musculus_KIDNEY.parquet',
)
# To inspect the column names without loading the data:
#   pq.read_schema(file_path).names
df = pd.read_parquet(file_path)


In [5]:
# Split the quality score into two helper columns: each one carries the score
# only on rows of its own kind (integrated / non-integrated) and NaN elsewhere,
# so a plain per-organism mean gives a separate average for each kind.
integrated_rows = df['is_integrated'].eq(True)
non_integrated_rows = df['is_integrated'].eq(False)
df['integrated_score'] = np.where(integrated_rows, df['quality_score'], np.nan)
df['non_integrated_score'] = np.where(non_integrated_rows, df['quality_score'], np.nan)

# Columns of the summary that hold averaged scores (rounded to 2 decimals below).
score_cols = ['avg_quality_score', 'avg_integrated_quality', 'avg_non_integrated_quality']

# One row per organism: entry count, distinct protein names / accessions and
# the three average scores, ordered from the most to the fewest entries.
species_breakdown_updated = (
    df.groupby('organism_name')
    .agg(
        total_protein_entries=pd.NamedAgg(column='EnsemblProteinID', aggfunc='count'),
        unique_proteins_name=pd.NamedAgg(column='UniprotEntryName', aggfunc='nunique'),
        unique_proteins_id=pd.NamedAgg(column='UniprotAccession', aggfunc='nunique'),
        avg_quality_score=pd.NamedAgg(column='quality_score', aggfunc='mean'),
        avg_integrated_quality=pd.NamedAgg(column='integrated_score', aggfunc='mean'),
        avg_non_integrated_quality=pd.NamedAgg(column='non_integrated_score', aggfunc='mean'),
    )
    .sort_values(by='total_protein_entries', ascending=False)
    .round({score_col: 2 for score_col in score_cols})
)



In [6]:
# Show the five organisms with the most protein entries (the table is already sorted).
print(species_breakdown_updated.iloc[:5])

               total_protein_entries  unique_proteins_name  \
organism_name                                                
H.sapiens                    1797533                 19047   
M.musculus                    489864                 19439   
A.thaliana                    426081                 21866   
Taestivum                     247308                 38372   
Mmulatta                      118524                 12146   

               unique_proteins_id  avg_quality_score  avg_integrated_quality  \
organism_name                                                                  
H.sapiens                   18673              19.97                   29.83   
M.musculus                  19117              17.39                   25.40   
A.thaliana                  21763              11.64                   16.54   
Taestivum                   38372              17.27                   22.55   
Mmulatta                    12146               7.82                   10.11   

    

In [9]:
# Column inventory of the abundance table as a plain Python list.
colnames = list(df.columns)
colnames

['dataset_id',
 'dataset_name',
 'dataset_description',
 'organism_id',
 'organism_name',
 'sample_organ',
 'is_integrated',
 'quality_score',
 'coverage',
 'publication_link',
 'publication_year',
 'original_filename',
 'string_external_id',
 'EnsemblProteinID',
 'mapped_from',
 'UniprotEntryName',
 'UniprotAccession',
 'ProteinName',
 'GeneSymbol',
 'Sequence',
 'uniprot_status',
 'nog_id',
 'abundance',
 'EnsemblTranscript',
 'EnsemblGene',
 'integrated_score',
 'non_integrated_score']

# Protein Network Embeddings --> Abundance Prediction
We will use the SPACE network embeddings of ortholog information and protein–protein interactions (embedding dimension 512).
We will then train an inference head (regression?) to predict abundance on whole-organism data.



In [15]:
import h5py

# HDF5 file holding the protein network embeddings (v12.0 according to the
# file name). The default is the original author's local path. Set the
# PMDA_NETWORK_EMBEDDINGS_H5 environment variable to point at another copy.
filename = os.environ.get(
    'PMDA_NETWORK_EMBEDDINGS_H5',
    '/Users/morgensh/Documents/UZH/Roche PMDA/protein-abundance-PMDA2025/data/protein.network.embeddings.v12.0.h5',
)


In [16]:
# Peek at the top-level layout of the embeddings file and print every
# attribute attached to its 'metadata' object.
with h5py.File(filename, 'r') as h5_file:
    print("Root keys:", list(h5_file.keys()))
    for attr_name, attr_value in h5_file['metadata'].attrs.items():
        print(attr_name, attr_value)


Root keys: ['metadata', 'species']
embedding_dim 512
n_species 1322
precision 16
total_proteins 21212112


In [None]:
def print_all_datasets(name, obj):
    """Print ``name`` if ``obj`` is an HDF5 dataset; ignore anything else.

    Has the ``(name, obj)`` signature expected by ``visititems`` and always
    returns ``None``.
    """
    if not isinstance(obj, h5py.Dataset):
        return
    print(name)

# Walk the whole file and print the path of every dataset it contains.
with h5py.File(filename, mode='r') as h5_file:
    h5_file.visititems(print_all_datasets)
    

species/1001832/embeddings
species/1001832/proteins
species/1001833/embeddings
species/1001833/proteins
species/1001938/embeddings
species/1001938/proteins
species/10020/embeddings
species/10020/proteins
species/10029/embeddings
species/10029/proteins
species/1003232/embeddings
species/1003232/proteins
species/10036/embeddings
species/10036/proteins
species/10047/embeddings
species/10047/proteins
species/100787/embeddings
species/100787/proteins
species/100816/embeddings
species/100816/proteins
species/10089/embeddings
species/10089/proteins
species/10090/embeddings
species/10090/proteins
species/10093/embeddings
species/10093/proteins
species/10096/embeddings
species/10096/proteins
species/10103/embeddings
species/10103/proteins
species/101091/embeddings
species/101091/proteins
species/101127/embeddings
species/101127/proteins
species/10116/embeddings
species/10116/proteins
species/10141/embeddings
species/10141/proteins
species/10160/embeddings
species/10160/proteins
species/1016849/

Okay, indeed all the datasets are named either `proteins` or `embeddings`.
Their structure is:
```
species/1001832/embeddings
species/1001832/proteins
species/1001833/embeddings
species/1001833/proteins
...
```

Dataset facts:
- Root keys: ['metadata', 'species']
- embedding_dim 512
- n_species 1322
- precision 16
- total_proteins 21212112

The number of proteins is confirmed, and they are listed as STRING IDs:
```
Total protein IDs: 21212112
['1001832.A0A1Y2WP95', '1001832.A0A1Y2WPA2', '1001832.A0A1Y2WLE4', '1001832.A0A1Y2X4S7', '1001832.A0A1Y2WY14', '1001832.A0A1Y2WUY6', '1001832.A0A1Y2XC72', '1001832.A0A1Y2WT67', '1001832.A0A1Y2X149', '1001832.A0A1Y2X2R6']
```

In [None]:
# Gather every protein identifier stored in the embeddings file into one flat
# list. The list is reset here so that re-running the cell does not duplicate
# entries.
all_protein_ids = []


def collect_proteins(name, obj):
    """``visititems`` callback: append the IDs of every ``.../proteins`` dataset.

    Entries read back as ``bytes`` are decoded as UTF-8; entries that are
    already ``str`` are kept unchanged (previously these raised
    ``AttributeError`` because ``.decode`` was called unconditionally).
    Always returns ``None``.
    """
    if not (isinstance(obj, h5py.Dataset) and name.endswith('/proteins')):
        return
    protein_ids = [
        pid.decode('utf-8') if isinstance(pid, bytes) else pid
        for pid in obj[:]
    ]
    all_protein_ids.extend(protein_ids)


with h5py.File(filename, 'r') as f:
    f.visititems(collect_proteins)

print(f"Total protein IDs: {len(all_protein_ids)}")
print(all_protein_ids[:10])  # Print first 10 as a check

Total protein IDs: 21212112
['1001832.A0A1Y2WP95', '1001832.A0A1Y2WPA2', '1001832.A0A1Y2WLE4', '1001832.A0A1Y2X4S7', '1001832.A0A1Y2WY14', '1001832.A0A1Y2WUY6', '1001832.A0A1Y2XC72', '1001832.A0A1Y2WT67', '1001832.A0A1Y2X149', '1001832.A0A1Y2X2R6']


_______________________________________

# Scoring PP Interactions for Proxy Abundance
The STRING database compiles, scores and integrates protein–protein association information drawn from experimental assays, computational predictions and prior knowledge.


### Co-expression score
- Based on: TODO
- Example values: TODO
- How to interpret: TODO


Idea: comparison to abundance data
- STRING gene expression profile? (TODO: double check)
- Find example mapping 