*Experimental*
__________

## Example Intensity Pipeline with PTMs using a PROSPECT Pool (JSON Input)


### Get the dataset from PROSPECT

In [None]:
# Install PROSPECT from the develop branch of its GitHub repository.
# The leading "!" runs the line as a shell command from the notebook.
# NOTE(review): presumably this provides the `prospectdataset` module imported below -- confirm.
!pip install git+https://github.com/wilhelm-lab/PROSPECT.git@develop

In [None]:
# Install DLOmix from the develop branch of its GitHub repository
# (shell command executed from the notebook).
!pip install git+https://github.com/wilhelm-lab/dlomix.git@develop

In [None]:
import prospectdataset as prospect

# Keyword selecting the pool to fetch; it is reused later as the key of the
# annotations mapping in the input config.
pool_keyword = "third"

# Local directory handed to the downloader; later cells search it for the
# metadata parquet file and the pool archive.
data_dir = "./data"

# NOTE(review): the first argument ("all") presumably selects which files of
# the pool to download -- confirm against the PROSPECT documentation.
prospect.download_dataset("all", data_dir, pool_keyword)

In [None]:
import glob
import os
from pathlib import Path

# Locate the metadata file of the downloaded pool. The path can also simply be
# copied and pasted from the previous cell's output, e.g.:
#   meta_data_filepath = './data/TUM_third_pool_meta_data.parquet'
meta_data_candidates = sorted(
    glob.glob(os.path.join(data_dir, "*meta_data*.parquet"))
)

# Fail with a clear message instead of an opaque IndexError when the download
# step has not been run (or wrote its files somewhere else).
if not meta_data_candidates:
    raise FileNotFoundError(
        f"No '*meta_data*.parquet' file found in {data_dir!r}; "
        "run the download cell first."
    )

# glob() returns matches in arbitrary order; taking the first sorted match
# keeps the choice deterministic if several files are present.
meta_data_filepath = meta_data_candidates[0]
meta_data_filepath

In [None]:
# annotation file names and paths

# The pool folder path is derived from the downloaded archive by stripping the
# ".zip" extension.
# NOTE(review): assumes the archive was extracted next to the zip file into a
# folder of the same name -- confirm against the download step.
zip_candidates = sorted(glob.glob(os.path.join(data_dir, "*.zip")))
if not zip_candidates:
    raise FileNotFoundError(
        f"No '*.zip' archive found in {data_dir!r}; run the download cell first."
    )
pool_folder_path = os.path.splitext(zip_candidates[0])[0]

# Sorted so that names and paths are listed in a reproducible order.
annotations_filepaths = sorted(
    glob.glob(os.path.join(pool_folder_path, "*.parquet"))
)

# Fixed: the original iterated over the undefined name `annotations_filepath`
# (missing trailing "s"), which raised a NameError.
annotations_names = [Path(f).stem for f in annotations_filepaths]

annotations_names, annotations_filepaths

### Prepare input data dict for DLOmix Dataset Class (JSON mode)

In [None]:


# Configuration consumed by the DLOmix dataset class in JSON mode:
#   - "metadata":    path of the pool's metadata parquet file
#   - "annotations": per-pool mapping of annotation file name -> file path
#   - "parameters":  extra options, here the key of the target column
input_data_dict = dict(
    metadata=meta_data_filepath,
    annotations={
        pool_keyword: {
            name: path
            for name, path in zip(annotations_names, annotations_filepaths)
        },
    },
    parameters={"target_column_key": "intensities_raw"},
)

input_data_dict

In [None]:
# For now the dataset is configured through a JSON file on disk; feeding the
# dict directly as a data source may become possible later.

import json

# Serialize the config dict and write it to the file the dataset cell reads.
with open("input_config.json", 'w') as fp:
    fp.write(json.dumps(input_data_dict))

### Create Intensity Dataset from the downloaded Pool using features and parser
This may take a couple of minutes because it:
- reads the metadata and the annotation files
- does some filtering and wrangling of the data
- produces the final input data for intensity
- extracts the features and prepares the TensorFlow Datasets

In [None]:
from dlomix.data import IntensityDataset
from dlomix.data.feature_extractors import ModificationGainFeature
from dlomix.data.feature_extractors import ModificationLocationFeature
from dlomix.data.feature_extractors import ModificationLossFeature

# Shared constants: sequence length passed to the dataset and the batch size
# used both here and for the example counts in the following cells.
SEQ_LENGTH = 30
BATCH_SIZE = 128

# Build the intensity dataset from the JSON config written in the previous
# step; 15% of the data is requested as validation split via val_ratio.
int_data = IntensityDataset(
    # data source and parsing
    data_source="input_config.json",
    parser="proforma",
    # batching / splitting
    seq_length=SEQ_LENGTH,
    batch_size=BATCH_SIZE,
    val_ratio=0.15,
    # column names to read from the data
    sequence_col="modified_sequence",
    precursor_charge_col="precursor_charge_onehot",
    collision_energy_col="collision_energy_aligned_normed",
    intensities_col="intensities_raw",
    # PTM feature extractors, kept in the original order
    features_to_extract=[
        ModificationLocationFeature(),
        ModificationLossFeature(),
        ModificationGainFeature(),
    ],
)

In [None]:
# NOTE(review): assumes len(train_data) counts batches, in which case the
# product is an upper bound (the last batch may be smaller) -- confirm.
"Training examples", len(int_data.train_data) * BATCH_SIZE

In [None]:
# NOTE(review): assumes len(val_data) counts batches, in which case the
# product is an upper bound (the last batch may be smaller) -- confirm.
"Validation examples", len(int_data.val_data) * BATCH_SIZE

### Create Model and compile it with the respective loss

In [None]:
import tensorflow as tf
from dlomix.models import PrositIntensityPredictor
from dlomix.losses import masked_spectral_distance

# Reuse SEQ_LENGTH from the dataset cell instead of repeating the literal 30,
# so the model and the dataset cannot silently drift apart when it changes.
model = PrositIntensityPredictor(seq_length=SEQ_LENGTH, use_ptm_counts=True)

# create the optimizer object
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# compile the model with the optimizer, the masked spectral distance loss and
# the metrics we want to track; custom metrics can be added to the list
model.compile(optimizer=optimizer, loss=masked_spectral_distance, metrics=["mae"])


### Train model

In [None]:
# Short demonstration run: train for two epochs on the training split and
# evaluate on the validation split after each epoch.
history = model.fit(
    int_data.train_data,
    epochs=2,
    validation_data=int_data.val_data,
)
