# Notebook setup

In [None]:
import os

# Export the AWS credentials through the process environment.
# NOTE(review): each value below is just the variable's own name, so these are
# presumably placeholders to be replaced with real keys before running.
os.environ.update({
    'AWS_ACCESS_KEY_ID': 'AWS_ACCESS_KEY_ID',
    'AWS_SECRET_ACCESS_KEY': 'AWS_SECRET_ACCESS_KEY',
})


In [None]:
import lithops
from work.data_cockpit.widget import DataLoaderWidget

# Report the installed Lithops version. A bare `lithops.__version__` expression
# is only echoed by the notebook when it is the last statement of a cell, so in
# this position it needs an explicit print() to be visible.
print(lithops.__version__)
# Make /work the working directory so the relative paths opened by later cells
# (e.g. 'metabolomics/db_config1.json') are resolved from there.
os.chdir('/work')

In [None]:
# Create the data-loading widget and render it in the notebook.
# `data_loader` is kept as a notebook-level name because the next cell calls
# data_loader.get_data_slices() on it.
data_loader = DataLoaderWidget()

data_loader.display()


In [None]:
# Fetch the data slices from the widget; they are passed to Pipeline(...) below.
# NOTE(review): presumably reflects the selection made interactively in the
# widget, so run this cell only after finishing with the widget -- confirm.
data_slices = data_loader.get_data_slices()

## Choose database

In [None]:
import json
# Load the database configuration. The context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open.
with open('metabolomics/db_config1.json') as db_config_file:
    input_db = json.load(db_config_file)

## a) Use a default dataset

In [None]:
# Load the default dataset configuration. The context manager closes the file
# handle as soon as the JSON has been parsed instead of leaving it open.
with open('metabolomics/ds_config1.json') as ds_config_file:
    input_ds = json.load(ds_config_file)

## b) Choose a dataset from METASPACE instead (uncomment the cells below to use this option)

In [None]:
#from datasetpicker import MetaspaceBrowser, get_dataset_links, get_dataset_metadata
#browser = MetaspaceBrowser()
#browser

In [None]:
#browser.datasetId

In [None]:
#links = await get_dataset_links(browser.datasetId)
#metadata = await get_dataset_metadata(browser.datasetId)

In [None]:
#input_ds = {
#  "name": metadata["name"],
#  "imzml_path": links[0],
#  "ibd_path": links[1],
#  "num_decoys": 20,
#  "polarity": "+" if metadata["polarity"] == "POSITIVE" else "-",
#  "isocalc_sigma": 0.000693,
#  "metaspace_id": browser.datasetId
#}

# Setup

In [None]:
from annotation_pipeline.pipeline import Pipeline

# Build the pipeline from the chosen dataset and database configs plus the data
# slices, then run the 'db' task: process the database and pre-calculate
# centroids. This step is not benchmarked because it is usually cached.
pipeline = Pipeline(
    input_ds,
    input_db,
    use_db_cache=True,
    use_ds_cache=False,
    data_slices=data_slices,
)
pipeline(task='db')

# Run annotation pipeline

In [None]:
from datetime import datetime
# Time the dataset annotation run. The timestamps bracket both the 'ds' task
# and the retrieval of the results, so the reported duration covers both.
start_time = datetime.now()
pipeline(task='ds')
results_df = pipeline.get_results()
finish_time = datetime.now()

In [None]:
# Report when the annotation run started and finished, and how long it took.
for _label, _value in (
    ('start', start_time),
    ('finish', finish_time),
    ('duration', finish_time - start_time),
):
    print(_label, _value)

In [None]:
# Display statistics file
# As the last expression of the cell, the value returned by PipelineStats.get()
# is what the notebook renders.
# NOTE(review): presumably per-step pipeline statistics -- confirm against
# annotation_pipeline.utils.
from annotation_pipeline.utils import PipelineStats
PipelineStats.get()

# Display annotations

In [None]:
# Show the best-scoring annotations: order the results by MSM (highest first),
# hide the database path column, and keep only the first (i.e. highest-MSM)
# row for every (mol, modifier, adduct) combination.
results_df = pipeline.get_results()
by_msm = results_df.sort_values('msm', ascending=False)
without_db_path = by_msm.drop('database_path', axis=1)
top_mols = without_db_path.drop_duplicates(['mol','modifier','adduct'])
top_mols.head()

In [None]:
# Download annotated molecules images
# NOTE(review): with as_png=False the plotting cell below calls .toarray() on
# each image, so these are presumably sparse matrices rather than PNG data --
# confirm against Pipeline.get_images.
formula_images = pipeline.get_images(as_png=False)

In [None]:
# Display most annotated molecules images
import matplotlib.pyplot as plt
# Draw one figure per row returned by top_mols.head(). Each title shows the
# molecule, modifier and adduct, followed by the MSM score (3 decimals) and the
# FDR as a whole-number percentage.
for fig_num, (formula_idx, entry) in enumerate(top_mols.head().iterrows()):
    plt.figure(fig_num)
    caption = f'{entry.mol}{entry.modifier}{entry.adduct} - MSM {entry.msm:.3f} FDR {entry.fdr*100:.0f}%'
    plt.title(caption)
    # The row's index label selects its images; the first one is converted to a
    # dense array for imshow.
    first_image = formula_images[formula_idx][0]
    plt.imshow(first_image.toarray())

In [None]:
# Final step: invoke the pipeline's clean-up.
# NOTE(review): presumably removes temporary/cached data created by this run;
# confirm what clean() deletes before relying on results stored by the pipeline.
pipeline.clean()