# Download of METASPACE data for analysis

In [1]:
import os
from tqdm import tqdm
import pickle
import random

import pandas as pd
import numpy as np
import scipy
from sklearn.metrics.pairwise import pairwise_kernels
import matplotlib.pyplot as plt

from metaspace import SMInstance
from anndata import AnnData
from metaspace2anndata import dataset_to_anndata

## Date key

Import the date key and the data and storage directories (all defined in `config.py`) that are used when downloading and saving the results.

In [2]:
from config import date_key, data_dir, store_dir
import utils
%load_ext autoreload
%autoreload 2

## Initialize METASPACE

In [3]:
# Create the METASPACE client object; all dataset queries below go through `sm`.
sm = SMInstance()

## Download all dataset objects

In [14]:
# Request the dataset objects from METASPACE. No filter arguments are passed,
# so this asks for everything the client returns by default.
dss = sm.datasets()

In [19]:
# Cache the dataset list on disk so that it can be reloaded later without
# querying METASPACE again. The context manager guarantees that the file is
# flushed and closed even if pickling fails.
with open(os.path.join(store_dir, 'all_datasets.pickle'), "wb") as handle:
    pickle.dump(dss, handle)

In [4]:
# Uncomment to reload the cached dataset list instead of querying METASPACE again:
#dss = pickle.load(open(os.path.join(store_dir, 'all_datasets.pickle'), "rb" ) )

## Download all results tables for HMDB

In [31]:
# Show which files are currently present in the storage directory.
os.listdir(store_dir)

['all_datasets.pickle']

In [None]:
# Annotation database (name, version) whose results tables are downloaded.
database = ('HMDB', 'v4')
# File inside `store_dir` in which the collected results are pickled.
filename = 'hmdb4_results.pickle'

# Resume from a previous run if a results file already exists; otherwise
# start with an empty mapping of dataset id -> results table.
results_path = os.path.join(store_dir, filename)
if os.path.isfile(results_path):
    with open(results_path, "rb") as handle:
        all_results_dict = pickle.load(handle)
else:
    print('Making new dict')
    all_results_dict = {}

# Number of datasets added during this session (drives the checkpointing
# in the download loop).
counter = 0

In [None]:
# Columns of the annotation table that are kept for every dataset.
result_columns = ['ionFormula', 'ion', 'fdr', 'mz', 'offSample',
                  'moleculeNames', 'intensity', 'moleculeIds']
# Temporary directory for saving intermediate progress if script crashes.
checkpoint_dir = '/scratch/trose/tmp'

for ds in dss:
    # Skip datasets that are not finished or were already downloaded.
    if ds.status != 'FINISHED' or ds.id in all_results_dict:
        continue
    # Skip datasets that were not annotated against the requested database.
    if database not in [(x.name, x.version) for x in ds.database_details]:
        continue

    # Download annotation table
    tmp_tab = ds.results(database=database)

    if tmp_tab.size > 0:
        all_results_dict[ds.id] = tmp_tab[result_columns]
        counter += 1

        # Save intermediate results. This is done only right after a dataset
        # was added, so the same checkpoint is not rewritten again and again
        # while `counter` stays on a multiple of 100 (or on 0).
        if counter % 100 == 0:
            checkpoint_path = os.path.join(checkpoint_dir,
                                           filename + '_' + str(counter))
            with open(checkpoint_path, "wb") as handle:
                pickle.dump(all_results_dict, handle)
    print(counter)

# Store the complete results dictionary.
with open(os.path.join(store_dir, filename), "wb") as handle:
    pickle.dump(all_results_dict, handle)

## Download SwissLipids Datasets

In [None]:
# Create the output directories for the SwissLipids colocalization tables and
# AnnData objects. The sub-directory name has to be joined onto `store_dir`:
# the second positional argument of os.mkdir/os.makedirs is the permission
# mode, not a path component. `exist_ok=True` makes the cell safe to re-run.
os.makedirs(os.path.join(store_dir, 'sl_coloc'), exist_ok=True)
os.makedirs(os.path.join(store_dir, 'sl_anndata'), exist_ok=True)

In [None]:
# `import scipy` alone does not guarantee that the `signal` submodule is
# loaded (notably on older SciPy versions), so import it explicitly.
import scipy.signal

# Annotation database (name, version) used for the SwissLipids download.
database = ('SwissLipids', '2018-02-02')

# Output directories for the colocalization tables and the AnnData objects.
coloc_dir = os.path.join(store_dir, 'sl_coloc')
anndata_dir = os.path.join(store_dir, 'sl_anndata')

for ds in tqdm(dss):
    # Filter for datasets with SwissLipids annotation
    if database not in [(x.name, x.version) for x in ds.database_details]:
        continue

    coloc_path = os.path.join(coloc_dir, ds.id + '.pickle')
    anndata_path = os.path.join(anndata_dir, ds.id + '.pickle')

    # Skip datasets for which both outputs already exist; if either one is
    # missing, both are (re)computed.
    if os.path.exists(coloc_path) and os.path.exists(anndata_path):
        continue

    # Download results
    res = ds.results(fdr=0.1, database=database)

    # Only consider datasets with at least 100 annotations
    if res.shape[0] < 100:
        continue

    # Download all annotation images
    aai = ds.all_annotation_images(fdr=0.1,
                                   database=database,
                                   only_first_isotope=True,
                                   scale_intensity=False,
                                   hotspot_clipping=False)

    # Nothing to analyse if no images were returned (also protects the
    # `aai[0]` access below).
    if not aai:
        continue

    # Only consider images with at least 1000 pixels and 20x20 dimensions.
    # The first image is taken as representative for the dataset.
    first_image = aai[0]._images[0]
    if (first_image.size < 1000
            or first_image.shape[0] < 20
            or first_image.shape[1] < 20):
        continue

    # Median filter for coloc analysis; every filtered image is flattened
    # so that each ion becomes one row of `ion_array`.
    ion_array = np.array([scipy.signal.medfilt2d(x._images[0],
                                                 kernel_size=3).flatten()
                          for x in aai])

    # Save coloc (pairwise cosine similarity between ions) in a dataframe
    # labelled by formula + adduct on both axes.
    ion_labels = [x.formula + x.adduct for x in aai]
    coloc_df = pd.DataFrame(pairwise_kernels(ion_array, metric='cosine'),
                            columns=ion_labels,
                            index=ion_labels)
    coloc_df.to_pickle(coloc_path)

    # Create AnnData object
    adata = dataset_to_anndata(ds=ds,
                               database=database,
                               fdr=0.1,
                               results=res,
                               all_annotation_images=aai)

    with open(anndata_path, "wb") as handle:
        pickle.dump(adata, handle)
                