In [4]:
import os
from tqdm import tqdm
import pickle
import random
from typing import List, Dict, Tuple
from anndata import AnnData
import time

import pandas as pd
import numpy as np
import math
import scipy
from sklearn.metrics.pairwise import pairwise_kernels
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels

from molmass import Formula
import metaspace
import linex2metaspace as lx2m
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import AgglomerativeClustering
from scipy import stats
from statannotations.Annotator import Annotator
import networkx as nx

In [2]:
import utils
from coloc_utils import *
from config import store_dir, data_dir, date_key, enrichment_dir
%load_ext autoreload
%autoreload 2

# Load datasets

In [5]:
# Read the pickled dataset listing. The context manager closes the file handle
# deterministically; a bare open() inside pickle.load() leaves it to the
# garbage collector.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open(os.path.join(store_dir, 'pos_lip_top_datasets_list.pickle'), "rb") as pickle_file:
    pos_lip_top_datasets = pickle.load(pickle_file)

# Load the datasets named in the listing. The code further down indexes the
# result as tissue_ads[tissue][dataset_id].
tissue_ads = load_alltissue_datasets(pos_lip_top_datasets)

# Drop the datasets that are excluded from the analysis: two individual Brain
# acquisitions and the whole 'Buccal mucosa' tissue.
# A missing key raises KeyError, exactly as a plain `del` statement does.
for excluded_brain_id in ('2022-08-24_00h20m06s', '2022-08-23_23h48m59s'):
    del tissue_ads['Brain'][excluded_brain_id]
del tissue_ads['Buccal mucosa']

# Add a 'mass' column to the `.var` table of every dataset, computed from each
# annotation's 'formula' entry with molmass.Formula.
for datasets_of_tissue in tissue_ads.values():
    for adata in datasets_of_tissue.values():
        formulas = adata.var['formula'].values
        adata.var['mass'] = [Formula(formula).mass for formula in formulas]

# Keep only annotations whose mass lies in the closed interval [400, 900].
mass_min, mass_max = 400, 900
for tis, dsd in tissue_ads.items():
    for dsid in dsd:
        masses = dsd[dsid].var['mass']
        # One combined mask selects the same annotations as applying the upper
        # and then the lower bound in two successive slicing steps.
        dsd[dsid] = dsd[dsid][:, (masses <= mass_max) & (masses >= mass_min)]

# Flag isobaric annotations (ppm_threshold=3) via mark_isobars. The flagged
# annotations themselves stay in the datasets; only whole datasets are dropped
# below, based on the 'has_isobar' column.
tissue_ads = mark_isobars(tissue_ads, ppm_threshold=3)

# Collect first, delete afterwards, so that the dictionaries are not resized
# while they are being iterated.
datasets_to_drop = [
    (tis, dsid)
    for tis, dsd in tissue_ads.items()
    for dsid, ds in dsd.items()
    # Drop when fewer than 10 annotations are isobar-free, or when the dataset
    # is too big (more than 100000 rows in its first dimension).
    if ds[:, ~ds.var['has_isobar']].shape[1] < 10 or ds.shape[0] > 100000
]
for tis, dsid in datasets_to_drop:
    del tissue_ads[tis][dsid]

  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][dsid].var['mass'] = [Formula(x).mass for x in tissue_ads[tis][dsid].var['formula'].values]
  tissue_ads[tis][d

In [6]:
# Directory that receives one CSV per exported dataset.
# NOTE(review): absolute, user-specific path - consider deriving it from `config`.
formula_csv_dir = '/scratch/trose/pos_lip_formisty'
# A dataset is exported only when both pixel dimensions are at most this value
# (units as stored in the METASPACE metadata; presumably micrometres - TODO confirm).
max_pixel_size = 50


def formula_intensity_frame(adat):
    """Build the per-formula, coloc-preprocessed intensity table of one dataset.

    Columns of ``adat.X`` whose ``adat.var.formula`` entries are equal are summed
    into a single vector. The stacked result is passed through
    ``utils.coloc_preprocessing_array`` and wrapped in a DataFrame.

    Parameters
    ----------
    adat
        Dataset object providing ``X``, ``var.formula``, ``obs['x']``,
        ``obs['y']`` and ``uns['metaspace_metadata']['MS_Analysis']['Pixel_Size']``
        (presumably an ``anndata.AnnData`` - verify against the loader).

    Returns
    -------
    pandas.DataFrame
        One column per unique formula plus ``'row'`` and ``'col'``, which hold
        ``obs['x']`` / ``obs['y']`` multiplied by the X / Y pixel size.
    """
    formulas = adat.var.formula
    sums = {}
    for label in np.unique(formulas):
        # Positions of all annotations carrying the current formula.
        indices = np.where(formulas == label)[0]
        if len(indices) > 1:
            sums[label] = np.sum(adat.X[:, indices], axis=1)
        else:
            sums[label] = adat.X[:, indices[0]]

    stacked = np.stack(list(sums.values()))
    molecules = np.array(list(sums.keys()))

    # Second argument of the preprocessing helper: largest 'y' coordinate plus one.
    y_extent = adat.obs['y'].max() + 1
    conv_data = utils.coloc_preprocessing_array(stacked.transpose(), y_extent)

    frame = pd.DataFrame(conv_data.transpose(), columns=molecules)
    pixel_size = adat.uns['metaspace_metadata']['MS_Analysis']['Pixel_Size']
    coords = adat.obs.reset_index()
    frame['row'] = coords['x'] * pixel_size['Xaxis']
    frame['col'] = coords['y'] * pixel_size['Yaxis']
    return frame


for tis in tissue_ads.keys():
    print(f'** {tis} **')
    for ds in tissue_ads[tis].keys():
        adat = tissue_ads[tis][ds]
        ms_analysis = adat.uns['metaspace_metadata']['MS_Analysis']
        # Skip datasets without pixel-size metadata or with too coarse pixels.
        if ('Pixel_Size' in ms_analysis.keys()
                and ms_analysis['Pixel_Size']['Xaxis'] <= max_pixel_size
                and ms_analysis['Pixel_Size']['Yaxis'] <= max_pixel_size):
            df = formula_intensity_frame(adat)

            # Drop rows whose formula columns sum to zero, then drop columns
            # whose total is zero.
            df = df.loc[~(df.drop(columns=['row', 'col']).sum(axis=1) == 0), :]
            df = df.loc[:, df.sum() != 0]

            # Create the target directory on demand so the export does not fail
            # on a fresh machine; the file name is unchanged.
            os.makedirs(formula_csv_dir, exist_ok=True)
            df.to_csv(os.path.join(formula_csv_dir, f'{tis}_{ds}.csv'))

** Kidney **
** Brain **
** Lung **
** Ovary **
** Epididymis **
** Liver **


In [98]:
# Single-dataset run of the formula-summing / coloc-preprocessing steps; all
# intermediates stay in the global namespace for interactive inspection.
adat = tissue_ads['Brain']['2022-07-19_19h29m24s']
formula_per_annotation = adat.var.formula
unique_labels = np.unique(formula_per_annotation)

# One intensity vector per unique formula: annotations sharing a formula are
# summed, a formula that occurs once keeps its single column.
sums = {}
for label in unique_labels:
    indices = np.where(formula_per_annotation == label)[0]
    sums[label] = (
        np.sum(adat.X[:, indices], axis=1)
        if len(indices) > 1
        else adat.X[:, indices[0]]
    )

tmp_molecules = np.array(list(sums))
tmp_array = np.stack(list(sums.values()))

# Coloc preprocessing; the second argument is the largest 'y' coordinate plus one.
tmp_ymax = adat.obs['y'].max() + 1
conv_data = utils.coloc_preprocessing_array(tmp_array.T, tmp_ymax)

# Table with one column per formula plus coordinates scaled by the pixel size.
pixel_size = adat.uns['metaspace_metadata']['MS_Analysis']['Pixel_Size']
coords = adat.obs.reset_index()
df = pd.DataFrame(conv_data.transpose(), columns=tmp_molecules)
df['row'] = coords['x'] * pixel_size['Xaxis']
df['col'] = coords['y'] * pixel_size['Yaxis']

# Keep rows whose formula columns do not sum to zero ...
has_signal = df.drop(columns=['row', 'col']).sum(axis=1) != 0
df = df.loc[has_signal, :]
# ... and columns with a positive value in at least 20 of the remaining rows.
positive_in_enough_rows = (df > 0).sum() >= 20
df = df.loc[:, positive_in_enough_rows]


In [95]:
# Per column: number of rows holding a value greater than zero.
df.gt(0).sum()

C14H26N4O11P2     4552
C15H17N5O6S2     11297
C21H14O10        11600
C21H41O7P         5490
C22H42O4          4659
                 ...  
C50H96NO8P        5118
C50H98NO8P        5645
C62H114N2O23      9397
row              15724
col              15724
Length: 138, dtype: int64

In [88]:
# Per column: True where the column total is exactly zero.
df.sum().eq(0)

C14H26N4O11P2    False
C15H17N5O6S2     False
C21H14O10        False
C21H41O7P        False
C22H42O4         False
                 ...  
C50H98NO8P       False
C62H114N2O23     False
C9H16N2O5S2       True
row              False
col              False
Length: 139, dtype: bool

In [99]:
# Show the current `df`; pandas abbreviates the rendering of large frames.
df

Unnamed: 0,C14H26N4O11P2,C15H17N5O6S2,C21H14O10,C21H41O7P,C22H42O4,C23H43O7P,C23H45O7P,C23H46NO7P,C24H50NO7P,C25H50NO7P,...,C48H91NO8,C48H92NO8P,C48H93NO8,C48H94NO8P,C50H92N2O18,C50H96NO8P,C50H98NO8P,C62H114N2O23,row,col
1039,0.0,0.000000,1.472328,0.0,0.0,0.0,0.0,0.0,0.475232,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9900,400
1040,0.0,1.331555,1.533674,0.0,0.0,0.0,0.0,0.0,0.523081,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000,400
1041,0.0,1.331555,1.472328,0.0,0.0,0.0,0.0,0.0,0.475232,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10100,400
1042,0.0,0.000000,1.496866,0.0,0.0,0.0,0.0,0.0,0.638454,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10200,400
1043,0.0,0.000000,0.638009,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10300,400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23698,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.110824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19800,10000
23699,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.431074,0.0,...,0.079870,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19900,10000
23700,0.0,1.306192,1.386442,0.0,0.0,0.0,0.0,0.0,0.727594,0.0,...,0.079870,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20000,10000
23701,0.0,0.000000,0.858858,0.0,0.0,0.0,0.0,0.0,0.648429,0.0,...,0.079870,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20100,10000
