# Preprocessing Code for the MERFISH Study

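Prepares the MERFISH data following the preprocessing used in the SpatialDE analysis: the raw count matrix is loaded, stabilized and adjusted for total count with NaiveDE, and then saved together with the cell coordinates as CSV files.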

Requires the NaiveDE and SpatialDE packages (or the space_met virtualenv).

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.display import display
import sys
import os
import logging

# Put the project's ../src directory first on the module search path.
sys.path.insert(0, '../src')

import matplotlib as mpl

# Figure resolution used for figures created in this notebook.
mpl.rcParams.update({'figure.dpi': 100})

In [2]:
import numpy as np
import pandas as pd

import NaiveDE
import SpatialDE

In [4]:
# Root directory of the SpatialDE MERFISH analysis files. Later cells build
# paths with `ddir + 'data/...'`, so the value must end with a path separator.
# Set the MERFISH_DDIR environment variable to run the notebook outside the
# original cluster; when it is unset the original location is used unchanged.
# (`os` is imported in the first cell.)
ddir = os.path.join(
    os.environ.get(
        'MERFISH_DDIR',
        '/share/PI/sabatti/sc_data/spatial_ge/spatialde/SpatialDE/Analysis/MERFISH/',
    ),
    '',  # joining with '' appends a trailing separator only when one is missing
)

In [19]:
# Load the raw spatial gene-expression (sge) data for replicate 6: the
# expression matrix (one row per cellID, one column per gene) and the per-cell
# sample information. Both files use their first column as the index.
expr_csv = ddir + 'data/rep6/middle_exp_mat.csv'
info_csv = ddir + 'data/rep6/middle_sample_info.csv'

df = pd.read_csv(expr_csv, index_col=0)
print(df.shape)
display(df.head())

sample_info = pd.read_csv(info_csv, index_col=0)
display(sample_info.head())

(1056, 140)


Unnamed: 0_level_0,AFAP1,AFF4,AGAP1,AGO3,AGPS,AHDC1,AKAP11,ALPK2,AMOTL1,ANKH,...,USP34,USP8,USP9X,VPS13D,XDH,XKR5,YIPF4,ZBTB43,ZCCHC6,ZNF592
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16345,5.0,11.0,10.0,1.0,5.0,5.0,4.0,6.0,8.0,1.0,...,12.0,6.0,11.0,4.0,8.0,3.0,1.0,6.0,2.0,3.0
16347,2.0,1.0,5.0,3.0,2.0,5.0,2.0,6.0,5.0,0.0,...,12.0,1.0,8.0,3.0,3.0,1.0,2.0,0.0,2.0,5.0
16348,6.0,12.0,20.0,4.0,8.0,1.0,9.0,4.0,10.0,5.0,...,11.0,2.0,12.0,4.0,4.0,4.0,5.0,3.0,4.0,12.0
16349,6.0,11.0,11.0,9.0,9.0,10.0,14.0,17.0,9.0,5.0,...,23.0,3.0,21.0,11.0,16.0,14.0,9.0,7.0,18.0,10.0
16351,5.0,12.0,16.0,7.0,10.0,8.0,6.0,7.0,17.0,3.0,...,17.0,1.0,15.0,8.0,4.0,1.0,2.0,5.0,15.0,4.0


Unnamed: 0_level_0,abs_X,abs_Y,nucleusArea,cytoplasmArea,total_count
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16345,-438.999143,-312.475845,193.036072,1040.400991,828.0
16347,-437.428582,-334.316858,214.738917,472.263443,557.0
16348,-432.656618,-438.88666,276.365457,1083.329695,961.0
16349,-429.781788,-411.289794,292.249077,1597.377076,1745.0
16351,-412.916125,-358.068636,179.632777,887.38401,1151.0


In [20]:
# Filter practically unobserved genes: keep only the genes (columns) whose
# total count summed over all cells is at least 3. On this dataset the filter
# doesn't remove anything (the shape stays (1056, 140)).
# Selecting columns with .loc avoids transposing the whole matrix twice.
gene_totals = df.sum(0)
df = df.loc[:, gene_totals >= 3]

# Put the expression rows in the same cell order as sample_info; this raises a
# KeyError if a cell listed in sample_info is missing from the matrix.
df = df.loc[sample_info.index]
df.shape

(1056, 140)

In [28]:
# NaiveDE is called with a genes x cells matrix, so the cells x genes frame is
# transposed on the way in and transposed back on the way out.
counts_by_gene = df.T
dfm = NaiveDE.stabilize(counts_by_gene).T

# Adjust the stabilized values with the 'np.log(total_count)' formula evaluated
# against sample_info.
# NOTE(review): presumably this regresses out per-cell log total count - confirm
# against the NaiveDE documentation.
res = NaiveDE.regress_out(sample_info, dfm.T, 'np.log(total_count)').T

# Cell coordinates, with the absolute position columns renamed to x / y.
X = sample_info[['abs_X', 'abs_Y']].rename(columns={'abs_X': 'x', 'abs_Y': 'y'})

# Optional (disabled): add total_count as a pseudogene for reference.
# res['log_total_count'] = np.log(sample_info['total_count'])

In [29]:
# Save the adjusted expression matrix (res) and the cell coordinates (X) as
# CSV files named '<data_dir><f_pfx>_expr.csv' and '<data_dir><f_pfx>_coord.csv'.
data_dir = '../data/'
f_pfx = 'MERFISH'

# Create the output directory when it does not exist yet, so that to_csv does
# not fail on a fresh checkout (`os` is imported in the first cell).
os.makedirs(data_dir, exist_ok=True)

fn_coord = data_dir + '{}_coord.csv'.format(f_pfx)
fn_expr = data_dir + '{}_expr.csv'.format(f_pfx)
res.to_csv(fn_expr)
X.to_csv(fn_coord)

In [30]:
# Read the two files that were just written back in (first column as index),
# so the saved CSVs can be inspected below.
expr, coor = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (fn_expr, fn_coord)
)

In [31]:
# Preview the first five rows of the reloaded coordinate table.
coor.head(n=5)

Unnamed: 0_level_0,x,y
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1
16345,-438.999143,-312.475845
16347,-437.428582,-334.316858
16348,-432.656618,-438.88666
16349,-429.781788,-411.289794
16351,-412.916125,-358.068636


In [32]:
# Preview the first five rows of the reloaded expression table.
expr.head(n=5)

Unnamed: 0_level_0,AFAP1,AFF4,AGAP1,AGO3,AGPS,AHDC1,AKAP11,ALPK2,AMOTL1,ANKH,...,USP34,USP8,USP9X,VPS13D,XDH,XKR5,YIPF4,ZBTB43,ZCCHC6,ZNF592
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16345,-4.813842,-3.188008,-3.295723,-4.979079,-4.386973,-4.217989,-4.550988,-3.851629,-4.496431,-4.706234,...,-3.845296,-3.461996,-4.57044,-4.146216,-3.05107,-4.715197,-4.041038,-3.420221,-4.964112,-5.160414
16347,-5.106487,-4.61945,-3.560939,-3.964551,-4.704805,-3.862786,-4.689495,-3.509033,-4.503137,-5.049473,...,-3.466787,-4.373514,-4.43982,-4.025982,-3.54344,-5.032331,-3.364482,-4.997824,-4.605185,-4.372846
16348,-4.807822,-3.234243,-2.778112,-4.207361,-4.122122,-5.429884,-4.000813,-4.313334,-4.445551,-3.748182,...,-4.067165,-4.417784,-4.647323,-4.274092,-3.749924,-4.630711,-3.068239,-4.092458,-4.596254,-4.137827
16349,-5.395198,-3.818007,-3.841525,-4.026636,-4.56692,-4.284435,-4.14543,-3.55676,-5.135194,-4.230258,...,-3.946126,-4.616071,-4.750019,-3.917865,-3.00137,-4.083341,-2.984517,-3.884148,-3.810366,-4.88662
16351,-5.138158,-3.386704,-3.14176,-3.895067,-4.088872,-4.111033,-4.520603,-4.003824,-4.135164,-4.294359,...,-3.835658,-4.958653,-4.630152,-3.84662,-3.891029,-5.6931,-3.879179,-3.836398,-3.604869,-5.261995
