# Alois :: Data Preparation

**Objectives:**
* Parse the source files and aggregate them into a single pandas DataFrame.
* Save the DataFrame to disk in Parquet format.

In [5]:
import pandas as pd
import numpy as np

import os

* input files (Mark Fiers)
* origin: `/staging/leuven/stg_00002/cbd/projects/BDS_SpatialTranscriptomics/ST_structure`

In [6]:
# Root folder holding the per-slide source files and the slide overview table.
# The trailing slash matters: the read_* helpers concatenate `wd` directly with the slide name.
wd = '/media/tmo/data/work/datasets/02_ST/ST_structure/'
slide_file = os.path.join(wd, 'spatial_transcriptomics_slide.tsv')

* parquet folder for our merged DataFrame

In [7]:
# Output folder for Parquet data and the target path of the merged DataFrame.
parquet = '/media/tmo/data/work/datasets/02_ST/parquet/'
st_full = os.path.join(parquet, 'st_full')

In [8]:
# Names of the slides to aggregate. The helpers below accept either one of these
# names or a position in this list.
slides = [
    # names starting with 'B'
    'B02_D1', 'B02_E1', 'B03_C2', 'B03_D2', 'B04_D1',
    'B04_E1', 'B05_D2', 'B05_E2', 'B06_E1', 'B07_C2',
    # names starting with 'N'
    'N02_C1', 'N02_D1', 'N03_C2', 'N03_D2', 'N04_D1',
    'N04_E1', 'N05_C2', 'N05_D2', 'N06_D2', 'N07_C1',
]

In [9]:
# Short names given to the slide-level metadata columns. The corresponding headers in
# the source file are, in the same order:
#   'Sample id', 'GenotypeShort', 'Age (day)', 'Age (month)'
SLIDE_META_COLUMNS = [
    'sampleID',
    'GT',
    'age_days',
    'age_months',
]

In [10]:
# A selection of per-spot metadata column names.
# NOTE(review): not referenced by the cells visible in this notebook — confirm it is still needed.
SPOT_META_COLUMNS = [
    'sampleID',
    'AB1_StdDev_Yen',
    'Region_predict',
    'microglia',
    'neuron',
    'astrocyte',
    'oligodendrocyte',
    'endothelial',
    'interneuron',
]

In [54]:
# Shorthand constants for column names.

# Join keys: SPOT_UID links expression to spot metadata, SAMPLE_ID links spots to slides.
SPOT_UID = 'spot_UID'
SAMPLE_ID = 'sampleID'

# Slide-level metadata columns.
GT = 'GT'
DAYS = 'age_days'
MONTHS = 'age_months'
AGE = 'age'  # NOTE(review): no 'age' column is produced by the cells shown here — confirm usage

# Spot-level metadata columns.
YEN = 'AB1_StdDev_Yen'
REGION = 'Region_predict'
ASTRO = 'astrocyte'

In [41]:
def to_slide(slide_or_idx):
    """
    Resolve a slide reference to a slide name.

    Integer values (built-in ``int`` as well as NumPy integer scalars such as
    ``np.int64``) are treated as positions in the module-level ``slides`` list.
    Any other value is assumed to already be a slide name and is returned unchanged.

    Raises IndexError if an integer position is outside ``slides``.
    """
    # NumPy integers (e.g. from np.arange or a pandas index) are not instances of int;
    # without this check they would be passed through and end up in file paths as "0", "1", ...
    if isinstance(slide_or_idx, (int, np.integer)):
        return slides[slide_or_idx]
    return slide_or_idx
    
def read_expression(slide_or_idx):
    """
    Load the expression table of a single slide.

    The slide may be given by name or by its position in `slides`. The file is read
    as tab-separated text with its first column as row labels, every value is cast
    to float32, and the row labels are returned as a regular column named SPOT_UID.
    """
    slide = to_slide(slide_or_idx)
    path = '{0}{1}/expression/{1}.lcpm_znor.txt.gz'.format(wd, slide)

    expression = pd.read_csv(path, sep='\t', index_col=0)
    expression = expression.astype(np.float32)

    # Name the index first so that reset_index() emits it under the SPOT_UID column name.
    return expression.rename_axis(SPOT_UID).reset_index()

def read_meta(slide_or_idx):
    """
    Load the per-spot metadata table of a single slide.

    The slide may be given by name or by its position in `slides`. The file is read
    as tab-separated text with its first column as row labels; those labels are
    returned as a regular column named SPOT_UID.
    """
    slide = to_slide(slide_or_idx)
    path = '{0}{1}/metadata/{1}.meta.tsv'.format(wd, slide)

    spot_meta = pd.read_csv(path, sep='\t', index_col=0)

    # Name the index first so that reset_index() emits it under the SPOT_UID column name.
    return spot_meta.rename_axis(SPOT_UID).reset_index()

def read_slides_meta(file=slide_file):
    """
    Load the slide-level metadata table.

    Reads the tab-separated file `file` (default: `slide_file`), keeps the four
    source columns listed below and renames them, in order, to SLIDE_META_COLUMNS.
    """
    source_columns = ['Sample id', 'GenotypeShort', 'Age (day)', 'Age (month)']

    full_table = pd.read_csv(file, sep='\t')
    slide_meta = full_table[source_columns]

    # Positional rename: the i-th source column becomes SLIDE_META_COLUMNS[i].
    slide_meta.columns = SLIDE_META_COLUMNS
    return slide_meta

In [13]:
# Loaded once at module level: merge_dfs() joins against this global frame.
slides_meta = read_slides_meta()

In [14]:
slides_meta.head()

Unnamed: 0,sampleID,GT,age_days,age_months
0,N01_D1,KI,106,4
1,N01_C1_r1,KI,106,4
2,N01_D2,KI,106,4
3,N01_C1,KI,106,4
4,N01_C2,KI,106,4


---
# Slide 00

In [42]:
%%time
ex00 = read_expression(0)

CPU times: user 40.9 s, sys: 824 ms, total: 41.7 s
Wall time: 41.7 s


In [43]:
ex00.shape

(582, 46455)

In [44]:
ex00.head()

Unnamed: 0,spot_UID,-343C11.2,00R_AC107638.2,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,...,n-R5s90,n-R5s92,n-R5s93,n-R5s94,n-R5s95,n-R5s96,n-R5s97,n-R5s98,n-TSaga9,n-TStga1
0,B02_D1__11_2,0.0,-0.140839,-0.137319,-0.069683,-0.346811,0.598446,-0.250064,-0.480542,-0.907225,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903
1,B02_D1__27_2,0.0,-0.140839,-0.137319,-0.069683,0.560216,2.078606,-0.250064,-0.480542,1.215536,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903
2,B02_D1__26_2,0.0,-0.140839,-0.137319,-0.069683,-0.302725,0.645292,-0.250064,-0.480542,-0.907225,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903
3,B02_D1__30_2,0.0,-0.140839,-0.137319,-0.069683,0.068266,1.421688,-0.250064,-0.480542,-0.907225,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903
4,B02_D1__12_2,0.0,-0.140839,-0.137319,-0.069683,0.528757,1.81583,-0.250064,-0.480542,-0.907225,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903


In [45]:
ex00.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582 entries, 0 to 581
Columns: 46455 entries, spot_UID to n-TStga1
dtypes: float32(46454), object(1)
memory usage: 103.1+ MB


In [46]:
%%time
meta00 = read_meta(0)

CPU times: user 76.9 ms, sys: 29 µs, total: 77 ms
Wall time: 75.9 ms


In [47]:
meta00.shape

(1155, 300)

In [49]:
meta00.head()

Unnamed: 0,spot_UID,spot_ID,spot_X,spot_Y,AB1_Mean_Yen,AB1_Median_Yen,AB1_IntDen_Yen,AB1_StdDev_Yen,AB1_area_Yen,AB1_Cell_Number_Yen,...,AT_FB_HY_200,filter,filter_200,Region_predict,microglia,neuron,astrocyte,oligodendrocyte,endothelial,interneuron
0,B02_D1__22_1,1,6479.730957,116.635033,0.0,0,0.0,0.0,0.0,0,...,0.0,1,1,unknown,,,,,,
1,B02_D1__33_1,2,9816.688477,120.182419,0.0,0,0.0,0.0,0.0,0,...,0.0,1,1,unknown,,,,,,
2,B02_D1__11_1,3,3141.386963,125.373672,0.0,0,0.0,0.0,0.0,0,...,0.008773,1,1,unknown,,,,,,
3,B02_D1__29_1,4,8612.235352,125.077896,0.0,0,0.0,0.0,0.0,0,...,0.005332,1,1,AT_HY,,,,,,
4,B02_D1__27_1,5,8018.597168,125.602509,0.0,0,0.0,0.0,0.0,0,...,0.003575,1,1,unknown,,,,,,


* Check whether any column of the slide 00 expression table is entirely zero. Result: none are.
* https://stackoverflow.com/questions/26053849/counting-non-zero-values-in-each-column-of-a-dataframe-in-python

In [17]:
# Number of truthy (non-zero) entries per column. ex00 still contains the spot_UID
# string column, so that column is counted as well.
ex00_count_nonzero = ex00.astype(bool).sum(axis=0)

In [18]:
ex00_count_nonzero_df =  pd.DataFrame(ex00_count_nonzero, columns=['count'])

In [19]:
ex00_count_nonzero_df[ex00_count_nonzero_df['count'] == 0].count()

count    0
dtype: int64

---

# Aggregate all into a unified DataFrame

In [60]:
def merge_dfs(ex_df, meta_df):
    """
    Combine one slide's expression and spot metadata with the slide-level metadata.

    Spots are matched on SPOT_UID with an inner join, so only spots present in both
    frames are kept. The result is then joined to the global `slides_meta` on SAMPLE_ID.
    """
    spots = ex_df.merge(meta_df, on=SPOT_UID, how='inner')
    return spots.merge(slides_meta, on=SAMPLE_ID)

In [61]:
# Sanity check on slide 00: merge expression, spot metadata and slide metadata.
# (Uses merge_dfs; the previous name merge_ex_meta is not defined in this notebook.)
merge_dfs(ex00, meta00).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 582 entries, 0 to 581
Columns: 46757 entries, spot_UID to age_months
dtypes: float32(46454), float64(234), int64(65), object(4)
memory usage: 104.5+ MB


In [None]:
# Read and merge every slide listed in `slides`. Iterating over the list itself
# (rather than a hard-coded range) keeps the loop in sync if slides are added or removed.
# The per-slide frames are collected in `acc` and concatenated in a later cell.
acc = []
for slide in slides:
    print("reading slide {}".format(slide))

    ex_df = read_expression(slide)
    meta_df = read_meta(slide)
    acc.append(merge_dfs(ex_df, meta_df))

reading slide B02_D1
reading slide B02_E1
reading slide B03_C2
reading slide B03_D2
reading slide B04_D1
reading slide B04_E1
reading slide B05_D2
reading slide B05_E2
reading slide B06_E1
reading slide B07_C2
reading slide N02_C1
reading slide N02_D1
reading slide N03_C2
reading slide N03_D2
reading slide N04_D1
reading slide N04_E1
reading slide N05_C2
reading slide N05_D2
reading slide N06_D2
reading slide N07_C1


In [66]:
# NOTE(review): `all` shadows Python's built-in all() for the rest of the session.
# The name is kept here because the following cells refer to it.
all = pd.concat(acc)

In [67]:
all.shape

(10327, 46757)

* merge all data and write to Parquet
* `-rwxrwxrwx 1 root root 525366216 Mar  7 22:43 st_full`
* ~0.52 GB

In [69]:
%%time
# Persist the combined frame at the `st_full` path (pyarrow engine, snappy compression).
all.to_parquet(st_full, engine='pyarrow', compression='snappy')

CPU times: user 28.8 s, sys: 5.02 s, total: 33.9 s
Wall time: 43.3 s
