# Alois :: Data Preparation

**Objectives:**
* Parse the source files and aggregate them into a single pandas DataFrame.
* Save the DataFrame to disk in Parquet format.

In [1]:
import pandas as pd
import numpy as np

import os

* input files (Mark Fiers)
* origin: `/staging/leuven/stg_00002/cbd/projects/BDS_SpatialTranscriptomics/ST_structure`

In [2]:
# Root folder holding one sub-folder per slide. Defaults to the original local
# path; set the ST_STRUCTURE_DIR environment variable to run on another machine.
wd = os.environ.get('ST_STRUCTURE_DIR', '/media/tmo/data/work/datasets/02_ST/ST_structure/')
# File paths in this notebook are built by plain string concatenation onto `wd`,
# so it has to end with a slash.
if not wd.endswith('/'):
    wd += '/'
# Tab-separated table with the per-slide metadata.
slide_file = wd + 'spatial_transcriptomics_slide.tsv'

* parquet folder for our merged DataFrame

In [3]:
# Output folder for the merged DataFrame. Defaults to the original local path;
# set the ST_PARQUET_DIR environment variable to write somewhere else.
parquet = os.environ.get('ST_PARQUET_DIR', '/media/tmo/data/work/datasets/02_ST/parquet/')
# The target path is built by string concatenation, so keep a trailing slash.
if not parquet.endswith('/'):
    parquet += '/'
# Path the merged DataFrame is written to.
st_full = parquet + 'st_full'

In [4]:
# Names of the slides to process. The order matters: an integer passed to the
# reader functions is resolved as a position in this list.
_slides_b = [
    'B02_D1', 'B02_E1', 'B03_C2', 'B03_D2', 'B04_D1',
    'B04_E1', 'B05_D2', 'B05_E2', 'B06_E1', 'B07_C2',
]
_slides_n = [
    'N02_C1', 'N02_D1', 'N03_C2', 'N03_D2', 'N04_D1',
    'N04_E1', 'N05_C2', 'N05_D2', 'N06_D2', 'N07_C1',
]
slides = _slides_b + _slides_n

In [5]:
# Column names used for the slide metadata after loading. read_slides_meta() assigns
# them, by position, in place of the source headers
# 'Sample id', 'GenotypeShort', 'Age (day)', 'Age (month)'.
SLIDE_META_COLUMNS = ['sampleID', 'GT', 'age_days', 'age_months']

In [6]:
# Subset of the per-spot metadata columns that is inspected in this notebook:
# the slide id, two annotation columns and one column per cell type.
_CELL_TYPE_COLUMNS = [
    'microglia', 'neuron', 'astrocyte',
    'oligodendrocyte', 'endothelial', 'interneuron',
]
SPOT_META_COLUMNS = ['sampleID', 'AB1_StdDev_Yen', 'Region_predict'] + _CELL_TYPE_COLUMNS

In [208]:
# Shorthand constants for column / index names used in this notebook.
GT = 'GT'  # slide-metadata column, renamed from 'GenotypeShort' on load
YEN = 'AB1_StdDev_Yen'
REGION = 'Region_predict'
ASTRO = 'astrocyte'
AGE = 'age'  # NOTE(review): no 'age' column is created in the cells shown here — confirm where it is used
DAYS = 'age_days'  # slide-metadata column, renamed from 'Age (day)' on load
MONTHS = 'age_months'  # slide-metadata column, renamed from 'Age (month)' on load
SPOT_UID = 'spot_UID'  # index name assigned by read_expression() and read_meta()

In [209]:
def to_slide(slide_or_idx):
    """
    Resolve a slide reference to a slide name.

    An integer is looked up as a position in the module-level `slides` list;
    any other value (normally a slide name) is returned unchanged.

    NumPy integers (e.g. values coming from `np.arange` or a pandas index) are
    accepted as positions too. They are not instances of `int`, so without this
    they would be passed through and end up verbatim in the file path.

    Raises IndexError when an integer position is outside `slides`.
    """
    if isinstance(slide_or_idx, (int, np.integer)):
        return slides[slide_or_idx]
    return slide_or_idx
    
def read_expression(slide_or_idx):
    """
    Load the RNA expression table of a single slide.

    `slide_or_idx` is a slide name or an index, resolved through to_slide().
    The file `<wd><slide>/expression/<slide>.lcpm_znor.txt.gz` is read as
    tab-separated text with its first column as the index. All values are cast
    to float32 and the index is named SPOT_UID.
    """
    path = '{root}{slide}/expression/{slide}.lcpm_znor.txt.gz'.format(
        root=wd, slide=to_slide(slide_or_idx))

    expression = pd.read_csv(path, sep='\t', index_col=0)
    expression = expression.astype(np.float32)
    expression.index.name = SPOT_UID
    return expression

def read_meta(slide_or_idx):
    """
    Load the per-spot metadata table of a single slide.

    `slide_or_idx` is a slide name or an index, resolved through to_slide().
    The file `<wd><slide>/metadata/<slide>.meta.tsv` is read as tab-separated
    text with its first column as the index, which is then named SPOT_UID.
    """
    path = '{root}{slide}/metadata/{slide}.meta.tsv'.format(
        root=wd, slide=to_slide(slide_or_idx))

    spot_meta = pd.read_csv(path, sep='\t', index_col=0)
    spot_meta.index.name = SPOT_UID
    return spot_meta

def read_slides_meta(file=slide_file):
    """
    Load the slide-level metadata table.

    Reads the tab-separated `file`, keeps the four source columns listed below
    (in that order) and renames them, by position, to SLIDE_META_COLUMNS.

    The default for `file` is the value of `slide_file` at the time this
    function is defined.
    """
    source_columns = ['Sample id', 'GenotypeShort', 'Age (day)', 'Age (month)']

    slide_table = pd.read_csv(file, sep='\t')
    selected = slide_table[source_columns]
    # Positional rename: source_columns[i] becomes SLIDE_META_COLUMNS[i].
    selected.columns = SLIDE_META_COLUMNS
    return selected

In [12]:
slides_meta = read_slides_meta()

In [13]:
slides_meta.head()

Unnamed: 0,sampleID,GT,age_days,age_months
0,N01_D1,KI,106,4
1,N01_C1_r1,KI,106,4
2,N01_D2,KI,106,4
3,N01_C1,KI,106,4
4,N01_C2,KI,106,4


---
# Slide 00

In [10]:
%%time
ex00 = read_expression(0)

CPU times: user 39.5 s, sys: 736 ms, total: 40.2 s
Wall time: 40.2 s


In [11]:
ex00.shape

(582, 46454)

In [12]:
ex00.head()

Unnamed: 0_level_0,-343C11.2,00R_AC107638.2,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010F05Rik,...,n-R5s90,n-R5s92,n-R5s93,n-R5s94,n-R5s95,n-R5s96,n-R5s97,n-R5s98,n-TSaga9,n-TStga1
spot_UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B02_D1__11_2,0.0,-0.140839,-0.137319,-0.069683,-0.346811,0.598446,-0.250064,-0.480542,-0.907225,0.00884,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903
B02_D1__27_2,0.0,-0.140839,-0.137319,-0.069683,0.560216,2.078606,-0.250064,-0.480542,1.215536,-0.022151,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903
B02_D1__26_2,0.0,-0.140839,-0.137319,-0.069683,-0.302725,0.645292,-0.250064,-0.480542,-0.907225,0.052802,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903
B02_D1__30_2,0.0,-0.140839,-0.137319,-0.069683,0.068266,1.421688,-0.250064,-0.480542,-0.907225,1.047602,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903
B02_D1__12_2,0.0,-0.140839,-0.137319,-0.069683,0.528757,1.81583,-0.250064,-0.480542,-0.907225,0.881945,...,-0.041451,0.0,-0.041451,0.0,-0.041451,-0.071736,-0.041451,0.0,0.0,-0.071903


In [80]:
# NOTE(review): the saved output below reports float64 columns, whereas read_expression()
# casts the values to np.float32 — the output appears to predate that cast; re-run to refresh it.
ex00.info()

<class 'pandas.core.frame.DataFrame'>
Index: 582 entries, B02_D1__11_2 to B02_D1__26_28
Columns: 46454 entries, -343C11.2 to n-TStga1
dtypes: float64(46454)
memory usage: 206.3+ MB


In [13]:
%%time
meta00 = read_meta(0)

CPU times: user 73.7 ms, sys: 41 µs, total: 73.7 ms
Wall time: 72.6 ms


In [14]:
# The metadata table of this slide has more rows than its expression table
# (compare with ex00.shape). The aggregation further down joins the two with
# how='inner', which keeps only the spots present in both.
meta00.shape

(1155, 299)

In [15]:
meta00[SPOT_META_COLUMNS].head()

Unnamed: 0_level_0,sampleID,AB1_StdDev_Yen,Region_predict,microglia,neuron,astrocyte,oligodendrocyte,endothelial,interneuron
spot_UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B02_D1__22_1,B02_D1,0.0,unknown,,,,,,
B02_D1__33_1,B02_D1,0.0,unknown,,,,,,
B02_D1__11_1,B02_D1,0.0,unknown,,,,,,
B02_D1__29_1,B02_D1,0.0,AT_HY,,,,,,
B02_D1__27_1,B02_D1,0.0,unknown,,,,,,


* Check which columns are all zero -> none (every column has at least one non-zero value).
* https://stackoverflow.com/questions/26053849/counting-non-zero-values-in-each-column-of-a-dataframe-in-python

In [17]:
# Per column: number of entries that are True after casting to bool, i.e. the non-zero values.
ex00_count_nonzero = ex00.astype(bool).sum(axis=0)

In [18]:
# Wrap the per-column counts in a one-column DataFrame named 'count' so it can be filtered.
ex00_count_nonzero_df =  pd.DataFrame(ex00_count_nonzero, columns=['count'])

In [19]:
# Number of expression columns that have no non-zero entry at all.
ex00_count_nonzero_df[ex00_count_nonzero_df['count'] == 0].count()

count    0
dtype: int64

---

# Aggregate all into a unified DataFrame

In [14]:
# Build one merged frame per slide (expression + spot metadata + slide metadata)
# and collect the frames in `acc`.
acc = []
for slide in slides:  # iterate over the configured slides instead of a hard-coded range
    ex_df = read_expression(slide)
    meta_df = read_meta(slide)

    merged = (
        ex_df
        # Inner join on the spot index: keeps only spots that have both
        # expression values and metadata.
        .merge(meta_df, how='inner', left_index=True, right_index=True)
        # A column-on-column merge ignores the index of both frames and returns
        # a fresh RangeIndex, which would silently drop the spot identifiers.
        # Move them into a column for the merge and restore them afterwards.
        .rename_axis(SPOT_UID)
        .reset_index()
        .merge(slides_meta, on=['sampleID'])
        .set_index(SPOT_UID)
    )

    print("read slide {}".format(slide))

    acc.append(merged)

read slide B02_D1
read slide B02_E1
read slide B03_C2
read slide B03_D2
read slide B04_D1
read slide B04_E1
read slide B05_D2
read slide B05_E2
read slide B06_E1
read slide B07_C2
read slide N02_C1
read slide N02_D1
read slide N03_C2
read slide N03_D2
read slide N04_D1
read slide N04_E1
read slide N05_C2
read slide N05_D2
read slide N06_D2
read slide N07_C1


In [15]:
# Stack the per-slide frames collected in `acc` into one DataFrame.
# NOTE(review): the name `all` shadows Python's built-in all(); it is kept here
# because the following cells refer to it — rename it together with those cells.
all = pd.concat(acc)

In [16]:
all.shape

(10327, 46756)

* Write the merged DataFrame to Parquet

In [18]:
%%time
all.to_parquet(st_full, engine='pyarrow', compression='snappy')

CPU times: user 29.4 s, sys: 5.08 s, total: 34.4 s
Wall time: 43.1 s


* read from parquet
* read time = **~10s** !!!