# 02 :: Parse meta information

**Objectives:**

* Read the meta data into a dataframe, save to parquet
* Properly turn columns into 

In [28]:
import pandas as pd
import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

In [130]:
raw_RData_path = '/media/tmo/data/work/datasets/02_ST/raw/raw_filtered.RData'
raw_parquet    = '/media/tmo/data/work/datasets/02_ST/raw/raw.parquet'
lcpm_parquet   = '/media/tmo/data/work/datasets/02_ST/lcpm/lcpm.parquet'
meta_parquet   = '/media/tmo/data/work/datasets/02_ST/meta/meta.parquet'

In [42]:
wd = '/media/tmo/data/work/datasets/02_ST/ST_structure/'
slides_meta_file = wd + 'spatial_transcriptomics_slide.tsv'

In [121]:
slide_IDs = [
    'B02_D1', 'B02_E1', 'B03_C2', 'B03_D2', 'B04_D1',
    'B04_E1', 'B05_D2', 'B05_E2', 'B06_E1', 'B07_C2',
    'N02_C1', 'N02_D1', 'N03_C2', 'N03_D2', 'N04_D1',
    'N04_E1', 'N05_C2', 'N05_D2', 'N06_D2', 'N07_C1']

def add_slide_ID(df):
    df['slide_ID'] = df['spot_UID'].apply(lambda x: str(x).split('__')[0])
    
    return df

def to_slide(slide_or_idx):
    if isinstance(slide_or_idx, int):
        return slide_IDs[slide_or_idx]
    else:
        return slide_or_idx

def read_slide_meta(slide_or_idx, wd=wd):
    """
    Read the metadata of 1 slide, specified by slide name or index.
    """
    
    file = '{0}{1}/metadata/{1}.meta.tsv'.format(wd, to_slide(slide_or_idx))
    
    df = pd.read_csv(file, sep='\t', index_col=0)
    df.index.name='spot_UID'
    
    print(df.shape)
    
    df = df.reset_index()
    
    # Add slide ID
    df = add_slide_ID(df)
    
    # Region categorical
    df['region'] = df['Region_predict'].astype('category')
    df.drop('Region_predict', axis=1)
    
    return df

def read_slides_meta(file=slides_meta_file):
    """
    Read the .csv file with meta data about the slides.
    """
    
    COLUMNS = ['Sample id', 'GenotypeShort', 'Age (day)', 'Age (month)']
    
    df = pd.read_csv(file, sep='\t')[COLUMNS]
    df.columns = ['sampleID', 'GT', 'age_days', 'age_months']
    
    # Add age column (young, old)
    df['age'] = np.where(df['age_months'] < 10, 'young', 'old')
    df['age'] = df['age'].astype('category')
    
    # Add combined column age_GT.
    df['age_gt'] = df[['age', 'GT']].apply(lambda x: '_'.join(x), axis=1)
    df['age_gt'] = df['age_gt'].astype('category')
    
    df['slide_ID'] = df['sampleID']
    df.drop('sampleID', axis=1)
            
    return pd.DataFrame(slide_IDs, columns=['slide_ID']).merge(df, how='inner', on='slide_ID')            

## Parse all meta data, combine and write to Parquet

In [92]:
slides_meta_df = read_slides_meta()

In [116]:
acc = []
for idx in range(0, 20):    
    acc.append(read_slide_meta(idx))

meta_df = pd.concat(acc)

In [127]:
all_meta_df = meta_df.merge(slides_meta_df, on=['slide_ID'])

In [129]:
all_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23100 entries, 0 to 23099
Columns: 307 entries, spot_UID to age_gt
dtypes: category(2), float64(234), int64(65), object(6)
memory usage: 54.0+ MB


In [132]:
%%time
# all_meta_df.to_parquet(meta_parquet, engine='pyarrow', compression='snappy')

CPU times: user 0 ns, sys: 3 µs, total: 3 µs
Wall time: 6.2 µs
