In [1]:
!date

Mon Jun 14 11:38:22 PDT 2021


## Import packages. 

In [4]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata

import matplotlib
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

## Read in the data

In [5]:
# Path of the AnnData file to analyse; kept in one place so it is easy to swap.
ANALYZED_H5AD = "2021-06-14_adata_analyzed.h5ad"

# Read the AnnData object and echo its summary as the cell output.
adata = anndata.read_h5ad(ANALYZED_H5AD)
adata

AnnData object with n_obs × n_vars = 4840 × 33694
    obs: 'batch', 'sample_id', 'sample_number', 'cell_group', 'n_genes', 'n_counts', 'percent_mito', 'louvain', 'lineage', 'te_score', 'epi_score', 'hypo_score', 'sample_group', 'lineage_id', 'name', 'epsc_score'
    var: 'gene_ids', 'feature_types', 'gene_name', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'cell_group_colors', 'hvg', 'lineage_id_colors', 'louvain', 'name_colors', 'neighbors', 'pca', 'sample_group_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

### Make subsets of adata.

In [7]:
# Build one subset of `adata` per lineage, selecting cells by their
# 'lineage_id' label. Each result is whatever `adata[mask]` returns
# (no explicit copy is made).
#
# The previous version first assigned `adata[adata.obs['lineage'] == ...]` to
# each name and then immediately overwrote it; those dead assignments are gone
# so that the filter actually in effect is the only one on the page.

# Subset for EPI and ELCs
epi_lineage_ids = ['Epiblast', 'D5 ELCs', 'D6 ELCs']
adata_epi = adata[adata.obs['lineage_id'].isin(epi_lineage_ids)]

# Subset for HYPO and HLCs
hypo_lineage_ids = ['Hypoblast', 'D5 HLCs', 'D6 HLCs']
adata_hypo = adata[adata.obs['lineage_id'].isin(hypo_lineage_ids)]

# Subset for TE and TLCs
te_lineage_ids = ['Trophectoderm', 'D5 TLCs', 'D6 TLCs']
adata_te = adata[adata.obs['lineage_id'].isin(te_lineage_ids)]

### Make gene list for each lineage

In [10]:
# Read the per-lineage gene lists from supplementary table 12 of Liu et al.
df_lineage = pd.read_csv('supp12.csv')

# For each lineage label in the "type" column, pull the matching "geneName"
# entries out as a NumPy array.
epi_markers = df_lineage.loc[df_lineage["type"].eq("ALL-EPI"), "geneName"].to_numpy()
hypo_markers = df_lineage.loc[df_lineage["type"].eq("ALL-PE"), "geneName"].to_numpy()
te_markers = df_lineage.loc[df_lineage["type"].eq("ALL-TE"), "geneName"].to_numpy()

# Keep only the marker genes that can be looked up in the data set.
#
# NOTE: `x in some_series` tests the Series *index*, not its values, so the
# former `marker in adata.var["gene_name"]` was in effect a lookup in
# `adata.var_names`. That lookup is written out explicitly here. It is also the
# check the later cells need, because they select columns labelled by
# `var_names` using these marker names.
epi_markers_present = [marker for marker in epi_markers if marker in adata.var_names]
hypo_markers_present = [marker for marker in hypo_markers if marker in adata.var_names]
te_markers_present = [marker for marker in te_markers if marker in adata.var_names]


___
## Mean and standard deviation of expression for HYPO, EPI, TE
___

### EPIBLAST

In [47]:
# Per-gene mean of X for every 'lineage_id' category in the EPI/ELC subset.
# Rows are the categories, columns are the var_names of the subset.
epi_groups = adata_epi.obs['lineage_id'].cat.categories
epi_res_mean = pd.DataFrame(index=epi_groups, columns=adata_epi.var_names)
for group in epi_groups:
    in_group = adata_epi.obs['lineage_id'] == group
    # mean(0) collapses the cell axis, leaving one value per gene
    epi_res_mean.loc[group] = adata_epi[in_group, :].X.mean(0)

# Restrict the table to the EPI marker genes, in marker-list order.
df_epi_res_mean = pd.DataFrame(
    {marker: epi_res_mean[marker] for marker in epi_markers_present}
)
    
# # Transform matrix and sort values    
# df_epi_res_mean = df_epi_res_mean.T
# df_epi_res_mean = df_epi_res_mean.sort_values("Epiblast")

In [48]:
# Per-gene standard deviation of X for every 'lineage_id' category in the
# EPI/ELC subset. Rows are the categories, columns are the subset's var_names.
epi_groups = adata_epi.obs['lineage_id'].cat.categories
epi_res_std = pd.DataFrame(index=epi_groups, columns=adata_epi.var_names)
for group in epi_groups:
    in_group = adata_epi.obs['lineage_id'] == group
    # std(0) collapses the cell axis, leaving one value per gene
    epi_res_std.loc[group] = adata_epi[in_group, :].X.std(0)

# Restrict the table to the EPI marker genes, in marker-list order.
df_epi_res_std = pd.DataFrame(
    {marker: epi_res_std[marker] for marker in epi_markers_present}
)
    
# # Transform matrix and sort values
# df_epi_res_std = df_epi_res_std.T
# df_epi_res_std = df_epi_res_std.sort_values("Epiblast")
# df_epi_res_std

### HYPOBLAST

In [49]:
# Per-gene mean of X for every 'lineage_id' category in the HYPO/HLC subset.
# Rows are the categories, columns are the var_names of the subset.
hypo_groups = adata_hypo.obs['lineage_id'].cat.categories
res_hypo_mean = pd.DataFrame(index=hypo_groups, columns=adata_hypo.var_names)
for group in hypo_groups:
    in_group = adata_hypo.obs['lineage_id'] == group
    # mean(0) collapses the cell axis, leaving one value per gene
    res_hypo_mean.loc[group] = adata_hypo[in_group, :].X.mean(0)

# Restrict the table to the HYPO marker genes, in marker-list order.
df_hypo_res_mean = pd.DataFrame(
    {marker: res_hypo_mean[marker] for marker in hypo_markers_present}
)

# # Transform matrix and sort values
# df_hypo_res_mean = df_hypo_res_mean.T
# df_hypo_res_mean = df_hypo_res_mean.sort_values("Hypoblast")
# df_hypo_res_mean

In [50]:
# Per-gene standard deviation of X for every 'lineage_id' category in the
# HYPO/HLC subset. Rows are the categories, columns are the subset's var_names.
hypo_groups = adata_hypo.obs['lineage_id'].cat.categories
res_hypo_std = pd.DataFrame(index=hypo_groups, columns=adata_hypo.var_names)
for group in hypo_groups:
    in_group = adata_hypo.obs['lineage_id'] == group
    # std(0) collapses the cell axis, leaving one value per gene
    res_hypo_std.loc[group] = adata_hypo[in_group, :].X.std(0)

# Restrict the table to the HYPO marker genes, in marker-list order.
df_hypo_res_std = pd.DataFrame(
    {marker: res_hypo_std[marker] for marker in hypo_markers_present}
)
    
# # Transform matrix and sort values
# df_hypo_res_std = df_hypo_res_std.T
# df_hypo_res_std = df_hypo_res_std.sort_values("Hypoblast")
# df_hypo_res_std

### TROPHECTODERM

In [51]:
# Per-gene mean of X for every 'lineage_id' category in the TE/TLC subset.
# Rows are the categories, columns are the var_names of the subset.
te_groups = adata_te.obs['lineage_id'].cat.categories
res_te_mean = pd.DataFrame(index=te_groups, columns=adata_te.var_names)
for group in te_groups:
    in_group = adata_te.obs['lineage_id'] == group
    # mean(0) collapses the cell axis, leaving one value per gene
    res_te_mean.loc[group] = adata_te[in_group, :].X.mean(0)

# Restrict the table to the TE marker genes, in marker-list order.
df_te_res_mean = pd.DataFrame(
    {marker: res_te_mean[marker] for marker in te_markers_present}
)
    
# # Transform matrix and sort values
# df_te_res_mean = df_te_res_mean.T
# df_te_res_mean = df_te_res_mean.sort_values("Trophectoderm")
# df_te_res_mean

In [52]:
# Per-gene standard deviation of X for every 'lineage_id' category in the
# TE/TLC subset. Rows are the categories, columns are the subset's var_names.
te_groups = adata_te.obs['lineage_id'].cat.categories
res_te_std = pd.DataFrame(index=te_groups, columns=adata_te.var_names)
for group in te_groups:
    in_group = adata_te.obs['lineage_id'] == group
    # std(0) collapses the cell axis, leaving one value per gene
    res_te_std.loc[group] = adata_te[in_group, :].X.std(0)

# Restrict the table to the TE marker genes, in marker-list order.
df_te_res_std = pd.DataFrame(
    {marker: res_te_std[marker] for marker in te_markers_present}
)
    
# # Transform matrix and sort values
# df_te_res_std = df_te_res_std.T
# df_te_res_std = df_te_res_std.sort_values("Trophectoderm")
# df_te_res_std

In [53]:
# Concatenate the mean and std dataframes for each lineage.
# pd.concat stacks row-wise, so each result holds the mean rows followed by the
# std rows; the row labels (lineage_id categories) therefore appear twice.
#
# The EPI line previously referenced `df_res_mean` / `df_res_std`, names that
# are never defined in this notebook (NameError on a fresh kernel); it now uses
# the EPI tables built above, matching the HYPO and TE lines.
df_epi_data = pd.concat([df_epi_res_mean, df_epi_res_std])
df_hypo_data = pd.concat([df_hypo_res_mean, df_hypo_res_std])
df_te_data = pd.concat([df_te_res_mean, df_te_res_std])

# # Save as .csv
# df_epi_data.to_csv("data/epi_mean_std.csv")
# df_hypo_data.to_csv("data/hypo_mean_std.csv")
# df_te_data.to_csv("data/te_mean_std.csv")

In [54]:
# Number of cells per 'lineage_id' label, laid out as a single-row table whose
# columns follow a fixed display order.
lineage_order = [
    "D5 ELCs", "D6 ELCs", "Epiblast",
    "D5 HLCs", "D6 HLCs", "Hypoblast",
    "D5 TLCs", "D6 TLCs", "Trophectoderm",
    "Undefined",
]
lineage_counts = adata.obs['lineage_id'].value_counts()
df_counts = lineage_counts.to_frame().T
df_counts = df_counts[lineage_order]
# df_counts.to_csv("data/lineage_counts.csv")

In [13]:
%load_ext watermark
%watermark -v -p numpy,pandas,scanpy,anndata,jupyterlab,matplotlib

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
CPython 3.7.10
IPython 7.22.0

numpy 1.19.4
pandas 1.1.4
scanpy 1.7.2
anndata 0.7.5
jupyterlab 3.0.11
matplotlib 3.3.4
