# Running moscot

This notebook depends on data generated by 'MG_06-26-2022_Seurat_object_to_anndata'. In that notebook, .RDS-Seurat objects were transformed into anndata objects. However, some annotations are not tranfered correctly from Exp@meta.data to adata.obs, hence the annotations are added manually here

In [1]:
import numpy as np
import anndata
import pandas as pd
import scanpy as sc
import scipy

In [2]:
!pip list

Package                       Version
----------------------------- -------------------------
absl-py                       1.1.0
aiohttp                       3.8.1
aiosignal                     1.2.0
altair                        4.2.0
anndata                       0.8.0
anyio                         3.6.1
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
asttokens                     2.0.5
async-timeout                 4.0.2
attrs                         21.4.0
Babel                         2.10.3
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
beautifulsoup4                4.11.1
biomart                       0.9.2
bleach                        5.0.1
blinker                       1.5
brotlipy                      0.7.0
cachetools                    5.2.0
cellrank                      1.5.1
certifi                       2022.6.15
cffi                          1.15.1
charset-normalizer            2.1.0
chex    

In [3]:
Path="/home/icb/manuel.gander/moscotTime_Reproducibility/Data"
ts=['E3.5', 'E4.5', 'E5.25', 'E5.5', 'E6.25', 'E6.5', 'E6.75', 'E7.0', 'E7.25', 'E7.5', 'E7.75', 'E8.0', 'E8.25', 'E8.5a', 'E8.5b', 'E9.5', 'E10.5', 'E11.5', 'E12.5', 'E13.5']

# Fix annotation

Some annotations are incompletely tranfered, e.g. the cell types are transfered as numbers instead of their names ('E7:Primitve erythroid cells). This is fixed here. Additionally, the ENSEMBLE gene names are translated into gene symbols for downstream processes. 

In [4]:
#  This dictionary to translate Ensemble gene names into gene symbols was was created using biomart, see
###  Reproducibility_TOME/Notebooks/RDS_to_anndata/MG_03-31-2022_Ensemble_to_gene_symbol.ipynb

D=np.load(f'{Path}/Miscellaneous/ENS_to_genes.npy', allow_pickle=True)
Gene_Dict = dict(enumerate(D.flatten(), 1))[1]

### See, columns do not contain cell state annotation

In [5]:
i=0
ts0=ts[i]
print(ts0)
adata=sc.read(f"{Path}/anndatas/Initial_anndatas/adata_{ts0}.h5ad")

E3.5


In [6]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,sample,day,group,cell_state,cell_type
E3.5_P8_Cell2_embryo1_single,0,823281.0,5049,10,0,351.0,0,0
E3.5_P8_Cell3_embryo2_single,0,752208.0,5865,11,0,351.0,0,0
E3.5_P8_Cell5_embryo3_single,0,696715.0,5390,12,0,351.0,0,0
E3.5_P8_Cell7_embryo4_single,0,1365291.0,4028,18,0,351.0,0,0
E3.5_P8_Cell22_embryo5_single,0,1111620.0,4812,9,0,351.0,0,0
...,...,...,...,...,...,...,...,...
E3.5_P11.P6.P10_cell185_embryo_single,0,931741.0,5377,4,0,351.0,0,0
E3.5_P11.P6.P10_cell186_embryo_single,0,579382.0,6413,5,0,351.0,0,0
E3.5_P11.P6.P10_cell187_embryo_single,0,464591.0,3733,6,0,351.0,0,0
E3.5_P11.P6.P10_cell189_embryo_single,0,557666.0,4388,7,0,351.0,0,0


# Scripts

In [7]:
def add_data_generator(adata, ts0):
    # Identify study that created the data

    z=np.full(len(adata), '___________________')
    for i in range(0, len(adata)):
        a=adata.obs['cellID'][i]
        if a[2]=='.':
            z[i]='Mohammed'
        elif a[:2]=='EB':
            z[i]='Cheng'
        elif a[:4]=='cell':
            z[i]='Pijuan'
        elif a[0]=='P':
            z[i]='Qiu'
        elif a[:4]=='sci3':
            z[i]='Cao'
        else:
            # In case this happens, the data generator study could not be identified using cellID. In that case,
            # the adata.obs['origin'] will be corrupt!!!
            print(f'error: for time point={ts0}, i={i} there has been a problem with origin identifier')
    adata.obs['origin']=z
    return(adata)

In [8]:
def ENSEMBLE_name_to_gene_symbol(adata, Gene_Dict):
    #############           Transfrom ENSEMBLE names into gene symbols              ##########################

    # This asigns each ENSEMBLE name it's gene name. If the corresponding gene name is not found, then the
    # ENSEMBLE name is added in the gene name row
    
    ENS=[a for a in adata.var_names].copy()
    k=0
    l=len(ENS)
    genes=np.full(l, '_______________________________-')
    for i in range(0,l):
        if ENS[i] in Gene_Dict.keys() and Gene_Dict[f'{ENS[i]}']!='':
            genes[i]=Gene_Dict[f'{ENS[i]}']
        else:
            genes[i]=ENS[i]
            k=k+1
            
    if i==0:
        print(f'{k} out of {l} Ensemble IDs were not assigned a gene name')
    adata.var['gene_names']=genes
    return(adata)

In [9]:
def fix_annotation(ts0, Gene_Dict):
    adata=sc.read(f"{Path}/anndatas/Initial_anndatas/adata_{ts0}.h5ad")
    adata.X=scipy.sparse.csr_matrix(adata.X, dtype=np.float32)

    adata=sc.read(f"{Path}/anndatas/Initial_anndatas/adata_{ts0}.h5ad")
    adata.X=scipy.sparse.csr_matrix(adata.X, dtype=np.float32)

    ######  Fix annotations using Seurat-metadata               ################################################
    meta=pd.read_csv(f"{Path}/metadata/meta_{ts0}.csv", sep= ",")
    
    # Check if cells are ordered the same, was the case for all time points
    if [a for a in adata.obs.index]==[a for a in meta['Unnamed: 0']]:
        pass
    else:
        print('Error: cells are not ordered the same!!!')

    # Reconstruct the annotations using the Seurat-Metadata
    del adata.obs
    adata.obs['cellID']=list(meta['Unnamed: 0'])

    # For day 8.5 Qiu et al created an additional data set
    if ts0=='E8.5a' or ts0=='E8.5b':
        a=8.5
        adata.obs['day']=a
    else:
        adata.obs['day']=float(ts0[1:])

    adata.obs['cell_state']=list(meta['cell_state'])
    adata.obs['cell_type']=list(meta['cell_type'])
    adata.obs['group']=list(meta['group'])
    adata.obs['sample']=list(meta['sample'])
    
    adata=add_data_generator(adata, ts0)
    adata=ENSEMBLE_name_to_gene_symbol(adata, Gene_Dict)

    
    # I added this to remove a strange indexing error
    adata.__dict__['_raw'].__dict__['_var'] = adata.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
    
    adata.raw = adata
    
    # Log-normalize the expression
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    return(adata)

# Loop

In [10]:
for i in range(20):
    ts0=ts[i]
    print(ts0)
    
    adata=fix_annotation(ts0, Gene_Dict)

    adata.write(f"{Path}/anndatas/adata_{ts0}.h5ad", compression='gzip')

E3.5
E4.5
E5.25
E5.5
E6.25
E6.5
E6.75
E7.0
E7.25
E7.5
E7.75
E8.0
E8.25
E8.5a
E8.5b
E9.5
E10.5
E11.5
E12.5
E13.5
