# Endometriosis analysis
## Extract metadata from Tan et al. (2022)

In [1]:
import scrublet as scr
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import os
import sys
import scipy


def MovePlots(plotpattern, subplotdir):
    """Move figures whose file name contains *plotpattern* into a sub-folder.

    Looks inside ``sc.settings.figdir`` for entries matching
    ``*<plotpattern>*`` and moves each of them into
    ``<figdir>/<subplotdir>``, creating that folder (and any missing
    parents) first.

    The work is done with ``os``/``glob``/``shutil`` rather than by building
    shell command strings, so paths containing spaces or shell
    metacharacters are handled correctly and failures raise instead of
    being silently ignored.

    Args:
        plotpattern: Substring (glob syntax allowed) that a file name must
            contain to be moved.
        subplotdir: Destination folder, relative to ``sc.settings.figdir``.
    """
    # Local imports: the file header does not import these modules.
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    target_dir = os.path.join(figdir, subplotdir)
    os.makedirs(target_dir, exist_ok=True)

    # Escape the directory part so only `plotpattern` is treated as a glob.
    pattern = os.path.join(glob.escape(figdir), '*' + plotpattern + '*')
    target_abs = os.path.abspath(target_dir)
    for src in glob.glob(pattern):
        # The destination folder itself can match the pattern; never try to
        # move it into itself.
        if os.path.abspath(src) == target_abs:
            continue
        # Give the full destination path so an existing file of the same
        # name is replaced, as `mv` would do.
        shutil.move(src, os.path.join(target_dir, os.path.basename(src)))


# Configure scanpy logging and plotting for this notebook session.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Folder used for figures; MovePlots reads this same setting.
sc.settings.figdir = './figures/'
# Print the versions of the packages in use (the output is kept below the cell).
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Bare expression: as the last statement of the cell, the path of the running
# Python interpreter is displayed.
sys.executable



-----
anndata     0.7.5
scanpy      1.7.1
sinfo       0.3.1
-----
PIL                 8.1.2
anndata             0.7.5
anyio               NA
attr                20.3.0
babel               2.9.0
backcall            0.2.0
brotli              NA
cairo               1.20.0
certifi             2020.12.05
cffi                1.14.5
chardet             4.0.0
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                2021.03.1
dateutil            2.8.1
decorator           4.4.2
fsspec              0.8.7
get_version         2.1
google              NA
h5py                3.1.0
idna                2.10
igraph              0.8.3
ipykernel           5.5.0
ipython_genutils    0.2.0
jedi                0.18.0
jinja2              2.11.3
joblib              1.0.1
json5               NA
jsonschema          3.2.0
jupyter_server      1.4.1
jupyterlab_server   2.3.0
kiwisolver          1.3.1
legacy_api_wrap     0.0.0


'/opt/conda/bin/python'

## All cells

In [2]:
# Folder holding the Tan et al. 2022 .h5ad files; later cells reuse this name.
path2datas = '/nfs/team292/lg18/endometriosis/other_datasets/endometriosis_Tan2022/'

# Load the object with all cells and display the shape of its matrix.
global_h5ad = os.path.join(path2datas, 'endo-2022_global.h5ad')
adata = sc.read(global_h5ad)
adata.X.shape

(108497, 26636)

In [3]:
# Build one "<PID>_<sample_type_rename>" label per row of adata.obs and store
# it as the 'sample' column.
patient_ids = list(adata.obs.PID)
sample_types = adata.obs.sample_type_rename.tolist()
sampleID = [pid + '_' + stype for pid, stype in zip(patient_ids, sample_types)]
adata.obs['sample'] = sampleID

In [4]:
# Number of distinct sample labels built in the previous cell.
len(set(sampleID))

30

In [5]:
# Preview the first rows of the per-cell metadata, now including 'sample'.
adata.obs.head()

Unnamed: 0_level_0,sequencing_saturation,stage,n_genes_by_counts,total_counts,pct_counts_mitochondrial,pct_counts_hemoglobin,celltype,celltype_main,sample_type_rename,PID,G2M_score,phase,sample
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAACCCAAGACCAAGC-1-0,77.036286,0,1748,4759.0,23.891575,0.0,fibroblast 1,stromal,EuE,E01,-0.044748,G1,E01_EuE
AAACCCACAACTCGTA-1-0,77.165551,0,4166,15553.0,11.091108,0.0,fibroblast 1,stromal,EuE,E01,-0.146015,G1,E01_EuE
AAACCCAGTAATCAGA-1-0,70.605258,0,4440,15541.0,14.374879,0.0,myeloid 3,myeloid,EuE,E01,-0.117621,G1,E01_EuE
AAACCCAGTTTGAACC-1-0,73.31133,0,3623,13019.0,13.572471,0.0,myeloid 1,myeloid,EuE,E01,-0.1849,G1,E01_EuE
AAACCCATCACAACCA-1-0,77.666907,0,4687,16137.0,6.358059,0.0,fibroblast 1,stromal,EuE,E01,-0.076091,G1,E01_EuE


In [6]:
# Write the metadata table for all cells to a CSV file (relative path, so it
# lands in the current working directory).
adata.obs.to_csv('metadata-Tan2022.csv')

## Epithelial cells

In [7]:
# Load the epithelial subset (this replaces the previous `adata`) and display
# the shape of its matrix.
epithelial_h5ad = os.path.join(path2datas, 'endo-2022_epithelial.h5ad')
adata = sc.read(epithelial_h5ad)
adata.X.shape

(19200, 24231)

In [8]:
# Build one "<PID>_<sample_type_rename>" label per row of adata.obs and store
# it as the 'sample' column.
patient_ids = list(adata.obs.PID)
sample_types = adata.obs.sample_type_rename.tolist()
sampleID = [pid + '_' + stype for pid, stype in zip(patient_ids, sample_types)]
adata.obs['sample'] = sampleID

In [9]:
# Preview the first rows of the epithelial metadata, now including 'sample'.
adata.obs.head()

Unnamed: 0_level_0,sequencing_saturation,stage,n_genes_by_counts,total_counts,pct_counts_mitochondrial,pct_counts_hemoglobin,celltype,celltype_main,subtypes,sample_type_rename,PID,G2M_score,phase,sample
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AAAGGATCAAGTGCTT-1-0,73.772996,0,3012,8069.0,10.397819,0.0,epithelial 2,epithelial,glandular,EuE,E01,-0.201646,G1,E01_EuE
AAAGTGAAGATCACCT-1-0,76.680133,0,3547,12353.0,12.239942,0.0,epithelial 2,epithelial,lumenal 1,EuE,E01,-0.160264,G1,E01_EuE
AACACACTCATTATCC-1-0,76.339873,0,6934,40641.0,8.240447,0.0,epithelial 2,epithelial,glandular,EuE,E01,-0.208345,S,E01_EuE
AACGAAAAGGCACAAC-1-0,75.519513,0,5734,26836.0,22.525711,0.0,epithelial 2,epithelial,TP63+/KRT5+,EuE,E01,-0.162246,G1,E01_EuE
AACGGGATCAGGGATG-1-0,76.992985,0,8944,60479.0,13.234345,0.0,epithelial 2,epithelial,lumenal,EuE,E01,-0.188662,S,E01_EuE


In [10]:
# Write the epithelial metadata table to a CSV file (relative path, so it
# lands in the current working directory).
adata.obs.to_csv('metadata-Tan2022_epithelials.csv')

## Endothelial cells

In [11]:
# Load the endothelial subset (this replaces the previous `adata`) and display
# the shape of its matrix.
endothelial_h5ad = os.path.join(path2datas, 'endo-2022_endothelial.h5ad')
adata = sc.read(endothelial_h5ad)
adata.X.shape

(10751, 21391)

In [12]:
# Build one "<PID>_<sample_type_rename>" label per row of adata.obs and store
# it as the 'sample' column.
patient_ids = list(adata.obs.PID)
sample_types = adata.obs.sample_type_rename.tolist()
sampleID = [pid + '_' + stype for pid, stype in zip(patient_ids, sample_types)]
adata.obs['sample'] = sampleID

In [13]:
# Preview the first rows of the endothelial metadata, now including 'sample'.
adata.obs.head()

Unnamed: 0_level_0,sequencing_saturation,stage,n_genes_by_counts,total_counts,pct_counts_mitochondrial,pct_counts_hemoglobin,celltype,celltype_main,subtypes,sample_type_rename,PID,G2M_score,phase,sample
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AACCTTTAGTATGACA-1-0,75.91732,0,5859,22801.0,10.324108,0.0,endothelial,endothelial,EC-HEV,EuE,E01,-0.083001,G1,E01_EuE
AACGGGAGTTCAAAGA-1-0,77.667494,0,4078,13320.0,7.004505,0.0,endothelial,endothelial,EC-HEV,EuE,E01,-0.104489,G1,E01_EuE
AAGACTCGTACGTAGG-1-0,75.781388,0,4926,17845.0,9.756234,0.0,endothelial,endothelial,EC-tip,EuE,E01,-0.076667,G1,E01_EuE
AAGCATCTCTCTTGCG-1-0,75.797038,0,4187,13809.0,13.194294,0.0,endothelial,endothelial,EC-HEV,EuE,E01,-0.118619,G1,E01_EuE
ACAAAGAGTCGTGCCA-1-0,74.414753,0,3757,11432.0,13.794611,0.0,endothelial,endothelial,LEC,EuE,E01,-0.070855,G1,E01_EuE


In [14]:
# Write the endothelial metadata table to a CSV file (relative path, so it
# lands in the current working directory).
adata.obs.to_csv('metadata-Tan2022_endothelial.csv')

## Mesenchymal cells

In [15]:
# Load the stromal subset (this replaces the previous `adata`) and display the
# shape of its matrix.
stromal_h5ad = os.path.join(path2datas, 'endo-2022_stromal.h5ad')
adata = sc.read(stromal_h5ad)
adata.X.shape

(42713, 24401)

In [16]:
# Build one "<PID>_<sample_type_rename>" label per row of adata.obs and store
# it as the 'sample' column.
patient_ids = list(adata.obs.PID)
sample_types = adata.obs.sample_type_rename.tolist()
sampleID = [pid + '_' + stype for pid, stype in zip(patient_ids, sample_types)]
adata.obs['sample'] = sampleID

In [17]:
# Write the metadata of the object loaded from 'endo-2022_stromal.h5ad' to a
# CSV file (relative path, so it lands in the current working directory).
adata.obs.to_csv('metadata-Tan2022_mesenchymal.csv')