In [1]:
import anndata
import pandas as pd
import pickle
import os

In [2]:
# Create the output directory for the wot input files; exist_ok=True makes
# re-running this cell a no-op when the directory is already there.
os.makedirs("../data/wot", exist_ok=True)

In [3]:
# Load the AnnData object that every export cell below reads from.
# NOTE(review): presumably this is the output of the scEGOT preprocessing
# notebook (PCA in obsm["X_pca"], labels in obs["annotation"]) — confirm.
adata = anndata.read_h5ad("../data/after_scegot.h5ad")

Starting from an AnnData object that already contains the dimensionality reduction and clustering results produced by scEGOT, generate input files in the same format as those used in the wot tutorial.

Specifically, create files corresponding to the following files from the official wot tutorial:

```py
FLE_COORDS_PATH ='data/fle_coords.txt'
FULL_DS_PATH = 'data/ExprMatrix.h5ad'
VAR_GENE_DS_PATH = 'data/ExprMatrix.var.genes.h5ad'
CELL_DAYS_PATH = 'data/cell_days.txt'
CELL_SETS_PATH = 'data/cell_sets.gmt' # written here as a pickled dict: cell_sets.pkl
SERUM_CELL_IDS_PATH = 'data/serum_cell_ids.txt' # created from the PGCLC data here: pgclc_cell_ids.txt
BATCH_PATH = 'data/batches.txt'
TFS_PATH = 'data/TFs.txt'


```

The following file is not generated by this notebook; copy it from the official wot sample data into ../data/wot and use it as is.

```py
GENE_SETS_PATH = 'data/gene_sets.gmx'
```

### fle_coords.txt

In [4]:
# Coordinate table for wot: one row per cell, with the PC1 / PC2 columns of
# the stored PCA embedding used as the x / y coordinates.
pca_embedding = adata.obsm["X_pca"]
coord_df = pd.DataFrame(adata.obs.index, columns=["id"]).assign(
    x=pca_embedding["PC1"].values,
    y=pca_embedding["PC2"].values,
)

In [5]:
# Preview the coordinate table (columns: id, x, y).
coord_df

Unnamed: 0,id,x,y
0,iM.data_GTGGAAGGTCAATGGG-1,-12.804817,-8.028358
1,iM.data_TTCATGTCAACCCGCA-1,-14.946176,-7.865564
2,iM.data_GAGGGTATCCAGGACC-1,-14.889571,-7.110969
3,iM.data_AAGTCGTAGGCTTTCA-1,-14.259319,-6.567074
4,iM.data_ACCGTTCGTAACTTCG-1,-13.354401,-5.247036
...,...,...,...
11766,d2b.data_AAGCCATAGGGCGAGA-1,30.793955,-8.877089
11767,d2b.data_CAACCAATCTTCCGTG-1,8.537253,18.523152
11768,d2b.data_AGGCCACGTGAGTAGC-1,23.786500,-2.583448
11769,d2b.data_GATCAGTTCGAGTACT-1,24.097697,-0.539524


In [6]:
# Write the coordinates as a tab-separated file with a header row and
# without the positional index column.
coord_df.to_csv("../data/wot/fle_coords.txt", index=False, sep="\t")

### ExprMatrix.h5ad

In [7]:
# Build a stripped-down AnnData that carries only the expression values,
# the cell ids, the gene names and the highly-variable-gene flag.
expr_matrix = anndata.AnnData(adata.X)

# Cell (row) labels.
expr_matrix.obs.index = adata.obs.index

# Gene (column) labels plus the highly-variable flag, copied positionally.
gene_table = expr_matrix.var
gene_table.index = adata.var.index
gene_table["highly_variable"] = adata.var["highly_variable"].values

In [8]:
# Save the full expression matrix (all genes) as an h5ad file.
expr_matrix.write_h5ad("../data/wot/ExprMatrix.h5ad")

### ExprMatrix.var.genes.h5ad

In [9]:
# Restrict the expression matrix to the genes flagged as highly variable.
hv_mask = expr_matrix.var["highly_variable"]
hv_gene_names = expr_matrix.var.loc[hv_mask].index

# Only X is carried over; obs / var labels are re-attached explicitly.
expr_matrix_var_genes = anndata.AnnData(expr_matrix[:, hv_mask].X)
expr_matrix_var_genes.obs.index = expr_matrix.obs.index
expr_matrix_var_genes.var.index = hv_gene_names

expr_matrix_var_genes.write_h5ad("../data/wot/ExprMatrix.var.genes.h5ad")

### cell_days.txt

In [10]:
# Per-cell time point table: cell id plus the value of obs["day_float"].
days_df = pd.DataFrame(adata.obs.index, columns=["id"]).assign(
    day=adata.obs["day_float"].values
)

In [11]:
# Write the id/day table as a tab-separated file with a header row and
# without the positional index column.
days_df.to_csv("../data/wot/cell_days.txt", index=False, sep="\t")

### cell_sets.pkl

In [12]:
# Map every annotation label to the list of cell ids carrying that label.
#
# Series.unique() is used instead of `.values.unique()` so this also works
# when the column is a plain object/string column (an ndarray has no
# .unique()). The ids are taken straight from the obs index with a boolean
# mask, which avoids slicing the whole AnnData once per label.
annotations = adata.obs["annotation"]
cell_type_dict = {
    annotation: adata.obs.index[(annotations == annotation).to_numpy()].tolist()
    for annotation in annotations.unique()
}

In [13]:
# Persist the label -> cell-id mapping as a pickle file.
cell_sets_path = "../data/wot/cell_sets.pkl"
with open(cell_sets_path, mode="wb") as pickle_file:
    pickle.dump(cell_type_dict, pickle_file)

### pgclc_cell_ids.txt

In [14]:
# Annotation labels to select (despite the variable name, these are values of
# obs["annotation"], not individual cell names).
pgclc_cell_names = ["iMeLC", "PGCLC precursor", "PGCLC"]

In [15]:
# Ids of all cells whose annotation is one of the selected labels, returned
# as a Series so it can be written out with to_csv.
has_selected_annotation = adata.obs["annotation"].isin(pgclc_cell_names)
pgclc_cell_ids = adata.obs.index[has_selected_annotation].to_series()

In [16]:
# Write one cell id per line, with no header and no index column.
# A newline must not be used as the field separator: it collides with the
# line terminator and is rejected by stricter csv dialect validation. The
# data has a single column, so the separator is never emitted and the tab
# (used by the other files in this notebook) leaves the output unchanged.
pgclc_cell_ids.to_csv(
    "../data/wot/pgclc_cell_ids.txt", index=False, header=False, sep="\t"
)

### batches.txt

In [17]:
import random

# The covariate is a random placeholder label (1 or 2) per cell, not a real
# batch assignment. A dedicated, seeded generator keeps batches.txt identical
# across "Restart & Run All" and does not touch the global random state.
BATCH_SEED = 0
batch_rng = random.Random(BATCH_SEED)

batches = pd.DataFrame(adata.obs.index, columns=["id"])
batches["covariate"] = [batch_rng.randint(1, 2) for _ in range(adata.n_obs)]

In [18]:
# Sanity check: number of cells assigned to each covariate value.
batches["covariate"].value_counts()

covariate
2    5894
1    5877
Name: count, dtype: int64

In [19]:
# Write the id/covariate table as a tab-separated file with a header row and
# without the positional index column.
batches.to_csv("../data/wot/batches.txt", index=False, sep="\t")

### TFs.txt

In [20]:
# Read the transcription-factor list: the file is parsed without a header row
# and with its first column as the index, then transposed so that the gene
# names run down the rows.
tf_table_as_stored = pd.read_csv(
    "../data/TFgenes_name.csv", header=None, index_col=0
)
tf_genes = tf_table_as_stored.T

In [21]:
# Preview the transposed transcription-factor table.
tf_genes

Unnamed: 0,cell
1,ZBTB8B
2,GSX2
3,TBX2
4,PAX8
5,CREB3L1
...,...
1564,CPEB1
1565,ZNF487
1566,NME2
1567,ZNF488


In [22]:
# Write one gene name per line, with no header and no index column.
# A newline must not be used as the field separator: it collides with the
# line terminator and is rejected by stricter csv dialect validation. The
# table has a single column, so the separator is never emitted and the tab
# (used by the other files in this notebook) leaves the output unchanged.
tf_genes.to_csv("../data/wot/TFs.txt", index=False, header=False, sep="\t")