In [7]:
import os

import anndata
from scegot import scEGOT
import pandas as pd
import numpy as np

In [8]:
# --- Input locations -------------------------------------------------------
DATASET_INPUT_ROOT_PATH = "../data"
ANNDATA_FILE_NAME = "scRNAseq_hPGCLC_induction_Saitou.h5ad"
ANNDATA_DATASET_PATH = os.path.join(DATASET_INPUT_ROOT_PATH, ANNDATA_FILE_NAME)
# Disabled: folder path for a CSV version of the dataset.
# CSV_DATASET_FOLDER_PATH = os.path.join(DATASET_INPUT_ROOT_PATH, "usedataSmall/")

# --- Reproducibility -------------------------------------------------------
RANDOM_STATE = 2023  # seed handed to the PCA step

# --- Analysis parameters ---------------------------------------------------
PCA_N_COMPONENTS = 150  # number of principal components kept by preprocessing
# Five entries each; presumably one GMM cluster count per day name -- TODO confirm.
DAY_NAMES = ["day0", "day0.5", "day1", "day1.5", "day2"]
GMM_CLUSTER_NUMBERS = [1, 2, 4, 5, 5]
UMAP_N_NEIGHBORS = 1000

In [9]:
# Read the AnnData file and wrap it in an scEGOT instance; the "cluster_day"
# column of `obs` is passed as the day key.
input_data = anndata.read_h5ad(ANNDATA_DATASET_PATH)
scegot = scEGOT(input_data, verbose=True, adata_day_key="cluster_day")

# Preprocessing options, grouped by the step they configure. They are passed
# as keyword arguments, so the ordering here is for readability only.
preprocess_options = dict(
    # RECODE
    apply_recode=True,
    recode_params={},
    # UMI normalization
    apply_normalization_umi=True,
    umi_target_sum=1e5,
    # log1p normalization
    apply_normalization_log1p=True,
    # gene selection
    select_genes=True,
    n_select_genes=2000,
    # PCA
    pca_random_state=RANDOM_STATE,
    pca_other_params={},
)
X, pca_model = scegot.preprocess(PCA_N_COMPONENTS, **preprocess_options)

Processing AnnData...
Applying RECODE...
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': 15820, '#non-significant genes': 2582, '#silent genes': 65, 'ell': 288, 'Elapsed time': '0h 0m 23s 054ms', 'solver': 'randomized', '#test_data': 2354}
Applying UMI normalization...
Applying log1p normalization...
Applying PCA...
	sum of explained_variance_ratio = 93.67122272048897


In [10]:
# Map each cell's "cluster_day" label to an integer half-day index by doubling
# the number that follows the 3-character "day" prefix
# (e.g. "day0.5" -> 1, "day2" -> 4).
day_list = [
    int(float(day_label[3:]) * 2) for day_label in input_data.obs["cluster_day"]
]
# Show the distinct indices as a sanity check. sorted() keeps the displayed
# order deterministic, which plain set iteration does not guarantee.
sorted(set(day_list))

[0, 1, 2, 3, 4]

In [11]:
# Stack the frames held in scegot.X_pca into a single array and pair it with
# the integer day labels from `day_list`.
# NOTE(review): the labels follow input_data.obs row order, while the embedding
# follows the concatenation order of scegot.X_pca -- confirm the two agree.
embedding_150d = pd.concat(scegot.X_pca).values
sample_labels = np.array(day_list)
assert len(embedding_150d) == len(sample_labels), (
    f"embedding has {len(embedding_150d)} rows but there are "
    f"{len(sample_labels)} labels"
)

to_save = {
    "original_embedding_150d": embedding_150d,
    "sample_labels": sample_labels,
}

# np.savez does not create missing directories, so make sure the target
# folder exists before writing (otherwise a fresh checkout fails here).
output_path = "../output/pgclc_150dim.npz"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savez(output_path, **to_save)