In [1]:
import pandas as pd
from os.path import join
import anndata
import scanpy as sc
import dask.dataframe as dd
import dask.array as da
import numpy as np
import pickle
import os
from self_supervision.paths import DATA_DIR

In [2]:
def get_count_matrix_and_obs(ddf, n_genes=19331, rows_per_partition=1024):
    """Split a dask dataframe of cells into a count matrix and obs metadata.

    Parameters
    ----------
    ddf : dask dataframe
        Must expose an ``'X'`` column whose entries are per-cell expression
        vectors, plus ``'cell_type'`` and ``'tech_sample'`` columns.
    n_genes : int, default 19331
        Number of columns declared for the stacked matrix (one float32
        column per entry of an ``'X'`` vector).
    rows_per_partition : int, default 1024
        Row count declared for every partition when building the dask array.
        This is passed through as-is and NOT verified against the data, so
        it must match the actual partition sizes of ``ddf``.

    Returns
    -------
    x : dask array
        Lazy 2-D array built from the stacked ``'X'`` vectors.
    obs : materialised result of ``.compute()`` on the two metadata columns.
    """

    def _stack_partition(partition):
        # Each partition holds one vector per row; stack them into a
        # (rows, genes) block so dask can treat it as a dense frame.
        return pd.DataFrame(np.vstack(partition.tolist()))

    column_meta = {col: 'f4' for col in range(n_genes)}
    partition_lengths = [rows_per_partition] * ddf.npartitions

    x = (
        ddf['X']
        .map_partitions(_stack_partition, meta=column_meta)
        .to_dask_array(lengths=partition_lengths)
    )
    # Only the metadata is computed eagerly; x stays lazy.
    obs = ddf[['cell_type', 'tech_sample']].compute()

    return x, obs

In [3]:
# Root of the preprocessed parquet store, located under DATA_DIR.
STORE_DIR = join(DATA_DIR, 'merlin_cxg_2023_05_15_sf-log1p')

# Read the training split lazily; split_row_groups=True asks dask to
# partition the data along parquet row groups.
train_path = join(STORE_DIR, 'train')
ddf_train = dd.read_parquet(train_path, split_row_groups=True)

# x_train stays a lazy dask array, obs_train is computed in memory.
x_train, obs_train = get_count_matrix_and_obs(ddf_train)
print('Train data: ', x_train.shape)

Train data:  (15240192, 19331)


In [12]:
# Randomly subsample the training data.
perc = 10  # percentage of training cells to keep
seed = 0   # fixed seed so that the same cells are drawn on every run
total_rows = x_train.shape[0]
rows_to_select = int(total_rows * (perc / 100))

# Draw row positions without replacement and sort them. Sorting keeps the
# original cell order and lets the SAME index array address both the count
# matrix and the obs table below.
rng = np.random.default_rng(seed)
random_indices = np.sort(rng.choice(total_rows, size=rows_to_select, replace=False))
x_train_sub = x_train[random_indices, :]

print('Subsampled data: ', x_train_sub.shape)

# Select the matching obs rows by position with the identical indexer.
# (Indexing X with unsorted positions and obs with a boolean mask would
# return the two in different orders and silently mislabel the cells.)
obs_train_sub = obs_train.iloc[random_indices]
assert len(obs_train_sub) == x_train_sub.shape[0], 'X and obs must have the same number of rows'

adata = anndata.AnnData(X=x_train_sub, obs=obs_train_sub)

Subsampled data:  (1524019, 19331)


In [13]:
# Bare expression as the last line of the cell: shows the AnnData summary
# (n_obs x n_vars and the available obs columns).
adata

AnnData object with n_obs × n_vars = 1524019 × 19331
    obs: 'cell_type', 'tech_sample'

In [None]:
# For each requested gene-set size, pick the highly variable genes on the
# subsample and store their index labels as a pickled list.
# NOTE(review): files are written to the current working directory as
# 'hvg_<N>_indices.pickle', whereas the verification cell further down reads
# DATA_DIR + 'hvg_indices<N>.pickle' - confirm which naming is intended.
hvg_sizes = [2000, 1000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

for n_top in hvg_sizes:
    # inplace=False returns the per-gene result table instead of writing
    # the annotation into adata.
    hvg_table = sc.pp.highly_variable_genes(adata, n_top_genes=n_top, inplace=False)

    selected = hvg_table[hvg_table['highly_variable']]
    hvg_indices = list(selected.index)

    out_name = 'hvg_' + str(n_top) + '_indices.pickle'
    with open(out_name, 'wb') as f:
        pickle.dump(hvg_indices, f)

In [4]:
# Sanity-check a stored list of highly-variable-gene indices.
# (pickle, os and np come from the import cell at the top of the notebook.)
n_hvgs = 1000

# Step 1: Load the pickle file
# os.path.join yields the same path as plain concatenation when DATA_DIR ends
# with a separator, and still works when it does not.
hvg_path = os.path.join(DATA_DIR, f'hvg_indices{n_hvgs}.pickle')
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open(hvg_path, "rb") as f:
    hvg_indices = pickle.load(f)

# Step 2: Type and Length Check
is_sized = hasattr(hvg_indices, '__len__')
if isinstance(hvg_indices, list) and len(hvg_indices) == n_hvgs:
    print(f"The pickle file contains a list of {n_hvgs} indices.")
else:
    # Do not call len() on objects that do not support it; report 'n/a' instead.
    observed_len = len(hvg_indices) if is_sized else 'n/a'
    print(f"Unexpected content: Type-{type(hvg_indices)}, Length-{observed_len}")

# Step 3: Index Range (optional)
# Skip empty containers (min/max would raise) and accept numpy integer
# scalars as well as built-in ints.
if (
    is_sized
    and len(hvg_indices) > 0
    and all(isinstance(index, (int, np.integer)) for index in hvg_indices)
):
    print(f"Indices range from {min(hvg_indices)} to {max(hvg_indices)}")

The pickle file contains a list of 1000 indices.
Indices range from 15 to 19307
