In [3]:
import scanpy as sc
import os
import requests
from tqdm import tqdm
from anndata import AnnData as ad
import pandas as pd
from celldreamer.paths import DATA_DIR

Read dataset 

In [4]:
def load_dataset(path, url, new_name=None):
    # Check if the file exists at the specified path
    if not os.path.exists(path):
        # If not, download the file from the URL
        print(f"Dataset not found at {path}. Downloading from {url}...")
        response = requests.get(url, stream=True)
        
        # Get the total file size for progress bar
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        block_size = 1024 # 1 Kilobyte

        progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)

        with open(path, 'wb') as f:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                f.write(data)
        progress_bar.close()

        if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
            print("ERROR, something went wrong")
        
        print("Download complete.")
    
    # Rename the dataset file if new_name is provided
    if new_name:
        os.rename(path, new_name)
        path = new_name

    # Load the dataset using Scanpy
    dataset = sc.read_h5ad(path)
    return dataset

dataset_path = os.path.join(DATA_DIR, 'raw', 'hlca_core.h5ad')
dataset_url = 'https://datasets.cellxgene.cziscience.com/7a3f08f9-5d07-4ddd-a8fe-5967dd34f35f.h5ad'

adata = load_dataset(path=dataset_path, url=dataset_url, new_name=dataset_path)

In [5]:
adata.layers["X_counts"] = adata.raw.X.copy()

In [7]:
sc.pp.filter_genes(adata, min_cells=10)

Preprocess

In [None]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)

Exception origin:
  File "/home/icb/alessandro.palma/miniconda3/envs/celldreamer/lib/python3.10/site-packages/numba/core/types/functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")
[0m[0m
  init_rp_tree(data, dist, current_graph, leaf_array)
Exception origin:
  File "/home/icb/alessandro.palma/miniconda3/envs/celldreamer/lib/python3.10/site-packages/numba/core/types/functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")
[0m[0m
  init_random(n_neighbors, data, current_graph, dist, rng_state)
Exception origin:
  File "/home/icb/alessandro.palma/miniconda3/envs/celldreamer/lib/python3.10/site-packages/numba/core/types/functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")
[0m[0m[0m
  nn_descent_internal_low_memory_parallel(


## Train Test split 

In [None]:
# Generate random indices for train and test sets
total_samples = adata.n_obs
train_frac = 0.8
train_size = int(total_samples * train_frac)
train_indices = np.random.choice(total_samples, size=train_size, replace=False)
test_indices = np.setdiff1d(np.arange(total_samples), train_indices)

# Separate data into train and test sets by indexing the Anndata object
train_adata = adata[train_indices, :]
test_adata = adata[test_indices, :]

In [None]:
train_adata

In [None]:
test_adata

In [None]:
adata

## Save

In [None]:
sc.AnnData.write_h5ad(adata, DATA_DIR / "processed_full_genome" / "hlca_core" / "hlca_core.h5ad")

In [None]:
sc.AnnData.write_h5ad(train_adata, DATA_DIR / "processed_full_genome" / "hlca_core" / "hlca_core_train.h5ad")

In [None]:
sc.AnnData.write_h5ad(test_adata, DATA_DIR / "processed_full_genome" / "hlca_core" / "hlca_core_test.h5ad")