In [1]:
import scanpy as sc
import h5py
import pandas as pd

In [2]:

path = '../data/'

# Per-sample input files, listed in matching order:
# entry k of each list belongs to sample k+1.
filtered_feature_paths = [
    "filtered_feature_bc_matrix.h5",
    "filtered_feature_bc_matrix_Sample2.h5",
    "filtered_feature_bc_matrix_Sample3.h5",
    "filtered_feature_bc_matrix_Sample4.h5"
]

molecule_info_paths = [
    "molecule_info.h5",
    "molecule_info_Sample2.h5",
    "molecule_info_Sample3.h5",
    "molecule_info_Sample4.h5"
]

# zip() silently truncates to the shorter list, which would drop samples
# without any error; fail loudly instead.
if len(filtered_feature_paths) != len(molecule_info_paths):
    raise ValueError(
        "filtered_feature_paths and molecule_info_paths must have the same length"
    )

# anndata_objects[k] : AnnData for sample k+1
# mi_objects[k]      : dict mapping top-level dataset name -> in-memory array
#                      read from the matching molecule_info file
anndata_objects = []
mi_objects = []
for i, (ff_path, mi_path) in enumerate(zip(filtered_feature_paths, molecule_info_paths), start=1):
    # Load the filtered feature matrix
    adata = sc.read_10x_h5(path + ff_path)

    # The loaded matrices contain duplicate var names (scanpy warns about
    # this); make them unique so name-based gene lookups are unambiguous.
    adata.var_names_make_unique()

    # Tag every cell with its sample number and prefix the barcode with it,
    # so barcodes stay distinguishable across samples.
    adata.obs['sample'] = f"{i}"
    adata.obs.index = [f'{i}_{x}' for x in adata.obs.index]

    # Read the molecule-info datasets while the file is still open. Storing
    # the h5py.File handle itself would leave only closed, unusable handles
    # in mi_objects once the `with` block exits.
    # NOTE(review): this reads every top-level dataset fully into memory;
    # restrict to the datasets actually needed if the files are large.
    # Nested groups are skipped.
    with h5py.File(path + mi_path, 'r') as f:
        mi_objects.append({
            name: node[()]
            for name, node in f.items()
            if isinstance(node, h5py.Dataset)
        })

    # Add the AnnData object to the list and show its gene index
    anndata_objects.append(adata)
    display(adata.var.index)

# Display each AnnData object
for i, adata in enumerate(anndata_objects, start=1):
    print(f"Sample {i}: {adata}")


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Index(['MIR1302-10', 'FAM138A', 'OR4F5', 'RP11-34P13.7', 'RP11-34P13.8',
       'AL627309.1', 'RP11-34P13.14', 'RP11-34P13.9', 'AP006222.2',
       'RP4-669L17.10',
       ...
       'KIR3DL2', 'AL590523.1', 'CT476828.1', 'PNRC2', 'SRSF10', 'AC145205.1',
       'BAGE5', 'CU459201.1', 'AC002321.2', 'AC002321.1'],
      dtype='object', length=32738)

Sample 1: AnnData object with n_obs × n_vars = 8743 × 32738
    obs: 'sample'
    var: 'gene_ids', 'feature_types', 'genome'


In [None]:
import scanpy as sc

# Configure scanpy logging and figure defaults, and record package versions
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Run the same filter -> normalize -> HVG -> PCA -> neighbors -> UMAP ->
# Leiden -> marker-gene pipeline on each sample independently.
# NOTE: every step below modifies the objects in `anndata_objects` in place,
# so re-running this cell without re-running the loading cell would filter,
# normalize and log-transform already-processed data again.
for i, adata in enumerate(anndata_objects, start=1):
    # Filter cells and genes
    sc.pp.filter_cells(adata, min_counts=50)
    sc.pp.filter_genes(adata, min_cells=5)

    # Normalize and log transform
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Keep a snapshot of the log-normalized expression. sc.pp.scale below
    # overwrites adata.X with z-scored, clipped values; without .raw the
    # marker-gene test would be computed on those scaled values.
    # sc.tl.rank_genes_groups uses .raw by default when it is present.
    adata.raw = adata

    # Identify highly variable genes
    sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=2000)

    # Scale the data and apply PCA
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, svd_solver='arpack')

    # Build the neighborhood graph
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

    # Run UMAP for dimensionality reduction
    sc.tl.umap(adata)

    # Perform Leiden clustering
    sc.tl.leiden(adata, resolution=1.0)

    # Find marker genes for each cluster (tested on adata.raw, see above)
    sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')

    # Visualize UMAP and clusters for each sample
    print(f"Visualizations for Sample {i}")
    sc.pl.umap(adata, color=['leiden'], legend_loc='on data', title=f"Sample {i} - UMAP with Leiden Clusters")
    sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, title=f"Sample {i} - Marker Genes")

    # Optional: Save each processed AnnData object if needed
    # adata.write(f"processed_sample_{i}.h5ad")



-----
anndata     0.9.2
scanpy      1.9.5
-----
PIL                         9.0.1
appnope                     0.1.2
asciitree                   NA
asttokens                   NA
backcall                    0.2.0
bottleneck                  1.3.6
cffi                        1.15.0
cloudpickle                 2.0.0
colorama                    0.4.4
cycler                      0.10.0
cython_runtime              NA
cytoolz                     0.11.0
dask                        2022.02.1
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   4.4.2
defusedxml                  0.7.1
entrypoints                 0.4
executing                   0.8.3
fasteners                   0.17.3
fsspec                      2022.02.0
gmpy2                       2.1.2
google                      NA
h5py                        3.6.0
igraph                      0.9.11
importlib_resources         NA
ipykernel                   6.9.1
ipython_genutils            0.2.0
jedi

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes


  disp_grouped = df.groupby('mean_bin')['dispersions']


    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 40


OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
