In [2]:
import scanpy as sc
import pandas as pd
import argparse
import anndata as ad
import mygene
import matplotlib.pyplot as plt
import tarfile
import gzip
import os
import tempfile
from scipy import io
from scipy import sparse

In [3]:
def convert_to_h5ad(input_path, output_h5ad_path, extract_dir=None):
    """
    Converts:
    - a .txt.gz file,
    - a .tar archive containing .h5 files (10X),
    - a .tar archive containing .csv.gz files (counts matrices),
    to a .h5ad file.

    Parameters:
    - input_path: str, path to the .txt.gz or .tar file
    - output_h5ad_path: str, where to save the resulting .h5ad file
    - extract_dir: str, optional, directory to extract files (used for .tar)

    Returns:
    - AnnData object
    """
    def apply_qc(adata):
        adata.var_names_make_unique()
        adata.var['mt'] = adata.var_names.str.startswith('MT-')
        ribo_prefix = ("RPS", "RPL")
        adata.var['ribo'] = adata.var_names.str.startswith(ribo_prefix)
        sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo'], percent_top=None, log1p=False, inplace=True)
        adata.layers['counts'] = adata.X.copy()
        return adata

    if input_path.endswith('.txt.gz'):
        print(f"Detected .txt.gz file: {input_path}")
        df = pd.read_csv(input_path, sep='\t', index_col=0)
        print(f"Raw data shape: {df.shape}")
        adata = ad.AnnData(df.transpose())
        adata = apply_qc(adata)
        adata.write(output_h5ad_path)
        print(f"Saved AnnData to {output_h5ad_path}")
        return adata

    elif input_path.endswith('.tar'):
        print(f"Detected .tar archive: {input_path}")

        # Extract to a temporary directory
        extract_dir = tempfile.mkdtemp() if extract_dir is None else extract_dir
        print(f"Extracting {input_path} to {extract_dir}...")
        with tarfile.open(input_path, "r") as tar:
            tar.extractall(extract_dir)
        print("Extraction complete.")

        h5_files = []
        csv_files = []
        txt_gz_files = []

        for root, dirs, files in os.walk(extract_dir):
            for file in files:
                full_path = os.path.join(root, file)
                if file.endswith(".h5"):
                    h5_files.append(full_path)
                elif file.endswith(".csv.gz"):
                    if "counts" in file.lower():  # ✅ Only include files with 'counts'
                       csv_files.append(full_path)
                    else:
                        print(f"Skipping non-counts file: {file}")
                elif file.endswith(".txt.gz"):
                    txt_gz_files.append(full_path)

        adatas = []

        if h5_files:
            print(f"Found {len(h5_files)} h5 file(s):")
            for h5_file in h5_files:
                print(f" - {h5_file}")
                ad_obj = sc.read_10x_h5(h5_file)
                ad_obj = apply_qc(ad_obj)
                adatas.append(ad_obj)

        if csv_files:
            print(f"Found {len(csv_files)} csv.gz file(s):")
            for csv_file in csv_files:
                print(f" - {csv_file}")
                df = pd.read_csv(csv_file, index_col=0)
                print(f"   Loaded shape: {df.shape}")

                # Convert to sparse matrix to save RAM
                X_sparse = sparse.csr_matrix(df.values)

                # Build AnnData in sparse format
                ad_obj = ad.AnnData(X_sparse, obs=pd.DataFrame(index=df.index), var=pd.DataFrame(index=df.columns))
                ad_obj = apply_qc(ad_obj)
                adatas.append(ad_obj)

        if txt_gz_files:
            print(f"Found {len(txt_gz_files)} txt.gz file(s):")
            for txt_file in txt_gz_files:
                print(f" - {txt_file}")
                df = pd.read_csv(txt_file, sep='\t', index_col=0)
                ad_obj = ad.AnnData(df.transpose())
                ad_obj = apply_qc(ad_obj)
                adatas.append(ad_obj)

        # Combine all AnnData objects WITHOUT batch info
        if len(adatas) > 1:
            print("Merging multiple AnnData objects...")
            adata_combined = ad.concat(adatas, merge='same', join='outer')  # outer keeps all genes
        elif adatas:
            adata_combined = adatas[0]
        else:
            raise ValueError("No supported files found in the archive.")

        print(f"Saving combined AnnData to {output_h5ad_path}...")
        adata_combined.write(output_h5ad_path)
        print("Conversion complete.")
        return adata_combined


In [4]:
adata11 = convert_to_h5ad("GSE116555_raw_counts.txt.gz", "GSE116555_raw_counts.h5ad")

Detected .txt.gz file: GSE116555_raw_counts.txt.gz
Raw data shape: (18397, 5443)
Saved AnnData to GSE116555_raw_counts.h5ad


In [5]:
adata14 = convert_to_h5ad("GSE125416_RAW.tar", "GSE125416_RAW.h5ad")

Detected .tar archive: GSE125416_RAW.tar
Extracting GSE125416_RAW.tar to /tmp/tmpey70ls73...
Extraction complete.
Found 2 h5 file(s):
 - /tmp/tmpey70ls73/GSM3573650_N_filtered_gene_bc_matrices_h5.h5


  utils.warn_names_duplicates("var")


 - /tmp/tmpey70ls73/GSM3573649_D_filtered_gene_bc_matrices_h5.h5


  utils.warn_names_duplicates("var")


Merging multiple AnnData objects...


  utils.warn_names_duplicates("obs")


Saving combined AnnData to GSE125416_RAW.h5ad...
Conversion complete.


In [6]:
adata16 = convert_to_h5ad("GSE139550_RAW.tar", "GSE139550_RAW.h5ad")

Detected .tar archive: GSE139550_RAW.tar
Extracting GSE139550_RAW.tar to /tmp/tmp2jl2kxyf...
Extraction complete.
Skipping non-counts file: GSM4143592_Day4_preprocessed.csv.gz
Skipping non-counts file: GSM4143593_Day8_accutase_preprocessed.csv.gz
Skipping non-counts file: GSM4143594_Day8_suspension_preprocessed.csv.gz
Skipping non-counts file: GSM4143595_Day4_macro-coculture_preprocessed.csv.gz
Found 4 csv.gz file(s):
 - /tmp/tmp2jl2kxyf/GSM4143594_Day8_suspension_counts.csv.gz
   Loaded shape: (4961, 18901)




 - /tmp/tmp2jl2kxyf/GSM4143593_Day8_accutase_counts.csv.gz
   Loaded shape: (5555, 20445)




 - /tmp/tmp2jl2kxyf/GSM4143592_Day4_counts.csv.gz
   Loaded shape: (5253, 19909)




 - /tmp/tmp2jl2kxyf/GSM4143595_Day4_macro_co-culture_counts.csv.gz
   Loaded shape: (4320, 16787)




Merging multiple AnnData objects...


  utils.warn_names_duplicates("obs")


Saving combined AnnData to GSE139550_RAW.h5ad...
Conversion complete.


In [7]:
adata11

AnnData object with n_obs × n_vars = 5443 × 18397
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
    var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    layers: 'counts'

In [8]:
adata14

AnnData object with n_obs × n_vars = 12493 × 33694
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'mt', 'ribo'
    layers: 'counts'

In [9]:
adata16

AnnData object with n_obs × n_vars = 20089 × 21999
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
    layers: 'counts'

In [10]:
adata11.var_names

Index(['AL627309.1', 'AP006222.2', 'RP11-206L10.2', 'RP11-206L10.9',
       'LINC00115', 'FAM41C', 'RP11-54O7.1', 'RP11-54O7.2', 'RP11-54O7.3',
       'SAMD11',
       ...
       'TFF3', 'AP001626.2', 'AP001628.7', 'DNMT3L', 'AP001059.5', 'AIRE',
       'LRRC3-AS1', 'C21orf90', 'ITGB2-AS1', 'AP001469.7'],
      dtype='object', length=18397)

In [11]:
adata14.var_names

Index(['RP11-34P13.3', 'FAM138A', 'OR4F5', 'RP11-34P13.7', 'RP11-34P13.8',
       'RP11-34P13.14', 'RP11-34P13.9', 'FO538757.3', 'FO538757.2',
       'AP006222.2',
       ...
       'AC007325.2', 'BX072566.1', 'AL354822.1', 'AC023491.2', 'AC004556.1',
       'AC233755.2', 'AC233755.1', 'AC240274.1', 'AC213203.1', 'FAM231B'],
      dtype='object', length=33694)

In [12]:
adata16.var_names

Index(['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A4GALT', 'A4GNT', 'AAAS', 'AACS',
       'AADAC', 'AADAT',
       ...
       'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX',
       'ZZEF1', 'ZZZ3'],
      dtype='object', length=21999)

In [13]:
def fetch_gene_annotations(adata, species='human'):
    """
    Fetches gene annotations (chromosome, start, end, strand) from MyGeneInfo using either gene symbols
    or Ensembl IDs based on availability in adata.

    Parameters:
    - adata: AnnData object (should have either gene symbols in adata.var_names or Ensembl IDs in adata.var['gene_ids'])
    - species: str, species name (default: 'human')

    Returns:
    - query_df: pd.DataFrame with annotation info
    """
    mg = mygene.MyGeneInfo()

    # Check if Ensembl IDs are available in adata.var['gene_ids']
    if 'gene_ids' in adata.var:
        print("Using Ensembl IDs for querying.")
        gene_info = mg.querymany(
            adata.var['gene_ids'].tolist(),
            scopes='ensembl.gene',
            fields='genomic_pos,symbol',
            species=species
        )
    # If Ensembl IDs are not available, fall back to gene symbols in adata.var_names
    elif 'gene_ids' not in adata.var and len(adata.var_names) > 0:
        print("Using gene symbols for querying.")
        gene_info = mg.querymany(
            adata.var_names.tolist(),
            scopes='symbol',
            fields='genomic_pos,symbol',
            species=species
        )
    else:
        raise ValueError("Neither Ensembl IDs nor gene symbols found in adata.")

    # Convert to DataFrame
    query_df = pd.DataFrame(gene_info)

    # Safely extract genomic position info
    def extract_field(x, field):
        if isinstance(x, dict):
            return x.get(field)
        elif isinstance(x, list) and isinstance(x[0], dict):  # handle lists of dicts
            return x[0].get(field)
        else:
            return None

    query_df['chromosome'] = query_df['genomic_pos'].apply(lambda x: extract_field(x, 'chr'))
    query_df['start'] = query_df['genomic_pos'].apply(lambda x: extract_field(x, 'start'))
    query_df['end'] = query_df['genomic_pos'].apply(lambda x: extract_field(x, 'end'))
    query_df['strand'] = query_df['genomic_pos'].apply(lambda x: extract_field(x, 'strand'))

    # Check the first few rows
    print(query_df[['query', 'chromosome', 'start', 'end', 'strand']].head())

    return query_df

In [14]:
query_df_11 = fetch_gene_annotations(adata11)

INFO:biothings.client:querying 1-1000 ...


Using gene symbols for querying.


INFO:biothings.client:querying 1001-2000 ...
INFO:biothings.client:querying 2001-3000 ...
INFO:biothings.client:querying 3001-4000 ...
INFO:biothings.client:querying 4001-5000 ...
INFO:biothings.client:querying 5001-6000 ...
INFO:biothings.client:querying 6001-7000 ...
INFO:biothings.client:querying 7001-8000 ...
INFO:biothings.client:querying 8001-9000 ...
INFO:biothings.client:querying 9001-10000 ...
INFO:biothings.client:querying 10001-11000 ...
INFO:biothings.client:querying 11001-12000 ...
INFO:biothings.client:querying 12001-13000 ...
INFO:biothings.client:querying 13001-14000 ...
INFO:biothings.client:querying 14001-15000 ...
INFO:biothings.client:querying 15001-16000 ...
INFO:biothings.client:querying 16001-17000 ...
INFO:biothings.client:querying 17001-18000 ...
INFO:biothings.client:querying 18001-18397 ...
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.


           query chromosome     start       end  strand
0     AL627309.1       None       NaN       NaN     NaN
1     AP006222.2       None       NaN       NaN     NaN
2  RP11-206L10.2       None       NaN       NaN     NaN
3  RP11-206L10.9       None       NaN       NaN     NaN
4      LINC00115          1  586945.0  827989.0    -1.0


In [15]:
query_df_14 = fetch_gene_annotations(adata14)

INFO:biothings.client:querying 1-1000 ...


Using Ensembl IDs for querying.


INFO:biothings.client:querying 1001-2000 ...
INFO:biothings.client:querying 2001-3000 ...
INFO:biothings.client:querying 3001-4000 ...
INFO:biothings.client:querying 4001-5000 ...
INFO:biothings.client:querying 5001-6000 ...
INFO:biothings.client:querying 6001-7000 ...
INFO:biothings.client:querying 7001-8000 ...
INFO:biothings.client:querying 8001-9000 ...
INFO:biothings.client:querying 9001-10000 ...
INFO:biothings.client:querying 10001-11000 ...
INFO:biothings.client:querying 11001-12000 ...
INFO:biothings.client:querying 12001-13000 ...
INFO:biothings.client:querying 13001-14000 ...
INFO:biothings.client:querying 14001-15000 ...
INFO:biothings.client:querying 15001-16000 ...
INFO:biothings.client:querying 16001-17000 ...
INFO:biothings.client:querying 17001-18000 ...
INFO:biothings.client:querying 18001-19000 ...
INFO:biothings.client:querying 19001-20000 ...
INFO:biothings.client:querying 20001-21000 ...
INFO:biothings.client:querying 21001-22000 ...
INFO:biothings.client:querying

             query chromosome    start      end  strand
0  ENSG00000243485          1  28589.0  31109.0     1.0
1  ENSG00000237613          1  34553.0  37595.0    -1.0
2  ENSG00000186092          1  65419.0  71585.0     1.0
3  ENSG00000238009       None      NaN      NaN     NaN
4  ENSG00000239945          1  89551.0  91105.0    -1.0


In [16]:
query_df_16 = fetch_gene_annotations(adata16)

INFO:biothings.client:querying 1-1000 ...


Using gene symbols for querying.


INFO:biothings.client:querying 1001-2000 ...
INFO:biothings.client:querying 2001-3000 ...
INFO:biothings.client:querying 3001-4000 ...
INFO:biothings.client:querying 4001-5000 ...
INFO:biothings.client:querying 5001-6000 ...
INFO:biothings.client:querying 6001-7000 ...
INFO:biothings.client:querying 7001-8000 ...
INFO:biothings.client:querying 8001-9000 ...
INFO:biothings.client:querying 9001-10000 ...
INFO:biothings.client:querying 10001-11000 ...
INFO:biothings.client:querying 11001-12000 ...
INFO:biothings.client:querying 12001-13000 ...
INFO:biothings.client:querying 13001-14000 ...
INFO:biothings.client:querying 14001-15000 ...
INFO:biothings.client:querying 15001-16000 ...
INFO:biothings.client:querying 16001-17000 ...
INFO:biothings.client:querying 17001-18000 ...
INFO:biothings.client:querying 18001-19000 ...
INFO:biothings.client:querying 19001-20000 ...
INFO:biothings.client:querying 20001-21000 ...
INFO:biothings.client:querying 21001-21999 ...
INFO:biothings.client:Finished

    query chromosome       start         end  strand
0    A1BG         19  58345178.0  58353492.0    -1.0
1    A1CF         10  50799409.0  50885675.0    -1.0
2     A2M         12   9067664.0   9116229.0    -1.0
3   A2ML1         12   8822621.0   8887001.0     1.0
4  A4GALT         22  42692121.0  42721298.0    -1.0


In [17]:
adata11.var

Unnamed: 0,mt,ribo,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
AL627309.1,False,False,78,0.014330,98.566967,78
AP006222.2,False,False,3,0.000551,99.944883,3
RP11-206L10.2,False,False,17,0.003123,99.687672,17
RP11-206L10.9,False,False,36,0.006981,99.338600,38
LINC00115,False,False,76,0.015249,98.603711,83
...,...,...,...,...,...,...
AIRE,False,False,4,0.000735,99.926511,4
LRRC3-AS1,False,False,5,0.000919,99.908139,5
C21orf90,False,False,2,0.000367,99.963256,2
ITGB2-AS1,False,False,3,0.000551,99.944883,3


In [18]:
adata14.var

Unnamed: 0,gene_ids,mt,ribo
RP11-34P13.3,ENSG00000243485,False,False
FAM138A,ENSG00000237613,False,False
OR4F5,ENSG00000186092,False,False
RP11-34P13.7,ENSG00000238009,False,False
RP11-34P13.8,ENSG00000239945,False,False
...,...,...,...
AC233755.2,ENSG00000277856,False,False
AC233755.1,ENSG00000275063,False,False
AC240274.1,ENSG00000271254,False,False
AC213203.1,ENSG00000277475,False,False


In [19]:
adata16.var

A1BG
A1CF
A2M
A2ML1
A4GALT
...
ZYG11A
ZYG11B
ZYX
ZZEF1
ZZZ3


In [20]:
def merge_gene_positional_info(adata, query_df, gene_id_col=None):
    """
    Cleans the mygene query DataFrame and merges genomic location info into adata.var.

    Parameters:
    - adata: AnnData object to update.
    - query_df: DataFrame from mygene query with 'query', 'chromosome', 'start', 'end', 'strand', and 'symbol'.
    - gene_id_col: Optional str. Column name in adata.var to use for merging.
                   If None, uses adata.var_names (gene symbols).

    Returns:
    - Updated AnnData object.
    """

    # Clean up query_df: drop rows missing key positional info
    clean_query_df = query_df.dropna(subset=['chromosome', 'start', 'end'])

    # DROP DUPLICATES by 'symbol' to make sure we have a unique index for mapping
    clean_query_df = clean_query_df.drop_duplicates(subset=['symbol'])

    print(f"Cleaned query DataFrame has {len(clean_query_df)} unique symbols after filtering.")

    # If gene_id_col isn't provided, assume gene symbols are in var_names
    if gene_id_col is None:
        gene_id_col = 'gene_symbols'
        adata.var[gene_id_col] = adata.var_names

    # Set up a dictionary to map gene symbols to positional info
    positional_cols = ['chromosome', 'start', 'end', 'strand']
    for col in positional_cols:
        mapping = clean_query_df.set_index('symbol')[col]
        adata.var[col] = adata.var[gene_id_col].map(mapping)

    print("Merged gene positional info into adata.var. Here’s a preview:")
    print(adata.var.head())

    # Optionally drop the temporary column to keep things clean
    if gene_id_col == 'gene_symbols' and 'gene_symbols' not in adata.var_names:
        adata.var = adata.var.drop(columns=[gene_id_col])

    return adata


In [21]:
adata11 = merge_gene_positional_info(adata11, query_df_11)

Cleaned query DataFrame has 14489 unique symbols after filtering.
Merged gene positional info into adata.var. Here’s a preview:
                  mt   ribo  n_cells_by_counts  mean_counts  \
AL627309.1     False  False                 78     0.014330   
AP006222.2     False  False                  3     0.000551   
RP11-206L10.2  False  False                 17     0.003123   
RP11-206L10.9  False  False                 36     0.006981   
LINC00115      False  False                 76     0.015249   

               pct_dropout_by_counts  total_counts   gene_symbols chromosome  \
AL627309.1                 98.566967            78     AL627309.1        NaN   
AP006222.2                 99.944883             3     AP006222.2        NaN   
RP11-206L10.2              99.687672            17  RP11-206L10.2        NaN   
RP11-206L10.9              99.338600            38  RP11-206L10.9        NaN   
LINC00115                  98.603711            83      LINC00115          1   

            

In [22]:
adata14 = merge_gene_positional_info(adata14, query_df_14)

Cleaned query DataFrame has 25714 unique symbols after filtering.
Merged gene positional info into adata.var. Here’s a preview:
                     gene_ids     mt   ribo  gene_symbols chromosome    start  \
RP11-34P13.3  ENSG00000243485  False  False  RP11-34P13.3        NaN      NaN   
FAM138A       ENSG00000237613  False  False       FAM138A          1  34553.0   
OR4F5         ENSG00000186092  False  False         OR4F5          1  65419.0   
RP11-34P13.7  ENSG00000238009  False  False  RP11-34P13.7        NaN      NaN   
RP11-34P13.8  ENSG00000239945  False  False  RP11-34P13.8        NaN      NaN   

                  end  strand  
RP11-34P13.3      NaN     NaN  
FAM138A       37595.0    -1.0  
OR4F5         71585.0     1.0  
RP11-34P13.7      NaN     NaN  
RP11-34P13.8      NaN     NaN  


In [23]:
print(adata14.var_names[:10])

Index(['RP11-34P13.3', 'FAM138A', 'OR4F5', 'RP11-34P13.7', 'RP11-34P13.8',
       'RP11-34P13.14', 'RP11-34P13.9', 'FO538757.3', 'FO538757.2',
       'AP006222.2'],
      dtype='object')


In [24]:
adata16 = merge_gene_positional_info(adata16, query_df_16)

Cleaned query DataFrame has 17265 unique symbols after filtering.
Merged gene positional info into adata.var. Here’s a preview:
       gene_symbols chromosome       start         end  strand
A1BG           A1BG         19  58345178.0  58353492.0    -1.0
A1CF           A1CF         10  50799409.0  50885675.0    -1.0
A2M             A2M         12   9067664.0   9116229.0    -1.0
A2ML1         A2ML1         12   8822621.0   8887001.0     1.0
A4GALT       A4GALT         22  42692121.0  42721298.0    -1.0


In [25]:
def preprocess_and_analyze(ad, resolution=1.0, output_h5ad_path=None):
    """
    Preprocess and analyze scRNA-seq data in AnnData format. This includes quality control,
    filtering, normalization, PCA, clustering, and UMAP visualization.

    Parameters:
    - ad: AnnData object
    - resolution: Resolution parameter for the Leiden clustering (default is 1.0)

    Returns:
    - ad: The processed AnnData object after all steps
    """

    # Step 1: QC and filtering based on mitochondrial and ribosomal genes
    ad.var['mt'] = ad.var_names.str.startswith('MT-')
    ribo_prefix = ("RPS", "RPL")
    ad.var['ribo'] = ad.var_names.str.startswith(ribo_prefix)
    sc.pp.calculate_qc_metrics(ad, qc_vars=['mt', 'ribo'], percent_top=None, log1p=False, inplace=True)

    # Step 2: Filter out cells with more than 20% mitochondrial content
    ad = ad[ad.obs['pct_counts_mt'] < 20]

    # Step 3: Filter cells based on the number of genes detected
    sc.pp.filter_cells(ad, min_genes=500)

    # Step 4: Filter cells based on total counts (max 30,000 counts)
    sc.pp.filter_cells(ad, max_counts=30000)

    # Step 5: Filter genes that are detected in at least 3 cells
    sc.pp.filter_genes(ad, min_cells=3)

    # Step 6: Normalize the counts data
    ad.layers['counts'] = ad.X.copy()
    sc.pp.normalize_total(ad, target_sum=1e4)
    sc.pp.log1p(ad)
    ad.layers['lognorm'] = ad.X.copy()

    # Step 7: Identify highly variable genes
    sc.pp.highly_variable_genes(ad, min_mean=0.0125, max_mean=6, min_disp=0.25)

    # Step 8: Perform PCA on highly variable genes
    sc.tl.pca(ad, use_highly_variable=True)

    # Step 9: Compute neighbors based on PCA results
    sc.pp.neighbors(ad, n_neighbors=20, n_pcs=15)

    # Step 10: Perform Leiden clustering
    sc.tl.leiden(ad, resolution=resolution)
    ad.obs['cell_type'] = ad.obs['leiden']  # Assign clusters as 'cell_type'

    # Step 11: Perform UMAP dimensionality reduction
    sc.tl.umap(ad)

    # Step 12: Plot UMAP colored by cell type
    sc.pl.umap(ad, color="cell_type", legend_loc="on data", frameon=False, show=False)

    # Save UMAP plot based on output_h5ad_path if provided
    if output_h5ad_path:
        base_name = os.path.splitext(os.path.basename(output_h5ad_path))[0]
        umap_filename = f'umap_{base_name}.png'
    else:
        umap_filename = 'umap_clusters.png'

    plt.savefig(umap_filename)
    plt.close()

    # Save processed anndata as a new h5ad file
    ad.write(output_h5ad_path)

    return ad

In [26]:
adata11 = preprocess_and_analyze(adata11, resolution=1.2, output_h5ad_path="adata11_clean.h5ad")

  adata.obs["n_genes"] = number

 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(ad, resolution=resolution)


In [27]:
adata14 = preprocess_and_analyze(adata14, resolution=1.2, output_h5ad_path="adata14_clean.h5ad")

  adata.obs["n_genes"] = number
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [28]:
adata16 = preprocess_and_analyze(adata16, resolution=1.2, output_h5ad_path="adata16_clean.h5ad")

  adata.obs["n_genes"] = number
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
