In [6]:
import scanpy as sc
import celltypist
import pandas as pd
from scipy.io import mmread

def build_and_save_anndata(prefix, label):
    """Assemble one dataset's exported files into an AnnData object and save it.

    Reads ``<prefix>_counts.mtx``, ``<prefix>_genes.csv``, ``<prefix>_cells.csv``
    and ``<prefix>_metadata.csv`` from the working directory, checks that the
    files agree with each other, prints the AnnData summary and writes the
    object to ``<prefix>_clean.h5ad``.

    Parameters
    ----------
    prefix : str
        File-name prefix of the dataset, e.g. ``"epi"``.
    label : str
        Dataset name placed at the start of assertion messages, e.g. ``"Epi"``.

    Returns
    -------
    tuple
        ``(counts, genes, cells, meta, adata)``: the transposed sparse count
        matrix, the gene table, the cell table, the metadata table and the
        AnnData object built from them.

    Raises
    ------
    AssertionError
        If the metadata index and the ``Cell`` column differ (in length or in
        any position), or if the count matrix shape does not equal
        (number of cells, number of genes).
    """
    # Transposed so that matrix rows line up with `obs` (the per-cell metadata)
    # and columns with `var` (the genes) when passed to AnnData below.
    counts = mmread(f"{prefix}_counts.mtx").tocsc().T
    genes = pd.read_csv(f"{prefix}_genes.csv")
    cells = pd.read_csv(f"{prefix}_cells.csv")
    meta = pd.read_csv(f"{prefix}_metadata.csv", index_col=0)

    # Compare lengths first: an element-wise comparison of unequal-length
    # inputs raises a pandas ValueError, which would hide the message below.
    same_length = len(meta.index) == len(cells["Cell"])
    assert same_length and (meta.index == cells["Cell"].to_numpy()).all(), \
        f"{label}: Cell barcodes and metadata rows do not match!"

    # Fail with a readable message before AnnData rejects inconsistent shapes.
    assert counts.shape == (len(cells), len(genes)), (
        f"{label}: Count matrix shape {counts.shape} does not match "
        f"{len(cells)} cells x {len(genes)} genes!"
    )

    adata = sc.AnnData(X=counts, obs=meta, var=pd.DataFrame(index=genes["Gene"]))
    print(adata)
    adata.write(f"{prefix}_clean.h5ad")
    return counts, genes, cells, meta, adata


# The intermediate tables stay bound under their original per-dataset names so
# that any later cell referring to them keeps working.

# --- EPI ---
counts_epi, genes_epi, cells_epi, meta_epi, adata_epi = build_and_save_anndata("epi", "Epi")

# --- ENDO ---
counts_endo, genes_endo, cells_endo, meta_endo, adata_endo = build_and_save_anndata("endo", "Endo")

# --- FIB ---
counts_fib, genes_fib, cells_fib, meta_fib, adata_fib = build_and_save_anndata("fib", "Fib")


def preprocess_and_annotate(filename, model='Human_IPF_Lung.pkl'):
    """Normalise an ``.h5ad`` dataset and label its cells with CellTypist.

    Parameters
    ----------
    filename : str
        Path of the ``.h5ad`` file to read.
    model : str, optional
        CellTypist model handed to ``celltypist.annotate``.

    Returns
    -------
    The object returned by ``celltypist.annotate`` (run with
    ``majority_voting=True``). Its ``predicted_labels`` attribute is a table
    indexed by cell barcode with the columns ``predicted_labels``,
    ``over_clustering`` and ``majority_voting``.
    """
    dataset = sc.read_h5ad(filename)

    # Scale every cell to 10,000 total counts, then log1p-transform. Both calls
    # are made for their effect on `dataset`; their return values are unused.
    sc.pp.normalize_total(dataset, target_sum=1e4)
    sc.pp.log1p(dataset)

    result = celltypist.annotate(dataset, model=model, majority_voting=True)

    # Show the first few rows of the prediction table as a quick sanity check.
    header = f"Top predictions for {filename}:"
    print(header)
    print(result.predicted_labels.head())
    print("\n")

    return result

# Locations of the AnnData files, one per dataset
epi_file, endo_file, fib_file = (
    f"{tissue}_clean.h5ad" for tissue in ("epi", "endo", "fib")
)

# Annotate the datasets one after another, in the order epi, endo, fib
predictions_epi = preprocess_and_annotate(filename=epi_file)
predictions_endo = preprocess_and_annotate(filename=endo_file)
predictions_fib = preprocess_and_annotate(filename=fib_file)





🔬 Input data has 9567 cells and 33694 genes
🔗 Matching reference genes in the model
🧬 3865 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 5889 cells and 33694 genes
🔗 Matching reference genes in the model
🧬 3865 features used for prediction


Top predictions for epi_clean.h5ad:
                              predicted_labels over_clustering  \
GSM3666096_AAACGGGCACCCAGTG-1             ATII               0   
GSM3666096_AAAGATGCAACGATCT-1             ATII               2   
GSM3666096_AAAGATGTCCCTCTTT-1         Ciliated               3   
GSM3666096_AAAGATGTCGCTAGCG-1             ATII               7   
GSM3666096_AACCATGCACGTAAGG-1         Ciliated              10   

                                   majority_voting  
GSM3666096_AAACGGGCACCCAGTG-1  Macrophage_Alveolar  
GSM3666096_AAAGATGCAACGATCT-1                 ATII  
GSM3666096_AAAGATGTCCCTCTTT-1             Ciliated  
GSM3666096_AAAGATGTCGCTAGCG-1                 ATII  
GSM3666096_AACCATGCACGTAAGG-1             Ciliated  




⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!
🔬 Input data has 544 cells and 33694 genes
🔗 Matching reference genes in the model


Top predictions for endo_clean.h5ad:
                                  predicted_labels over_clustering  \
GSM3666096_AAACCTGTCCAAGCCG-1  Macrophage_Alveolar               0   
GSM3666096_AAACGGGTCAACGGGA-1          VE_Arterial               1   
GSM3666096_AAAGTAGCACAACGCC-1            VE_Venous               2   
GSM3666096_AACGTTGTCCGCGCAA-1            Lymphatic               3   
GSM3666096_ACACCAAAGTGTGAAT-1            VE_Venous               5   

                                   majority_voting  
GSM3666096_AAACCTGTCCAAGCCG-1  Macrophage_Alveolar  
GSM3666096_AAACGGGTCAACGGGA-1          VE_Arterial  
GSM3666096_AAAGTAGCACAACGCC-1     VE_Peribronchial  
GSM3666096_AACGTTGTCCGCGCAA-1            Lymphatic  
GSM3666096_ACACCAAAGTGTGAAT-1            VE_Venous  




🧬 3865 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 5
🗳️ Majority voting the predictions
✅ Majority voting done!


Top predictions for fib_clean.h5ad:
                                  predicted_labels over_clustering  \
GSM3666096_AGCTTGATCGTAGGTT-1        Myofibroblast               0   
GSM3666096_ATCGAGTTCACCGTAA-1  Macrophage_Alveolar               1   
GSM3666096_GAGTCCGCAGACGTAG-1           Fibroblast               2   
GSM3666096_GTACTTTCAAGTCTAC-1           Fibroblast               3   
GSM3666096_GTGAAGGCAGACGCTC-1           Fibroblast               4   

                                   majority_voting  
GSM3666096_AGCTTGATCGTAGGTT-1        Myofibroblast  
GSM3666096_ATCGAGTTCACCGTAA-1           Fibroblast  
GSM3666096_GAGTCCGCAGACGTAG-1  Macrophage_Alveolar  
GSM3666096_GTACTTTCAAGTCTAC-1           Fibroblast  
GSM3666096_GTGAAGGCAGACGCTC-1           Fibroblast  




In [7]:
# Write each dataset's prediction table to "<tissue>_predictions.csv"
for tissue, result in (
    ("epi", predictions_epi),
    ("endo", predictions_endo),
    ("fib", predictions_fib),
):
    result.predicted_labels.to_csv(f"{tissue}_predictions.csv")
