In [2]:
import scanpy as sc
import re
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib_venn import venn2

In [3]:
# Path of the inferCNV result file. Defined once and used both for loading and,
# later, as the base name of the saved confusion-matrix PNG, so the two cannot
# drift apart.
h5ad_filename = 'infer_CNV_final_calls_ta.h5ad'
adata = sc.read_h5ad(h5ad_filename)

In [4]:
# Display the AnnData summary: dimensions and the keys available in obs/var/uns/obsm/layers.
adata

AnnData object with n_obs × n_vars = 10309 × 19186
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'n_counts', 'cell_type', 'simulated_cnvs', 'leiden', 'cnv_leiden', 'cnv_score', 'cnv_calls'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'chromosome', 'start', 'end', 'strand', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'cell_type_colors', 'cnv', 'cnv_calls_colors', 'cnv_leiden', 'cnv_leiden_colors', 'cnv_neighbors', 'dendrogram_cnv_leiden', 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_cnv', 'X_cnv_pca', 'X_cnv_umap', 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts', 'lognorm'
    obsp: 'cnv_neighbors_connectivities', 'cnv_neighbors_distances', 'connectivities', 'distances'

In [5]:
# Inspect the per-cell metadata; 'simulated_cnvs' and 'cnv_calls' are the columns used for the evaluation.
adata.obs

Unnamed: 0,n_genes_by_counts,total_counts,total_counts_ribo,pct_counts_ribo,total_counts_mt,pct_counts_mt,n_genes,n_counts,cell_type,simulated_cnvs,leiden,cnv_leiden,cnv_score,cnv_calls
AAACCCAAGCGCCCAT-1,1004,1759.0,0.0,0.0,0.0,0.0,1004,1759.0,CD4 T cell,,0,6,0.007487,normal
AAACCCAAGGTTCCGC-1,4090,14227.0,0.0,0.0,0.0,0.0,4090,14227.0,Dendritic,,6,8,0.007981,normal
AAACCCACAGAGTTGG-1,1739,4205.0,0.0,0.0,0.0,0.0,1739,4205.0,CD14 monocyte,22:19807132-29743868 (CN 0),1,4,0.005639,normal
AAACCCACAGGTATGG-1,2119,4351.0,0.0,0.0,0.0,0.0,2119,4351.0,NK cell,,4,2,0.008430,tumor
AAACCCACATAGTCAC-1,1517,2815.0,0.0,0.0,0.0,0.0,1517,2815.0,B cell,,2,1,0.009757,tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTGCGTCGT-1,1242,2275.0,0.0,0.0,0.0,0.0,1242,2275.0,CD4 T cell,,0,2,0.008430,tumor
TTTGTTGGTGTCATGT-1,1243,2546.0,0.0,0.0,0.0,0.0,1243,2546.0,CD14 monocyte,"22:19807132-29743868 (CN 0), 6:25435484-350352...",1,0,0.009492,normal
TTTGTTGGTTTGAACC-1,1302,2462.0,0.0,0.0,0.0,0.0,1302,2462.0,CD8 T cell,,3,12,0.007620,normal
TTTGTTGTCCAAGCCG-1,1572,3086.0,0.0,0.0,0.0,0.0,1572,3086.0,CD4 T cell,X:106533974-112956833 (CN 4),0,7,0.007975,normal


In [6]:

def reformat_cnvs_to_binary(adata):
    """
    Add two boolean per-cell columns to ``adata.obs`` (modified in place).

    - 'formatted_cnv_calls': True where 'cnv_calls' equals 'tumor', False otherwise
      (missing values count as False).
    - 'formatted_simulated_cnvs': True where 'simulated_cnvs' begins with a CNV entry
      in one of the accepted formats, False otherwise (empty or missing values
      count as False).

    Accepted CNV formats:
    - 'chrom:start-end (gain/loss)'
    - 'chrom:start-end (CN 0/1/3/4)'

    Only the beginning of the string is checked, so a comma-separated list of
    several CNVs is True when its first entry is valid.

    Parameters:
    - adata: object exposing a pandas DataFrame as ``.obs`` that contains the
      'cnv_calls' and 'simulated_cnvs' columns (e.g. an AnnData object)

    Returns:
    - the same ``adata`` object, with the two boolean columns added to ``.obs``
    """
    # Both accepted formats share the 'chrom:start-end (' prefix, so one pattern,
    # compiled once per call instead of once per cell, covers them.
    cnv_entry = re.compile(r"([XY0-9]+):(\d+)-(\d+)\s\((loss|gain|CN [0134])\)")

    def matches_any_format(cnv_str):
        """Return True if the value, ignoring surrounding whitespace, begins with a valid CNV entry."""
        # str() keeps this safe for None / NaN / pd.NA: their text forms never match.
        return bool(cnv_entry.match(str(cnv_str).strip()))

    obs = adata.obs

    # A vectorised comparison always yields real booleans. `.apply` on a
    # categorical column can return a categorical result and propagate NaN,
    # which breaks a later `.astype(int)`.
    is_tumor = obs['cnv_calls'] == 'tumor'
    obs['formatted_cnv_calls'] = is_tumor.fillna(False).astype(bool)

    # Evaluate element by element so missing values reach the helper (and become
    # False) regardless of the column dtype; force a plain bool column.
    obs['formatted_simulated_cnvs'] = pd.Series(
        [matches_any_format(value) for value in obs['simulated_cnvs']],
        index=obs.index,
        dtype=bool,
    )

    print("First few rows of the new binary columns:")
    print(obs[['cnv_calls', 'formatted_cnv_calls', 'simulated_cnvs', 'formatted_simulated_cnvs']].head())

    return adata

# Add the 'formatted_cnv_calls' and 'formatted_simulated_cnvs' columns to adata.obs
adata = reformat_cnvs_to_binary(adata)



First few rows of the new binary columns:
                   cnv_calls formatted_cnv_calls               simulated_cnvs  \
AAACCCAAGCGCCCAT-1    normal               False                                
AAACCCAAGGTTCCGC-1    normal               False                                
AAACCCACAGAGTTGG-1    normal               False  22:19807132-29743868 (CN 0)   
AAACCCACAGGTATGG-1     tumor                True                                
AAACCCACATAGTCAC-1     tumor                True                                

                    formatted_simulated_cnvs  
AAACCCAAGCGCCCAT-1                     False  
AAACCCAAGGTTCCGC-1                     False  
AAACCCACAGAGTTGG-1                      True  
AAACCCACAGGTATGG-1                     False  
AAACCCACATAGTCAC-1                     False  


In [7]:
# Re-display the per-cell metadata to check the two newly added formatted_* columns.
adata.obs

Unnamed: 0,n_genes_by_counts,total_counts,total_counts_ribo,pct_counts_ribo,total_counts_mt,pct_counts_mt,n_genes,n_counts,cell_type,simulated_cnvs,leiden,cnv_leiden,cnv_score,cnv_calls,formatted_cnv_calls,formatted_simulated_cnvs
AAACCCAAGCGCCCAT-1,1004,1759.0,0.0,0.0,0.0,0.0,1004,1759.0,CD4 T cell,,0,6,0.007487,normal,False,False
AAACCCAAGGTTCCGC-1,4090,14227.0,0.0,0.0,0.0,0.0,4090,14227.0,Dendritic,,6,8,0.007981,normal,False,False
AAACCCACAGAGTTGG-1,1739,4205.0,0.0,0.0,0.0,0.0,1739,4205.0,CD14 monocyte,22:19807132-29743868 (CN 0),1,4,0.005639,normal,False,True
AAACCCACAGGTATGG-1,2119,4351.0,0.0,0.0,0.0,0.0,2119,4351.0,NK cell,,4,2,0.008430,tumor,True,False
AAACCCACATAGTCAC-1,1517,2815.0,0.0,0.0,0.0,0.0,1517,2815.0,B cell,,2,1,0.009757,tumor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTGCGTCGT-1,1242,2275.0,0.0,0.0,0.0,0.0,1242,2275.0,CD4 T cell,,0,2,0.008430,tumor,True,False
TTTGTTGGTGTCATGT-1,1243,2546.0,0.0,0.0,0.0,0.0,1243,2546.0,CD14 monocyte,"22:19807132-29743868 (CN 0), 6:25435484-350352...",1,0,0.009492,normal,False,True
TTTGTTGGTTTGAACC-1,1302,2462.0,0.0,0.0,0.0,0.0,1302,2462.0,CD8 T cell,,3,12,0.007620,normal,False,False
TTTGTTGTCCAAGCCG-1,1572,3086.0,0.0,0.0,0.0,0.0,1572,3086.0,CD4 T cell,X:106533974-112956833 (CN 4),0,7,0.007975,normal,False,True


In [8]:


def evaluate_cnv_predictions(adata, h5ad_filename):
    """
    Evaluate CNV predictions at the cell level by comparing predicted CNVs ('formatted_cnv_calls')
    to true CNVs ('formatted_simulated_cnvs') in the AnnData object.

    Prints the confusion-matrix counts and a classification report, and saves a
    confusion-matrix heatmap as '<h5ad base name>_confusion_matrix.png' in the
    current working directory. The figure is closed after saving, so it is not
    displayed inline.

    Parameters:
    - adata: AnnData object containing 'formatted_simulated_cnvs' and 'formatted_cnv_calls' in .obs
    - h5ad_filename: string, the filename of the h5ad file (used to create the PNG name)

    Returns:
    - dict with keys 'true_positives', 'false_positives', 'false_negatives',
      'true_negatives' (plain ints), 'classification_report' (the report text)
      and 'confusion_matrix_file' (the PNG filename)
    """
    # Extract true and predicted CNV columns (they are already boolean)
    y_true = adata.obs['formatted_simulated_cnvs'].astype(int)
    y_pred = adata.obs['formatted_cnv_calls'].astype(int)

    # Pin both classes explicitly. Without `labels`, data in which only one class
    # occurs yields a 1x1 matrix and the 4-way unpacking below raises ValueError;
    # classification_report would likewise reject the two target names.
    class_ids = [0, 1]
    labels = ['No CNV', 'CNV']

    # Confusion matrix (rows = actual, columns = predicted)
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    # Plain ints keep the returned dict free of numpy scalar types.
    tn, fp, fn, tp = (int(count) for count in cm.ravel())

    print("Confusion Matrix:")
    print(f"True Positives: {tp}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Negatives: {tn}")
    print()

    # Classification report
    report = classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=labels,
        zero_division=0,
    )
    print("Classification Report:")
    print(report)

    # Create dynamic filename
    base_name = os.path.splitext(os.path.basename(h5ad_filename))[0]
    png_filename = f"{base_name}_confusion_matrix.png"

    # Plot confusion matrix heatmap on explicit figure/axes handles
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=labels, yticklabels=labels, ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_title('Confusion Matrix')

        # Save plot
        fig.tight_layout()
        fig.savefig(png_filename, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    print(f"Confusion matrix saved as {png_filename}")

    # Return results as a dictionary
    return {
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'true_negatives': tn,
        'classification_report': report,
        'confusion_matrix_file': png_filename
    }


In [9]:
# Cell-level evaluation of the CNV calls against the simulated ground truth;
# prints the metrics, writes the heatmap PNG and keeps the returned metrics dict.
results_cell = evaluate_cnv_predictions(adata, h5ad_filename)

Confusion Matrix:
True Positives: 765
False Positives: 2560
False Negatives: 3232
True Negatives: 3752

Classification Report:
              precision    recall  f1-score   support

      No CNV       0.54      0.59      0.56      6312
         CNV       0.23      0.19      0.21      3997

    accuracy                           0.44     10309
   macro avg       0.38      0.39      0.39     10309
weighted avg       0.42      0.44      0.43     10309

Confusion matrix saved as infer_CNV_final_calls_ta_confusion_matrix.png
