In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
import gc
import anndata as ad
import sys
import traceback
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from glob import glob
from pathlib import Path
import infercnvpy as cnv
import genomic_features as gf
import traceback
from biomart import BiomartServer
import mygene
from scipy.stats import zscore


In [None]:
# Load the integrated single-cell AnnData object that every later cell operates on.
adata_sc = sc.read_h5ad('../single_cell_int/adata_sc_int_outlier_genes.h5ad')

In [None]:
# Distinct values of obs['batch_covariate'] as a plain list; the per-dataset
# inferCNV loop further down iterates over these.
datasets = adata_sc.obs.batch_covariate.unique().tolist()

In [None]:
# Copy the embedding stored under 'X_umap_int' to the 'X_umap' key so that
# sc.pl.umap plots it, then colour the cells by their Level_1 annotation.
adata_sc.obsm['X_umap'] = adata_sc.obsm['X_umap_int']
sc.pl.umap(adata_sc, color='Level_1', frameon=False)

In [None]:
# Sanity check on a 10% subsample of cells: the 'raw' layer should contain
# integer counts. Reports whether all values are integral and the value range.
subset = sc.pp.subsample(adata_sc, copy=True, fraction=0.1)
raw_counts = subset.layers['raw'].toarray()

counts_are_integral = np.all(raw_counts.astype(int) == raw_counts)
lowest_count = np.min(raw_counts)
highest_count = np.max(raw_counts)

print(f"Are raw counts integers? {counts_are_integral}")
print(f"Range of raw counts: {lowest_count} to {highest_count}")
print("-" * 50)


In [None]:
# Index the genes by the values of var['gene_name'].
# NOTE(review): nothing here de-duplicates the names — confirm gene_name is unique.
adata_sc.var_names = adata_sc.var.gene_name

In [None]:
# ComBat batch correction using obs['batch_covariate'] as the batch key.
# NOTE(review): with scanpy's default in-place behaviour this presumably overwrites
# adata_sc.X, so later calls that do not pass `layer=` would read the corrected
# values — confirm that is intended.
sc.pp.combat(adata_sc, key='batch_covariate')

In [None]:
# Run inferCNV separately for every single-cell dataset listed in `datasets`.
#
# For each dataset the loop:
#   1. subsets adata_sc on obs['batch_covariate'] and drops genes seen in < 5 cells,
#   2. labels cells as 'Potentially Malignant' (Level_1 in `malignant_cells`) or 'Reference',
#   3. computes a transcriptomic PCA / neighbors / UMAP / Leiden,
#   4. runs infercnvpy (inference, CNV-space PCA / neighbors / Leiden / UMAP, cnv_score),
#   5. writes ../inferCNV/<dataset>/PDAC_<dataset>_inferCNV.h5ad and saves summary figures.
#
# Error handling: each dataset has its own try/except, so a failure is reported with a
# traceback and the loop continues with the next dataset instead of aborting all of them.
# Figures are closed after every dataset so they do not accumulate in memory.
#
# NOTE(review): cnv.tl.infercnv is called without `layer=`, so it reads adata_temp.X —
# confirm X holds the expression values you want inferCNV to use.

# Level_1 labels treated as candidate malignant populations; everything else is reference.
malignant_cells = ["Acinar Cell", "Ductal Cell", "Ductal Cell/Malignant"]

for dataset in datasets:
    print(f'\033[92mRunning on dataset {dataset}\033[0m')
    try:
        # Subset the data
        adata_temp = adata_sc[adata_sc.obs['batch_covariate'] == dataset].copy()
        print(f"Filtering genes in dataset {dataset} with minimum 5 cells")
        sc.pp.filter_genes(adata_temp, min_cells=5)
        adata_temp.obs["Reference"] = np.where(
            adata_temp.obs.Level_1.isin(malignant_cells), 'Potentially Malignant', 'Reference'
        )

        # Transcriptomic embedding. PCA uses the 'log_norm' layer (as the single-nucleus
        # loop in this notebook does) rather than raw counts, where a few highly
        # expressed genes would dominate the components.
        print(f"Running PCA, neighbors, UMAP, and Leiden clustering for dataset {dataset}")
        sc.pp.pca(adata_temp, layer='log_norm')
        sc.pp.neighbors(adata_temp)
        sc.tl.umap(adata_temp)
        sc.tl.leiden(adata_temp)

        # Run inferCNV
        print(f'infering CNVs for {dataset}')
        cnv.tl.infercnv(
            adata_temp,
            reference_key="Reference",
            reference_cat="Reference",
            window_size=150,
        )

        # PCA and neighbor graph for CNV data
        print(f"Running PCA, neighbors, Leiden clustering for CNV data in dataset {dataset}")
        cnv.tl.pca(adata_temp)
        cnv.pp.neighbors(adata_temp)
        cnv.tl.leiden(adata_temp)

        # Run UMAP and CNV scoring
        print(f"Running UMAP and CNV scoring for dataset {dataset}")
        cnv.tl.umap(adata_temp)
        cnv.tl.cnv_score(adata_temp)

        # Save results
        print(f"Saving results for dataset {dataset}")
        output_dir = f"../inferCNV/{dataset}"
        os.makedirs(output_dir, exist_ok=True)
        adata_temp.write(f"{output_dir}/PDAC_{dataset}_inferCNV.h5ad")

        # Save chromosome heatmaps (file location is decided by the plotting library's `save=`)
        print('Saving Images')
        cnv.pl.chromosome_heatmap(adata_temp, groupby="Level_1", save=f"{dataset}_chromosome_heatmap_labels_inferCNV.png")
        cnv.pl.chromosome_heatmap(adata_temp, groupby="cnv_leiden", dendrogram=True, save=f"{dataset}_chromosome_heatmap_cnvleiden_inferCNV.png")

        # Combined CNV-space UMAP panel. Best effort: a plotting failure here is
        # reported but does not stop the remaining outputs for this dataset.
        try:
            fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(11, 11))
            ax4.axis("off")
            # Shuffle cell order (fixed seed) so no group is systematically drawn on top.
            np.random.seed(0)
            random_indices = np.random.permutation(range(adata_temp.shape[0]))
            cnv.pl.umap(adata_temp[random_indices,:], color="cnv_leiden", ax=ax1, show=False, size=5)
            cnv.pl.umap(adata_temp[random_indices,:], color="cnv_score", ax=ax2, show=False, size=5)
            cnv.pl.umap(adata_temp[random_indices,:], color="Level_1", ax=ax3, size=5)
            # Saved relative to the current working directory.
            fig.savefig(f"{dataset}_combined_umap.png")
        except Exception as e:
            print(f"An error occurred while saving UMAP plots for {dataset}: {e}")

        # Same panel on the transcriptomic UMAP.
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 11), gridspec_kw={"wspace": 0.5})
        ax4.axis("off")
        np.random.seed(0)
        random_indices = np.random.permutation(list(range(adata_temp.shape[0])))
        sc.pl.umap(adata_temp[random_indices,:], color="cnv_leiden", ax=ax1, show=False, size=5)
        sc.pl.umap(adata_temp[random_indices,:], color="cnv_score", ax=ax2, show=False, size=5)
        sc.pl.umap(adata_temp[random_indices,:], color="Level_1", ax=ax3, size=5)
        fig.savefig(f"../inferCNV/{dataset}_combined_umap_transcriptomic.png")
        print(f'\033[91mCompleted inferring CNVs for {dataset}\033[0m')
    except Exception:
        # Report and move on: one broken dataset must not cancel the others.
        print(f"An error occurred in loop: {dataset}")
        traceback.print_exc()
    finally:
        # Release figures created for this dataset.
        plt.close('all')

In [None]:
# Run inferCNV on the full object. Cells whose obs['batch_covariate'] equals
# "Peng_Normal_scRNA-seq" are the reference; values come from the 'raw' layer and
# window_size is 500.
# NOTE(review): the 'raw' layer is checked above to be integer counts; infercnvpy's
# documentation describes log-normalized input — confirm raw counts are intended here.
cnv.tl.infercnv(
    adata_sc,
    reference_key="batch_covariate",
    reference_cat="Peng_Normal_scRNA-seq",
    window_size=500, layer='raw'
)

In [None]:
# Neighbor graph, Leiden clustering, UMAP and cnv_score for the full object.
# NOTE(review): use_rep='scpoli' builds the "CNV" neighbor graph from a representation
# named 'scpoli' instead of a CNV PCA (cnv.tl.pca is not called on adata_sc in the
# visible cells). The resulting cnv_leiden / CNV UMAP may therefore not reflect CNV
# profiles — confirm this is intended.
cnv.pp.neighbors(adata_sc, use_rep='scpoli')
cnv.tl.leiden(adata_sc)
cnv.tl.umap(adata_sc)
cnv.tl.cnv_score(adata_sc)

In [None]:
# Chromosome heatmaps of the inferred CNV profile, once grouped by cell-type label
# and once by CNV Leiden cluster. (Pass `save=...` / `dendrogram=True` to the call
# if the figures should be written to disk or clustered.)
for heatmap_grouping in ("Level_1", "cnv_leiden"):
    cnv.pl.chromosome_heatmap(adata_sc, groupby=heatmap_grouping)


In [None]:
# Same heatmap grouped by dataset, to see whether CNV patterns track batch.
cnv.pl.chromosome_heatmap(adata_sc, groupby="batch_covariate")

In [None]:
# Save the full object with the inferCNV results computed so far.
adata_sc.write('../inferCNV/adata_sc_inferCNV.h5ad')

In [None]:
# 2x2 panel of CNV-space UMAPs for the full object: CNV Leiden clusters, CNV score
# and Level_1 labels (fourth panel left empty). Cells are drawn in a seeded random
# order so that no group is systematically plotted on top.
np.random.seed(0)
random_indices = np.random.permutation(range(adata_sc.shape[0]))

fig, axes_grid = plt.subplots(2, 2, figsize=(16, 11), gridspec_kw={"wspace": 0.75})
(ax1, ax2), (ax3, ax4) = axes_grid
ax4.axis("off")

for panel_color, panel_ax in (("cnv_leiden", ax1), ("cnv_score", ax2)):
    cnv.pl.umap(adata_sc[random_indices,:], color=panel_color, ax=panel_ax, show=False, size=5)
# Last call is made without show=False, so it also renders the figure.
cnv.pl.umap(adata_sc[random_indices,:], color="Level_1", ax=ax3, size=5)

In [None]:
# 2x2 panel on the transcriptomic UMAP (sc.pl.umap): CNV Leiden clusters, CNV score
# and Level_1 labels (fourth panel left empty), with cells drawn in a seeded random order.
np.random.seed(0)
random_indices = np.random.permutation(list(range(adata_sc.shape[0])))

fig, axes_grid = plt.subplots(2, 2, figsize=(16, 11), gridspec_kw={"wspace": 0.75})
(ax1, ax2), (ax3, ax4) = axes_grid
ax4.axis("off")

for panel_color, panel_ax in (("cnv_leiden", ax1), ("cnv_score", ax2)):
    sc.pl.umap(adata_sc[random_indices,:], color=panel_color, ax=panel_ax, show=False, size=5)
# Last call is made without show=False, so it also renders the figure.
sc.pl.umap(adata_sc[random_indices,:], color="Level_1", ax=ax3, size=5)

In [None]:
# NOTE(review): this cell is a line-for-line duplicate of the previous cell
# (transcriptomic UMAP panel coloured by cnv_leiden, cnv_score and Level_1);
# it can probably be removed.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 11), gridspec_kw={"wspace": 0.75})
ax4.axis("off")
# Seeded shuffle of the cell order so plotting order is reproducible.
np.random.seed(0)
random_indices = np.random.permutation(list(range(adata_sc.shape[0])))
sc.pl.umap(adata_sc[random_indices,:], color="cnv_leiden", ax=ax1, show=False, size=5)
sc.pl.umap(adata_sc[random_indices,:], color="cnv_score", ax=ax2, show=False, size=5)
sc.pl.umap(adata_sc[random_indices,:], color="Level_1", ax=ax3, size=5)
# fig.savefig(f"../inferCNV/{dataset}_combined_umap_transcriptomic.png")

In [None]:
# Flag ductal populations as 'Potentially Malignant'; every other cell becomes the
# inferCNV 'Reference'. (Unlike the per-dataset loop, acinar cells are NOT included here.)
malignant_cells = ["Ductal Cell", "Ductal Cell/Malignant"]
is_potentially_malignant = adata_sc.obs["Level_1"].isin(malignant_cells)
adata_sc.obs["Reference"] = np.where(is_potentially_malignant, 'Potentially Malignant', 'Reference')

In [None]:
# inferCNV on the 'log_norm' layer with the 'Reference' cells as baseline and a
# window of 1000, followed by heatmaps grouped by cell type and by dataset.
cnv.tl.infercnv(
    adata_sc,
    layer='log_norm',
    window_size=1000,
    reference_key="Reference",
    reference_cat="Reference",
)

for heatmap_grouping in ("Level_1", "batch_covariate"):
    cnv.pl.chromosome_heatmap(adata_sc, groupby=heatmap_grouping)


In [None]:
# Same run with a much smaller window (50) to compare the resolution of the CNV
# profile; heatmaps grouped by cell type and by dataset.
cnv.tl.infercnv(
    adata_sc,
    layer='log_norm',
    window_size=50,
    reference_key="Reference",
    reference_cat="Reference",
)

for heatmap_grouping in ("Level_1", "batch_covariate"):
    cnv.pl.chromosome_heatmap(adata_sc, groupby=heatmap_grouping)

In [None]:
# inferCNV on the 'log_norm' layer using the "Peng_Normal_scRNA-seq" dataset as the
# reference (window of 150); heatmaps grouped by cell type and by dataset.
cnv.tl.infercnv(
    adata_sc,
    layer='log_norm',
    window_size=150,
    reference_key="batch_covariate",
    reference_cat="Peng_Normal_scRNA-seq",
)

for heatmap_grouping in ("Level_1", "batch_covariate"):
    cnv.pl.chromosome_heatmap(adata_sc, groupby=heatmap_grouping)

In [None]:
# Heatmap restricted to the cells labelled 'Potentially Malignant' in obs['Reference'].
cnv.pl.chromosome_heatmap(adata_sc[adata_sc.obs.Reference == 'Potentially Malignant'], groupby="Level_1") 

In [None]:
# Root directory of the per-dataset inferCNV outputs.
# NOTE(review): this variable is not referenced by any visible cell.
infercnv_dir = '../inferCNV/'

In [None]:
import glob

In [None]:
# Collect the per-dataset inferCNV result files (one .h5ad per dataset sub-directory).
#
# The glob module is imported here under a private alias because the bare name `glob`
# is ambiguous in this notebook: the first cell binds it to the function
# (`from glob import glob`) and a later cell rebinds it to the module (`import glob`),
# which made the original `glob(...)` call fail with
# "TypeError: 'module' object is not callable" on a top-to-bottom run.
import glob as _glob_module

# Sorted so the processing order is deterministic across runs and file systems.
files = sorted(_glob_module.glob('../inferCNV/*/*.h5ad'))

In [None]:
# Load every per-dataset inferCNV result, keyed by its file path.
adatas = {}
for h5ad_path in files:
    print(h5ad_path)
    adatas[h5ad_path] = sc.read_h5ad(h5ad_path)

In [None]:
# Per-dataset transcriptomic UMAPs coloured by CNV score, cell type and CNV cluster.
umap_colors = ['cnv_score', 'Level_1', 'cnv_leiden']
for h5ad_path, adata_dataset in adatas.items():
    print(h5ad_path)
    sc.pl.umap(adata_dataset, color=umap_colors, frameon=False, wspace=0.5)

In [None]:
# Gather the per-dataset results into one lookup:
#   Dataset_Barcode -> (cnv_score, Dataset)
# and print the total number of cells across all loaded datasets.
dict_cnv_score = {}
total_cells = []
for h5ad_path, adata_dataset in adatas.items():
    print(h5ad_path)
    total_cells.append(adata_dataset.shape[0])
    dataset_obs = adata_dataset.obs
    score_and_dataset = zip(dataset_obs['cnv_score'], dataset_obs.Dataset)
    dict_cnv_score.update(zip(dataset_obs.Dataset_Barcode, score_and_dataset))
print(sum(total_cells))

In [None]:
# One row per barcode; column 0 is the cnv_score and column 1 the Dataset
# (the two elements of the tuples stored in dict_cnv_score).
df_cnv_scores = pd.DataFrame.from_dict(dict_cnv_score, orient='index')

# z-score the CNV score within each dataset so scores are comparable across datasets.
scores_by_dataset = df_cnv_scores.groupby(1)[0]
df_cnv_scores['cnv_score_normalized'] = scores_by_dataset.transform(zscore)

# Attach the normalized score to the full object via the shared barcode key.
barcode_to_zscore = dict(zip(df_cnv_scores.index, df_cnv_scores['cnv_score_normalized']))
adata_sc.obs['cnv_score_znormalized'] = adata_sc.obs['Dataset_Barcode'].map(barcode_to_zscore)

In [None]:
# UMAP of the per-dataset z-scored CNV score next to the cell-type labels;
# vmax=5 caps the colour scale.
sc.pl.umap(adata_sc, color=['cnv_score_znormalized', 'Level_1'], frameon=False, vmax=5, size=5)

In [None]:
# Per-dataset UMAPs (CNV score, cell type, CNV cluster), shown again right before
# the per-dataset cutoffs are defined in the next cell.
# NOTE(review): same code as the earlier per-dataset plotting cell.
for i in adatas:
    print(i)
    sc.pl.umap(adatas[i], color=['cnv_score', 'Level_1', 'cnv_leiden'], frameon=False, wspace=0.5)

In [None]:
# Hard-coded cnv_score cutoff per dataset. The next cell flags a cell as malignant
# when its cnv_score is strictly greater than its dataset's cutoff.
_cutoff_by_dataset = {
    'Simeone_scRNA-seq': 0.05,
    'Steele_scRNA-seq': 0.06,
    'Zhang_scRNA-seq': 0.04,
    'Peng_Normal_scRNA-seq': 0.04,
    'Lin_scRNA-seq': 0.030,
    'Ding_scRNA-seq': 0.020,
    'Schlesinger_scRNA-seq': 0.04,
    'Peng_scRNA-seq': 0.05,
    'Caronni_scRNA-seq': 0.05,
    'Lee_scRNA-seq': 0.05,
    'Steele_Adj_Norm_scRNA-seq': 0.04,
}

# Keys are the result-file paths, because `adatas` is keyed by file path.
cutoff_values = {
    f'../inferCNV/{dataset_label}/PDAC_{dataset_label}_inferCNV.h5ad': cutoff
    for dataset_label, cutoff in _cutoff_by_dataset.items()
}

In [None]:
# Flag cells as malignant per dataset: cnv_score strictly above the dataset's cutoff.
# Builds Dataset_Barcode -> bool in dict_infercnv_score_malignant and also stores the
# flag in each per-dataset object's obs['infercnv_score_malignant'].
dict_infercnv_score_malignant = {}
for dataset_name, adata in adatas.items():
    # `adatas` holds every .h5ad found under ../inferCNV/*/, which can include result
    # files without a cutoff (e.g. the single-nucleus outputs this notebook writes to
    # the same directory tree). Skip those explicitly instead of dying on a KeyError.
    cutoff = cutoff_values.get(dataset_name)
    if cutoff is None:
        print(f"No cnv_score cutoff defined for {dataset_name}; skipping")
        continue
    adata.obs['infercnv_score_malignant'] = adata.obs['cnv_score'] > cutoff
    dict_infercnv_score_malignant.update(
        zip(adata.obs.Dataset_Barcode, adata.obs['infercnv_score_malignant'])
    )

In [None]:
# Transfer the per-dataset malignant flag to the full object via Dataset_Barcode;
# barcodes missing from the lookup get NaN.
adata_sc.obs['infercnv_score_malignant'] = adata_sc.obs.Dataset_Barcode.map(dict_infercnv_score_malignant)

In [None]:
# Turn the boolean flag into readable labels.
# The mapping falls back to the existing value, so the cell is idempotent: running it
# a second time leaves 'Malignant' / 'Non-Malignant' (and NaN) untouched, whereas a
# plain dict `.map` would turn the already-relabelled strings into NaN.
malignancy_labels = {True: 'Malignant', False: 'Non-Malignant'}
adata_sc.obs['infercnv_score_malignant'] = adata_sc.obs['infercnv_score_malignant'].map(
    lambda flag: malignancy_labels.get(flag, flag)
)

In [None]:
# UMAP of the malignant call next to the cell-type labels, with cells drawn in random
# order. The RNG is seeded, as in the other shuffled-plot cells of this notebook, so the
# drawing order (and therefore the figure) is reproducible; `random_indices` is reused
# by the following plotting cells.
np.random.seed(0)
random_indices = np.random.permutation(list(range(adata_sc.shape[0])))
sc.pl.umap(
    adata_sc[random_indices, :], color=['infercnv_score_malignant', 'Level_1'], frameon=False, vmax=5, size=5)

In [None]:
# Same UMAP, highlighting only the 'Malignant' group. Reuses `random_indices`
# from the previous cell.
sc.pl.umap(
    adata_sc[random_indices, :], color=['infercnv_score_malignant'], groups='Malignant', frameon=False, vmax=5, size=5)

In [None]:
# Refine the malignant call: a cell stays 'Malignant' only if it passed the CNV-score
# cutoff AND its Level_1 label is one of the candidate cell types; all other cells
# (including those without a call) become 'Non-Malignant'.
malignant_cell_types = ['Acinar Cell', 'Ductal Cell', 'Ductal Cell/Malignant']

passed_cnv_cutoff = adata_sc.obs['infercnv_score_malignant'] == 'Malignant'
is_candidate_cell_type = adata_sc.obs['Level_1'].isin(malignant_cell_types)

adata_sc.obs['infercnv_score_malignant_refined'] = np.where(
    passed_cnv_cutoff & is_candidate_cell_type, 'Malignant', 'Non-Malignant'
)

In [None]:
# UMAP highlighting the refined 'Malignant' group. Reuses `random_indices`
# defined in an earlier plotting cell.
sc.pl.umap(
    adata_sc[random_indices, :], color=['infercnv_score_malignant_refined'], groups='Malignant', frameon=False, vmax=5, size=5)

In [None]:
# Lookup of the un-normalized per-dataset CNV score: Dataset_Barcode -> cnv_score.
dict_infercnv_score = {}
for dataset_name, adata in adatas.items():
    dataset_obs = adata.obs
    dict_infercnv_score.update(zip(dataset_obs.Dataset_Barcode, dataset_obs['cnv_score']))

In [None]:
# Attach the un-normalized per-dataset CNV score to the full object via Dataset_Barcode.
adata_sc.obs['cnv_score_abs'] = adata_sc.obs.Dataset_Barcode.map(dict_infercnv_score)

In [None]:
# Save the single-cell object, now carrying the CNV-derived obs columns added above.
adata_sc.write('../single_cell_int/adata_sc_int_cnv.h5ad')

# Single-nucleus data: the cells below repeat the per-dataset inferCNV workflow on it.

In [None]:
# Load the integrated single-nucleus AnnData object.
adata_sn = sc.read_h5ad('../single_nuc_int/adata_nuc_int_outlier_genes.h5ad')

In [None]:
# Distinct values of obs['Dataset'] in the single-nucleus object.
# NOTE: this rebinds `datasets`, which earlier held the single-cell batch names.
datasets = adata_sn.obs.Dataset.unique()

In [None]:
# Use the 'scpoli_labels' column as the Level_1 annotation, so the code below can
# refer to obs['Level_1'] just like the single-cell cells do.
adata_sn.obs['Level_1'] = adata_sn.obs['scpoli_labels'].copy()

In [None]:
# Overview UMAP of the single-nucleus data coloured by cell type and by dataset.
sc.pl.umap(adata_sn, color=['Level_1', 'Dataset'], frameon=False, wspace=0.5)

In [None]:
# Run inferCNV separately for every single-nucleus dataset listed in `datasets`.
#
# For each dataset the loop:
#   1. subsets adata_sn on obs['Dataset'] and drops genes seen in < 5 cells,
#   2. labels cells as 'Potentially Malignant' (Level_1 in `malignant_cells`) or 'Reference',
#   3. computes a transcriptomic PCA (on the 'log_norm' layer) / neighbors / UMAP / Leiden,
#   4. runs infercnvpy (inference, CNV-space PCA / neighbors / Leiden / UMAP, cnv_score),
#   5. writes ../inferCNV/<dataset>/PDAC_<dataset>_inferCNV.h5ad and saves summary figures.
#
# Error handling: each dataset has its own try/except, so a failure is reported with a
# traceback and the loop continues with the next dataset instead of aborting all of them.
# Figures are closed after every dataset so they do not accumulate in memory.
#
# NOTE(review): outputs go to the same ../inferCNV/<dataset>/ tree as the single-cell
# run — confirm the dataset names cannot collide with the single-cell ones.
# NOTE(review): cnv.tl.infercnv is called without `layer=`, so it reads adata_temp.X —
# confirm X holds the expression values you want inferCNV to use.

# Level_1 labels treated as candidate malignant populations; everything else is reference.
malignant_cells = ["Acinar Cell", "Ductal Cell", "Malignant"]

for dataset in datasets:
    print(f'\033[92mRunning on dataset {dataset}\033[0m')
    try:
        # Subset the data
        adata_temp = adata_sn[adata_sn.obs['Dataset'] == dataset].copy()
        print(f"Filtering genes in dataset {dataset} with minimum 5 cells")
        sc.pp.filter_genes(adata_temp, min_cells=5)
        adata_temp.obs["Reference"] = np.where(
            adata_temp.obs.Level_1.isin(malignant_cells), 'Potentially Malignant', 'Reference'
        )

        # Transcriptomic embedding
        print(f"Running PCA, neighbors, UMAP, and Leiden clustering for dataset {dataset}")
        sc.pp.pca(adata_temp, layer='log_norm')
        sc.pp.neighbors(adata_temp)
        sc.tl.umap(adata_temp)
        sc.tl.leiden(adata_temp)

        # Run inferCNV
        print(f'infering CNVs for {dataset}')
        cnv.tl.infercnv(
            adata_temp,
            reference_key="Reference",
            reference_cat="Reference",
            window_size=150,
        )

        # PCA and neighbor graph for CNV data
        print(f"Running PCA, neighbors, Leiden clustering for CNV data in dataset {dataset}")
        cnv.tl.pca(adata_temp)
        cnv.pp.neighbors(adata_temp)
        cnv.tl.leiden(adata_temp)

        # Run UMAP and CNV scoring
        print(f"Running UMAP and CNV scoring for dataset {dataset}")
        cnv.tl.umap(adata_temp)
        cnv.tl.cnv_score(adata_temp)

        # Save results
        print(f"Saving results for dataset {dataset}")
        output_dir = f"../inferCNV/{dataset}"
        os.makedirs(output_dir, exist_ok=True)
        adata_temp.write(f"{output_dir}/PDAC_{dataset}_inferCNV.h5ad")

        # Save chromosome heatmaps (file location is decided by the plotting library's `save=`)
        print('Saving Images')
        cnv.pl.chromosome_heatmap(adata_temp, groupby="Level_1", save=f"{dataset}_chromosome_heatmap_labels_inferCNV.png")
        cnv.pl.chromosome_heatmap(adata_temp, groupby="cnv_leiden", dendrogram=True, save=f"{dataset}_chromosome_heatmap_cnvleiden_inferCNV.png")

        # Combined CNV-space UMAP panel. Best effort: a plotting failure here is
        # reported but does not stop the remaining outputs for this dataset.
        try:
            fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(11, 11))
            ax4.axis("off")
            # Shuffle cell order (fixed seed) so no group is systematically drawn on top.
            np.random.seed(0)
            random_indices = np.random.permutation(range(adata_temp.shape[0]))
            cnv.pl.umap(adata_temp[random_indices,:], color="cnv_leiden", ax=ax1, show=False, size=5)
            cnv.pl.umap(adata_temp[random_indices,:], color="cnv_score", ax=ax2, show=False, size=5)
            cnv.pl.umap(adata_temp[random_indices,:], color="Level_1", ax=ax3, size=5)
            # Saved relative to the current working directory.
            fig.savefig(f"{dataset}_combined_umap.png")
        except Exception as e:
            print(f"An error occurred while saving UMAP plots for {dataset}: {e}")

        # Same panel on the transcriptomic UMAP.
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 11), gridspec_kw={"wspace": 0.5})
        ax4.axis("off")
        np.random.seed(0)
        random_indices = np.random.permutation(list(range(adata_temp.shape[0])))
        sc.pl.umap(adata_temp[random_indices,:], color="cnv_leiden", ax=ax1, show=False, size=5)
        sc.pl.umap(adata_temp[random_indices,:], color="cnv_score", ax=ax2, show=False, size=5)
        sc.pl.umap(adata_temp[random_indices,:], color="Level_1", ax=ax3, size=5)
        fig.savefig(f"../inferCNV/{dataset}_combined_umap_transcriptomic.png")
        print(f'\033[91mCompleted inferring CNVs for {dataset}\033[0m')
    except Exception:
        # Report and move on: one broken dataset must not cancel the others.
        print(f"An error occurred in loop: {dataset}")
        traceback.print_exc()
    finally:
        # Release figures created for this dataset.
        plt.close('all')