In [None]:
import snapatac2 as snap
snap.__version__

import sys,os
import pandas as pd
import subprocess
import numpy as np
from tqdm import tqdm
from IPython.display import HTML
import scanpy as sc
import seaborn as sns

import anndata as ad
import networkx as nx
#import scvelo as scv
import scglue
from matplotlib import rcParams

* Create the cell by gene activity matrix


### import fragment data

# Load the scDblFinder results (first CSV column becomes the index) and
# collect the index labels of all rows whose class is 'doublet'.
doublet = pd.read_csv(
    '/media/AnalysisTempDisk/HIV/03QC_data/all-QC-scDblFinder-0722-new.csv',
    index_col=0,
)
doubletList = doublet[doublet['scDblFinder.class'].eq('doublet')].index.tolist()
doubletList

# Load the per-cell QC metadata (first CSV column becomes the index).
meta_df = pd.read_csv(
    '/media/AnalysisTempDisk/HIV/03QC_data/all-QC-meta-0722-new.csv',
    index_col=0,
)
meta_df

# Keep only the metadata rows whose index label is not a called doublet.
meta_df = meta_df.loc[~meta_df.index.isin(doubletList)]

meta_df

# Create h5ad files using SnapATAC2

## prepare metadata 

# Keep the cells that satisfy every QC threshold (all bounds are strict):
#   1000 < uniqueFrags < 30000
#   MitoProportion < 0.2
#   FRiP > 0.6
#   5 < TSS.enrichment < 12
meta_df3 = meta_df.loc[
    meta_df['uniqueFrags'].gt(1000)
    & meta_df['uniqueFrags'].lt(30000)
    & meta_df['MitoProportion'].lt(0.2)
    & meta_df['FRiP'].gt(0.6)
    & meta_df['TSS.enrichment'].gt(5)
    & meta_df['TSS.enrichment'].lt(12)
]
meta_df3.shape

# Violin plots of QC metrics for the cells that passed filtering, one panel
# per metric.
# `plt` is not imported anywhere in the visible import cell, so bring it into
# scope here; without this the cell fails with a NameError.
import matplotlib.pyplot as plt

# Create exactly one panel per plotted metric (the original requested three
# panels but only drew two, leaving an empty axis).
qc_metrics = ['FRiP', 'TSS.enrichment']
fig, axes = plt.subplots(nrows=1, ncols=len(qc_metrics), figsize=(8, 4))
for ax, metric in zip(axes, qc_metrics):
    sns.violinplot(ax=ax, data=meta_df3[metric])
    ax.set_title(metric)
plt.tight_layout()
plt.show()

## Create per-sample AnnData (.h5ad) files

# Index labels of the cells that passed QC filtering; used further down as
# the `whitelist` for the fragment import.
cellPass = meta_df3.index.tolist()

len(cellPass)

cellPass

# Directory that is scanned for fragment files, and directory that receives
# the generated .h5ad files (created if missing).
data_dir = "/media/AnalysisTempDisk/001HIV/05_Fragments/722"
output_dir = "/media/AnalysisTempDisk/001HIV/05_Fragments/722-new"
os.makedirs(output_dir, exist_ok=True)
# os.listdir returns entries in arbitrary order; sort them so the sample
# order (and everything derived from it) is reproducible between runs.
fragment_files = [
    f'{data_dir}/{fl}'
    for fl in sorted(os.listdir(data_dir))
    if fl.endswith(".tsv.gz")
]

len(fragment_files)

# One output path per fragment file:
# <output_dir>/<basename text before ".tsv.gz">.h5ad
outputs = [
    f"{output_dir}/{path.rsplit('/', 1)[-1].partition('.tsv.gz')[0]}.h5ad"
    for path in fragment_files
]
len(outputs)

# Sample labels: basename text before ".fragments.tsv.gz".
sample_names = [
    path.rsplit('/', 1)[-1].partition('.fragments.tsv.gz')[0]
    for path in fragment_files
]
len(sample_names)

fragment_files

# Save the QC-passing cell labels as a one-column CSV (column "cellPass",
# no index column).
# NOTE(review): this path lives under ".../HIV/05_Fragments/722-new-18",
# which is not the `output_dir` used for the h5ad files — confirm the
# directory exists and is the intended destination.
df = pd.DataFrame(cellPass, columns=['cellPass'])
df.to_csv(
    '/media/AnalysisTempDisk/HIV/05_Fragments/722-new-18/cellPass.csv',
    index=False,
)

%%time
# Import all fragment files in one call. `outputs` supplies one target file
# per input, `cellPass` restricts the import to QC-passing cells, and hg38
# provides the chromosome sizes.
adatas = snap.pp.import_data(
    fragment_files,
    chrom_sizes=snap.genome.hg38,
    whitelist=cellPass,
    min_num_fragments=500,
    sorted_by_barcode=False,
    file=outputs,
)

len(adatas)

### Add bin-matrix ###
# Add a tile (bin) matrix to every imported sample using bin_size=5000 ...
snap.pp.add_tile_matrix(adatas, bin_size=5000)
# ... then run feature selection on each sample, keeping n_features=50000.
snap.pp.select_features(adatas, n_features=50000)

# reloading adatas

# Re-open the per-sample .h5ad files written earlier.
# NOTE(review): if the objects returned by the import step are still open in
# this session, re-opening the same files may fail — confirm they were closed
# (or the kernel restarted) before running this cell.
data_dir = '/media/AnalysisTempDisk/001HIV/05_Fragments/722-new'
# os.listdir returns entries in arbitrary order; sort so that the order of
# `adatas` (and of the samples in the merged dataset) is reproducible.
h5ad_files = [
    f'{data_dir}/{fl}'
    for fl in sorted(os.listdir(data_dir))
    if fl.endswith(".h5ad")
]

len(h5ad_files)

adatas = [snap.read(i) for i in h5ad_files]

len(adatas)

%%time
# Combine the per-sample objects into one AnnDataSet. Each member is keyed by
# its file's basename text before ".h5ad".
data = snap.AnnDataSet(
    adatas=[
        (adata.filename.rsplit('/', 1)[-1].partition('.h5ad')[0], adata)
        for adata in adatas
    ],
    filename="/media/AnalysisTempDisk/HIV/output/304-new723.h5ads",
)
data

# Sanity check: total number of cells vs. number of distinct obs names.
print(f'Number of cells: {data.n_obs}')
print(f'Number of unique barcodes: {np.unique(data.obs_names).size}')

# Dimension reduction

%%time
# Feature selection on the merged dataset (n_features=200000), followed by
# the spectral embedding.
snap.pp.select_features(data, n_features=200000)
snap.tl.spectral(data)

# Batch correction

%%time
# Harmony batch correction using the "sample" column as the batch label.
# The result is stored under the key 'X_spectral_harmony', which the UMAP
# and kNN steps read via use_rep.
snap.pp.harmony(
    data,
    batch="sample",
    key_added='X_spectral_harmony',
    max_iter_harmony=20,
)

%%time
# UMAP embedding and kNN graph, both computed from the "X_spectral_harmony"
# representation.
snap.tl.umap(data, use_rep="X_spectral_harmony")
snap.pp.knn(data, use_rep="X_spectral_harmony")
# Leiden clustering; later cells read the labels from data.obs['leiden'].
snap.tl.leiden(data)

from collections import Counter

# UMAP coloured by Leiden cluster, drawn twice with different marker settings
# (marker_opacity=1 vs. marker_size=1), then coloured by sample.
# The original notebook repeated this whole group of statements verbatim a
# second time; the duplicate produced identical output and has been removed.
snap.pl.umap(data, color="leiden", height=600, marker_opacity=1, interactive=False, show=True, scale=5)

snap.pl.umap(data, color="leiden", height=600, marker_size=1, interactive=False, show=True, scale=5)

snap.pl.umap(data, color="sample", width=800, height=600, marker_size=0.6, interactive=False, show=True, scale=1)

# Cluster-by-sample contingency table of cell counts.
cross_df = pd.crosstab(data.obs['leiden'], data.obs['sample'])
cross_df.shape

# Number of cells per Leiden cluster.
Counter(data.obs['leiden'])

data

# gene_matrix

%%time
# Build the gene matrix from the merged dataset, using snapatac2's hg38
# genome object as the second argument.
gene_matrix = snap.pp.make_gene_matrix(data, snap.genome.hg38)
gene_matrix

%%time
# Standard scanpy preprocessing, applied in place and in this order:
# drop genes seen in fewer than 5 cells, normalise per-cell totals, log1p.
sc.pp.filter_genes(gene_matrix, min_cells= 5)
sc.pp.normalize_total(gene_matrix)
sc.pp.log1p(gene_matrix)

# Copy over UMAP embedding
gene_matrix.obsm["X_umap"] = data.obsm["X_umap"]

# Plot the copied embedding coloured by Leiden cluster, labels drawn on the data.
sc.pl.umap(gene_matrix, color=["leiden"], legend_loc='on data')

# Save the normalised, log-transformed gene matrix (gzip-compressed h5ad).
gene_matrix.write("/media/AnalysisTempDisk/HIV/output/gene_matrix-normalize-304-0725.h5ad", compression='gzip')

# remove cluster

import seaborn as sns
from collections import Counter

# Number of cells per Leiden cluster in the gene matrix.
Counter(gene_matrix.obs['leiden'])

# Cluster-by-sample contingency table of cell counts.
cross_df = pd.crosstab(
    index=gene_matrix.obs['leiden'],
    columns=gene_matrix.obs['sample'],
)
cross_df.shape

# Clustered heatmap of the table; standard_scale=0 rescales along the rows
# (see the seaborn clustermap documentation).
sns.clustermap(cross_df, standard_scale=0)

sc.pl.umap(gene_matrix, legend_loc='on data', color=["leiden"])

# Draw one UMAP per Leiden cluster, highlighting only that cluster, to help
# decide which clusters to remove.
clusters = list(gene_matrix.obs['leiden'].cat.categories)
# The figure settings do not depend on the cluster, so set them once instead
# of on every loop iteration.
sc.settings.set_figure_params(dpi=80, figsize=(5, 5), facecolor='white')
for cluster in clusters:
    sc.pl.umap(gene_matrix, color=['leiden'], groups=[cluster])

## remove_clusters

# Leiden cluster labels to exclude from the gene matrix.
remove_clusters = ['17', '22', '23']
# Subsetting an AnnData with a boolean mask yields a view of the parent
# object. Take an explicit copy so later in-place operations (including the
# colour bookkeeping scanpy does while plotting) act on an independent object
# instead of triggering an implicit copy with a warning.
gene_matrix = gene_matrix[~gene_matrix.obs['leiden'].isin(remove_clusters)].copy()

sc.pl.umap(gene_matrix, color=["leiden"], legend_loc='on data')

# Call peaks

# Show the Leiden clusters on the UMAP of the full dataset; peak calling
# below is grouped by this same column.
snap.pl.umap(
    data,
    color="leiden",
    height=600,
    scale=5,
    marker_opacity=1,
    interactive=False,
    show=True,
)

%%time
# Run MACS3 peak calling grouped by the "leiden" column with 8 jobs, using
# the given temp directory; the next step reads the result from
# data.uns['macs3'].
snap.tl.macs3(
    data,
    groupby='leiden',
    n_jobs=8,
    tempdir='/media/Disk1/call_peak_path/',
)

%%time
# Merge the peak sets stored in data.uns['macs3'] into a single table, using
# the hg38 chromosome sizes.
merged_peaks = snap.tl.merge_peaks(data.uns['macs3'], chrom_sizes=snap.genome.hg38)
merged_peaks.shape

%%time
# Build the peak matrix for the full dataset over the merged peaks (the
# 'Peaks' column of the merged table supplies the regions).
peaks = snap.pp.make_peak_matrix(data, use_rep=merged_peaks['Peaks'])

# Number of cells per Leiden cluster in the peak matrix.
peaks.obs['leiden'].value_counts() 

# Close the dataset; `data` is not used again after this point.
data.close()

# Save the peak matrix (gzip-compressed h5ad).
peaks.write("/media/AnalysisTempDisk/HIV/output/atac-peaks-raw.h5ad", compression='gzip')