In [1]:
import scCAD
import numpy as np
import pandas as pd
import h5py
from collections import Counter
import os

In [2]:
# Data matrix should only consist of values where rows represent cells and columns represent genes.
data_mat = h5py.File('./1%Jurkat.h5')
data = np.array(data_mat['X']) # Cells * Genes
labels = np.array(data_mat['Y'])
geneNames = np.array(data_mat['gn'])
cellNames = np.array(data_mat['cn'])
data_mat.close()
data = np.vectorize(float)(data)
labels = np.array([str(i, 'UTF-8') for i in labels])
geneNames = np.array([str(i, 'UTF-8') for i in geneNames])
cellNames = np.array([str(i, 'UTF-8') for i in cellNames])

In [3]:
# If gene and cell names are not provided, scCAD will generate them automatically.
result, score, sub_clusters, degs_list = scCAD.scCAD(data=data, dataName='Jurkat', cellNames=cellNames, geneNames=geneNames, save_path='./Demo_res/') 
# Returned Value:
#    result : Rare sub-clusters identified by scCAD: list.
#    score : Score of every sub_clusters: np.array[n sub-clusters].
#    sub_clusters : Assignment of sub-cluster labels for each cell: np.array[n cells].
#    degs_list : List of differentially expressed genes used for rare sub-clusters: list.

>>> Data preprocessing in progress...
>>> After preprocessing, cells: 1556; genes: 15715;
>>> Feature selection in progress...
>>> Identification of HVGs is currently in progress...
>>> Identification of RFGs is currently in progress...
>>> After feature selection, genes: 3668;
>>> Clusters decomposition in progress...
>>> iter 1, running...
>>> After clusters decomposition, we got 119 balanced sub-clusters.
>>> Clusters merge in progress...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 26.09it/s]


>>> After clusters merge, we got 63 sub-clusters.
>>> Cluster anomaly score calculation in progress...


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:17<00:00,  3.52it/s]

>>> time used: 82.9166522026062





In [4]:
# If cell names are not provided, please run:
# cellNames = [str(i) for i in range(data.shape[0])]
for i in result:
    indices = np.where(np.isin(cellNames, i))[0]
    print(Counter(labels[indices]))

Counter({'jurkat': 16, '293T': 1})


In [5]:
folder_path = './Demo_res/'
file_list = os.listdir(folder_path)
# print output files
print("Files in the folder:")
for file_name in file_list:
    print(file_name)

Files in the folder:
Jurkat_scCAD_balanced_sub-clusters.txt
Jurkat_scCAD_comb_sub-clusters.txt
Jurkat_scCAD_degs_list.txt
Jurkat_scCAD_Init_clusters.txt
Jurkat_scCAD_Init_clusters_based_selected_genes.txt
Jurkat_scCAD_rare_cells_result.txt
Jurkat_scCAD_select_genes.txt
Jurkat_scCAD_sub-clusters_anomaly_score.txt


In [6]:
# 1. the clusters after decomposing (I-Clusters)
file_path = folder_path + 'Jurkat_scCAD_balanced_sub-clusters.txt'
I_clusters = []
with open(file_path, 'r') as file:
    for line in file:
        I_clusters.append(line.strip())

print("There are a total of %d balanced sub-clusters." %(len(Counter(I_clusters))))
print(Counter(I_clusters))

There are a total of 119 balanced sub-clusters.
Counter({'62': 28, '7': 24, '22': 24, '32': 23, '40': 23, '58': 23, '72': 23, '25': 22, '44': 22, '14': 21, '54': 21, '69': 21, '73': 21, '50': 20, '52': 20, '66': 20, '10': 19, '94': 19, '9': 18, '34': 18, '59': 18, '63': 18, '114': 18, '0': 17, '56': 17, '87': 17, '98': 17, '100': 17, '5': 16, '16': 16, '33': 16, '48': 16, '71': 16, '113': 16, '3': 15, '4': 15, '74': 15, '77': 15, '86': 15, '102': 15, '12': 14, '15': 14, '17': 14, '27': 14, '38': 14, '53': 14, '55': 14, '76': 14, '81': 14, '84': 14, '91': 14, '92': 14, '101': 14, '105': 14, '8': 13, '13': 13, '21': 13, '23': 13, '41': 13, '68': 13, '70': 13, '78': 13, '79': 13, '80': 13, '83': 13, '97': 13, '104': 13, '11': 12, '35': 12, '42': 12, '51': 12, '60': 12, '89': 12, '90': 12, '93': 12, '96': 12, '24': 11, '64': 11, '88': 11, '103': 11, '20': 10, '30': 10, '31': 10, '43': 10, '110': 10, '111': 10, '116': 10, '1': 9, '18': 9, '26': 9, '29': 9, '46': 9, '49': 9, '61': 9, '67': 9

In [7]:
# 2. the clusters after decomposing and merging (M-Clusters)
file_path = folder_path + 'Jurkat_scCAD_comb_sub-clusters.txt'
M_clusters = []
with open(file_path, 'r') as file:
    for line in file:
        M_clusters.append(line.strip())

print("There are a total of %d sub-clusters." %(len(Counter(M_clusters))))
print(Counter(M_clusters))

There are a total of 63 sub-clusters.
Counter({'5': 669, '8': 214, '39': 71, '4': 36, '0': 17, '9': 16, '58': 16, '3': 15, '10': 14, '16': 14, '23': 14, '30': 14, '45': 14, '46': 14, '7': 13, '14': 13, '37': 13, '38': 13, '41': 13, '49': 13, '29': 12, '32': 12, '43': 12, '44': 12, '48': 12, '15': 11, '34': 11, '13': 10, '19': 10, '20': 10, '25': 10, '55': 10, '56': 10, '60': 10, '1': 9, '11': 9, '18': 9, '28': 9, '33': 9, '36': 9, '47': 9, '22': 8, '35': 8, '40': 8, '50': 8, '52': 8, '2': 7, '17': 7, '54': 7, '24': 6, '57': 6, '12': 5, '21': 5, '31': 5, '42': 5, '51': 5, '59': 5, '26': 4, '27': 4, '61': 4, '62': 4, '6': 3, '53': 3})


In [8]:
# 3. the genes used to calculate the scores of rare clusters
file_path = folder_path + 'Jurkat_scCAD_degs_list.txt'
with open(file_path, 'r') as file:
    for line in file:
        print(line.strip())

CKB	TMSB4X	CD3D	XIST	ARHGDIB	MZB1	CA2	CDKN2A	ADA	HES4	ITM2A	CD1E	OCIAD2	EIF5A	LCK	RAC3	GAL	NUCB2	NMRAL1	LEF1


In [9]:
# 4. Jurkat_scCAD_Init_clusters.txt
file_path = folder_path + 'Jurkat_scCAD_Init_clusters.txt'
Init_clusters = []
with open(file_path, 'r') as file:
    for line in file:
        Init_clusters.append(line.strip())

print("There are a total of %d clusters in the first cluster stage." %(len(Counter(Init_clusters))))
print(Counter(Init_clusters))

There are a total of 10 clusters in the first cluster stage.
Counter({'6': 374, '1': 287, '2': 277, '4': 199, '5': 151, '9': 113, '3': 84, '8': 40, '0': 17, '7': 14})


In [10]:
# 5. Jurkat_scCAD_Init_clusters_based_selected_genes.txt
file_path = folder_path + 'Jurkat_scCAD_Init_clusters_based_selected_genes.txt'
Init_clusters = []
with open(file_path, 'r') as file:
    for line in file:
        Init_clusters.append(line.strip())

print("There are a total of %d clusters in the first cluster stage based on the selected genes." %(len(Counter(Init_clusters))))
print(Counter(Init_clusters))

There are a total of 8 clusters in the first cluster stage based on the selected genes.
Counter({'1': 355, '5': 281, '7': 274, '4': 182, '3': 172, '2': 166, '6': 109, '0': 17})


In [11]:
# 6. The index of predicted rare cells.
file_path = folder_path + 'Jurkat_scCAD_rare_cells_result.txt'
with open(file_path, 'r') as file:
    for line in file:
        print(line.strip())

ACTCCCGAGAAACA-1	GTTCAACTAGCTCA-1	GTGGAGGACACAAC-1	GATGCATGGCCCTT-1	TGACGATGCTACGA-1	GAATGGCTAGGAGC-1	AATCTAGAAGTCTG-1	ATGGGTACGTTGAC-1	ATTGCGGAGTCTGA-1	TACCATTGGGTACT-1	CGGCATCTCTTACT-1	ATGAAGGAACTACG-1	AGTGACACTGTGGT-1	CATTGTACGCCCTT-1	TGCACAGATCGTGA-1	CGTTATACGAGGAC-1	AGCATCGAACTCAG-1


In [12]:
# 7. The genes selected by scCAD.
file_path = folder_path + 'Jurkat_scCAD_select_genes.txt'
with open(file_path, 'r') as file:
    for line in file:
        print(line.strip())

SLFN5
MTF2
POLQ
HNRNPH3
TMCO1
NFIX
SLC25A3
KIAA1841
CMSS1
SURF4
UBE2D1
RNF114
CHMP2A
MT-CO2
PFDN1
PRELID1
MAP7D3
PTPRE
EMG1
UTP11L
SF3B4
TGFBR1
SNHG10
EIF2S2
LAX1
TOPORS
N4BP2
CDC20
CHCHD10
HSPD1
MAGEB2
C1orf35
TBC1D7
C9orf78
BTBD8
SUCLA2-AS1
CLUL1
IARS
JPH3
ATF3
RP1-40E16.12
SLIT2
HNRNPA3
UAP1
BROX
EZH2
GGCT
ARF1
NR5A2
COX18
AKAP5
COPS3
STOX1
AK7
CTD-2044J15.2
TUBB
MESP1
SF3B5
MKRN2
HTATIP2
MLX
CTD-2006H14.2
NXF1
HSPA8
MRPL52
BCAS3
CHMP5
ARPC1B
ZNF212
PCDH9
ATP1B3
PMPCB
ACOT7
TPX2
METTL14
SQLE
CTD-2162K18.4
NEDD8
ISCA1
DAPK2
TIGD3
AEBP2
GTF2IRD2B
C3orf33
APOA1BP
CWC15
SLC25A5
MCPH1
ATP1B1
RPS2
NEMF
MINOS1
C11orf83
RBM39
CTD-2012J19.3
LMCD1
SWI5
ALDH1A2
RP11-66N24.3
USB1
BICD2
SRSF1
RPS14
MRPS15
SMC2
FAM136A
BCAS1
C1orf174
FAM167B
DGKG
NOL7
RMI1
NCOA3
AKIRIN1
TMEM14B
SRPRB
EIF4EBP1
FAM160A2
ITGB1BP1
MLEC
ATP5E
RP11-705O1.8
PSMB4
GALNT12
FBXL4
MZT1
FAM49B
TPM3
NEFM
PSMB7
PXMP2
HIST1H4H
NDUFS3
RPL35
PDIA4
WDR73
HNRNPR
LYPD1
S100A11
DTX3L
SNW1
COX7C
RBX1
BRCA2
PSMD12
ALKBH5
ZNF250
TRIAP1


In [13]:
# 8. The scores calculated by scCAD for each cluster.
file_path = folder_path + 'Jurkat_scCAD_sub-clusters_anomaly_score.txt'
with open(file_path, 'r') as file:
    for line in file:
        print(line.strip())

0.941176
0.000000
0.000000
0.066667
0.000000
0.000000
0.000000
0.000000
0.000000
0.250000
0.214286
0.000000
0.000000
0.400000
0.153846
0.090909
0.071429
0.000000
0.000000
0.300000
0.000000
0.000000
0.000000
0.000000
0.000000
0.200000
0.000000
0.000000
0.000000
0.416667
0.142857
0.000000
0.166667
0.000000
0.090909
0.000000
0.000000
0.153846
0.000000
0.000000
0.000000
0.230769
0.000000
0.333333
0.333333
0.214286
0.142857
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.100000
0.500000
0.000000
0.500000
0.000000
0.200000
0.000000
0.000000
