In [1]:
import pandas as pd
import numpy as np
import scanpy as sc

In [2]:
df = pd.read_csv('all_counts.txt', sep='\t', header=1, index_col=0)
# df = pd.read_csv('all_Multi-overlapping_counts.txt', sep='\t', header=1, index_col=0)

In [3]:
df.columns = df.columns.str.replace('/work/yama/singlecell/data/STAR_results/', '', regex=True)
df.columns = df.columns.str.replace('_Aligned.sortedByCoord.out.bam','', regex=True)

In [4]:
df[:2]

Unnamed: 0_level_0,Chr,Start,End,Strand,Length,GSM1433164,GSM1433165,GSM1433166,GSM1433167,GSM1433168,...,GSM1642230,GSM1642231,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,chr1;chr1;chr1,3214482;3421702;3670552,3216968;3421901;3671498,-;-;-,3634,0,0,0,0,0,...,0,0,0,0,0,0,0,11,0,0
Rp1,chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1,4290846;4343507;4351910;4351910;4352202;435220...,4293012;4350091;4352081;4352081;4352837;435283...,-;-;-;-;-;-;-;-,9747,1,0,0,5,0,...,0,0,0,2,0,0,0,1,0,3


In [5]:
gene_length = df['Length']
gene_length

Geneid
Xkr4       3634
Rp1        9747
Sox17      4095
Mrpl15     4201
Lypla1     2433
           ... 
Gm20816    1222
Gm20867    1220
Gm20806    3996
Gm20854    1984
Erdr1       777
Name: Length, Length: 24421, dtype: int64

In [6]:
gene_cell_matrix = df.drop(['Chr', 'Start','End', 'Strand','Length'], axis=1)

In [7]:
gene_cell_matrix

Unnamed: 0_level_0,GSM1433164,GSM1433165,GSM1433166,GSM1433167,GSM1433168,GSM1433169,GSM1433170,GSM1433171,GSM1433172,GSM1433173,...,GSM1642230,GSM1642231,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11,0,0
Rp1,1,0,0,5,0,0,0,62,0,0,...,0,0,0,2,0,0,0,1,0,3
Sox17,0,3,0,1,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,2
Mrpl15,45,1,1,5,127,0,1,235,94,76,...,91,0,1,1,0,0,0,28,0,355
Lypla1,1,1,1,118,0,318,0,6,1,0,...,12,0,0,64,50,2,0,6,0,135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gm20816,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm20867,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm20806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm20854,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
read_counts = gene_cell_matrix.sum(axis=0)
read_counts

GSM1433164     492975
GSM1433165     566279
GSM1433166     326142
GSM1433167     359673
GSM1433168     611321
               ...   
GSM1642235     174897
GSM1642236     101714
GSM1642237      63962
GSM1642238      62491
GSM1642239    2783947
Length: 2128, dtype: int64

In [9]:
def normalize_per_million_reads(df):
    # RPM/FPM = reads per million  fragments per million
    sum_count = df.sum()
    return 10**6 * df / sum_count

def normalize_per_kilobase(df, gene_length):
    # FPKM = fragments per kilobase of exon per million reads mapped
    df_tmp = df.copy()
    df_tmp = (df.T * 10**3 / gene_length).T
    return df_tmp

def normalize_tpm(df, gene_length):
    # TPM = transcripts per kilobase million   https://bi.biopapyrus.jp/ 
    df_tmp = df.copy()
    df_tmp = normalize_per_kilobase(df_tmp, gene_length)
    df_tmp = normalize_per_million_reads(df_tmp)
    return df_tmp


In [10]:
df_tpm = normalize_tpm(gene_cell_matrix, gene_length)

In [11]:
df_tpm

Unnamed: 0_level_0,GSM1433164,GSM1433165,GSM1433166,GSM1433167,GSM1433168,GSM1433169,GSM1433170,GSM1433171,GSM1433172,GSM1433173,...,GSM1642230,GSM1642231,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,118.092232,0.0,0.000000
Rp1,0.539892,0.000000,0.000000,3.572413,0.000000,0.000000,0.000000,36.215083,0.000000,0.000000,...,0.000000,0.000000,0.000000,2.529124,0.000000,0.000000,0.0,4.002604,0.0,0.275530
Sox17,0.000000,3.115678,0.000000,1.700626,0.000000,0.000000,2.617732,1.390320,0.000000,0.000000,...,0.000000,873.844096,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.437214
Mrpl15,56.368645,1.012354,2.009364,8.288578,133.548822,0.000000,2.551681,318.481305,96.934832,83.415424,...,176.516387,0.000000,17.212876,2.933988,0.000000,0.000000,0.0,260.027283,0.0,75.647381
Lypla1,2.162896,1.748006,3.469518,337.755616,0.000000,465.958252,0.000000,14.040349,1.780584,0.000000,...,40.191615,0.000000,0.000000,324.226773,483.486022,10.663189,0.0,96.210553,0.0,49.671798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gm20816,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
Gm20867,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
Gm20806,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
Gm20854,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000


In [12]:
df_log2_tpm = np.log2(df_tpm+1)

In [13]:
df_log2_tpm

Unnamed: 0_level_0,GSM1433164,GSM1433165,GSM1433166,GSM1433167,GSM1433168,GSM1433169,GSM1433170,GSM1433171,GSM1433172,GSM1433173,...,GSM1642230,GSM1642231,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,6.895936,0.0,0.000000
Rp1,0.622829,0.000000,0.000000,2.192956,0.000000,0.00000,0.000000,5.217816,0.000000,0.000000,...,0.000000,0.000000,0.000000,1.819310,0.000000,0.00000,0.0,2.322679,0.0,0.351097
Sox17,0.000000,2.041130,0.000000,1.433294,0.000000,0.00000,1.855086,1.257204,0.000000,0.000000,...,0.000000,9.772882,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.523275
Mrpl15,5.842191,1.008884,1.589459,3.215458,7.071986,0.00000,1.828502,8.319588,6.613750,6.399435,...,7.471808,0.000000,4.186887,1.975993,0.000000,0.00000,0.0,8.028057,0.0,6.260165
Lypla1,1.661246,1.458385,2.160119,8.404101,0.000000,8.86715,0.000000,3.910766,1.475388,0.000000,...,5.364279,0.000000,0.000000,8.345302,8.920311,3.54389,0.0,6.603041,0.0,5.663111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gm20816,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20867,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20806,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20854,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000


In [14]:
# Qc1 filtered out cells with less than 2500 genes with log2(TPM + 1) > 2
qc1_df_log2_tpm_check = df_log2_tpm[df_log2_tpm > 2].count()
qc1_df_log2_tpm_check

GSM1433164    2363
GSM1433165    3511
GSM1433166    3122
GSM1433167    4606
GSM1433168    2061
              ... 
GSM1642235    4663
GSM1642236    5048
GSM1642237    5402
GSM1642238    5814
GSM1642239    9290
Length: 2128, dtype: int64

In [15]:
qc1_df_log2_tpm = df_log2_tpm.drop(qc1_df_log2_tpm_check[qc1_df_log2_tpm_check < 2500].index, axis=1)
qc1_df_log2_tpm

Unnamed: 0_level_0,GSM1433165,GSM1433166,GSM1433167,GSM1433169,GSM1433170,GSM1433171,GSM1433172,GSM1433173,GSM1433174,GSM1433175,...,GSM1642229,GSM1642230,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,6.895936,0.0,0.000000
Rp1,0.000000,0.000000,2.192956,0.00000,0.000000,5.217816,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,1.819310,0.000000,0.00000,0.0,2.322679,0.0,0.351097
Sox17,2.041130,0.000000,1.433294,0.00000,1.855086,1.257204,0.000000,0.000000,0.00000,0.973099,...,3.985923,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.523275
Mrpl15,1.008884,1.589459,3.215458,0.00000,1.828502,8.319588,6.613750,6.399435,6.74431,0.000000,...,3.951409,7.471808,4.186887,1.975993,0.000000,0.00000,0.0,8.028057,0.0,6.260165
Lypla1,1.458385,2.160119,8.404101,8.86715,0.000000,3.910766,1.475388,0.000000,0.00000,0.000000,...,8.107669,5.364279,0.000000,8.345302,8.920311,3.54389,0.0,6.603041,0.0,5.663111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gm20816,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20867,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20806,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20854,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000


In [16]:
# qc2 excluded genes whose log2(TPM + 1) < 4 in the aggregated data for each of the six populations
pre_cell_info = pd.read_csv('SraRunTable.csv')
pre_cell_info

Unnamed: 0,Run,Age,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Cell_type,Center Name,...,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,Sample Name,source_name,SRA Study,strain
0,SRR1565897,2-3 months old,RNA-Seq,50,785261650,PRJNA254994,SAMN03020604,543654584,long term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1498874,hematopoietic stem and progenitor cells,SRP044256,C57BL/6
1,SRR1565898,2-3 months old,RNA-Seq,50,761740300,PRJNA254994,SAMN03020605,526552902,long term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1498875,hematopoietic stem and progenitor cells,SRP044256,C57BL/6
2,SRR1565899,2-3 months old,RNA-Seq,50,732633600,PRJNA254994,SAMN03020606,505947122,long term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1498876,hematopoietic stem and progenitor cells,SRP044256,C57BL/6
3,SRR1565900,2-3 months old,RNA-Seq,50,662130100,PRJNA254994,SAMN03020607,461348264,short term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1498877,hematopoietic stem and progenitor cells,SRP044256,C57BL/6
4,SRR1565901,2-3 months old,RNA-Seq,50,844021800,PRJNA254994,SAMN03020608,584680086,short term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1498878,hematopoietic stem and progenitor cells,SRP044256,C57BL/6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2123,SRR1926995,2-3 months old,RNA-Seq,50,29058400,PRJNA254994,SAMN03444315,21522975,short term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1642235,hematopoietic stem and progenitor cells,SRP044256,DBA/2
2124,SRR1926996,2-3 months old,RNA-Seq,50,24065300,PRJNA254994,SAMN03444236,17854299,short term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1642236,hematopoietic stem and progenitor cells,SRP044256,DBA/2
2125,SRR1926997,2-3 months old,RNA-Seq,50,22355750,PRJNA254994,SAMN03444219,16601793,short term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1642237,hematopoietic stem and progenitor cells,SRP044256,DBA/2
2126,SRR1926998,2-3 months old,RNA-Seq,50,18061800,PRJNA254994,SAMN03444335,13398892,short term hematopoietic stem cell,GEO,...,PAIRED,cDNA,TRANSCRIPTOMIC,Mus musculus,ILLUMINA,2015-09-30T00:00:00Z,GSM1642238,hematopoietic stem and progenitor cells,SRP044256,DBA/2


In [17]:
cell_info = pre_cell_info.loc[:,['Age', 'Cell_type', 'Sample Name', 'strain']]
cell_info

Unnamed: 0,Age,Cell_type,Sample Name,strain
0,2-3 months old,long term hematopoietic stem cell,GSM1498874,C57BL/6
1,2-3 months old,long term hematopoietic stem cell,GSM1498875,C57BL/6
2,2-3 months old,long term hematopoietic stem cell,GSM1498876,C57BL/6
3,2-3 months old,short term hematopoietic stem cell,GSM1498877,C57BL/6
4,2-3 months old,short term hematopoietic stem cell,GSM1498878,C57BL/6
...,...,...,...,...
2123,2-3 months old,short term hematopoietic stem cell,GSM1642235,DBA/2
2124,2-3 months old,short term hematopoietic stem cell,GSM1642236,DBA/2
2125,2-3 months old,short term hematopoietic stem cell,GSM1642237,DBA/2
2126,2-3 months old,short term hematopoietic stem cell,GSM1642238,DBA/2


In [18]:
cell_info = cell_info.set_index('Sample Name')
cell_info

Unnamed: 0_level_0,Age,Cell_type,strain
Sample Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GSM1498874,2-3 months old,long term hematopoietic stem cell,C57BL/6
GSM1498875,2-3 months old,long term hematopoietic stem cell,C57BL/6
GSM1498876,2-3 months old,long term hematopoietic stem cell,C57BL/6
GSM1498877,2-3 months old,short term hematopoietic stem cell,C57BL/6
GSM1498878,2-3 months old,short term hematopoietic stem cell,C57BL/6
...,...,...,...
GSM1642235,2-3 months old,short term hematopoietic stem cell,DBA/2
GSM1642236,2-3 months old,short term hematopoietic stem cell,DBA/2
GSM1642237,2-3 months old,short term hematopoietic stem cell,DBA/2
GSM1642238,2-3 months old,short term hematopoietic stem cell,DBA/2


In [19]:
qc1_df_log2_tpm_gene_type = qc1_df_log2_tpm[qc1_df_log2_tpm > 2].count()

In [20]:
label_data = pd.DataFrame([[X, cell_info.loc[X, 'Age'], cell_info.loc[X, 'Cell_type'], cell_info.loc[X, 'strain'], qc1_df_log2_tpm_gene_type[X], read_counts[X]] for X in qc1_df_log2_tpm.columns], columns=['id','Age', 'Cell_type', 'strain', 'gene_type', 'read_counts'])
label_data

Unnamed: 0,id,Age,Cell_type,strain,gene_type,read_counts
0,GSM1433165,2-3 months old,long term hematopoietic stem cell,C57BL/6,3511,566279
1,GSM1433166,2-3 months old,long term hematopoietic stem cell,C57BL/6,3122,326142
2,GSM1433167,2-3 months old,long term hematopoietic stem cell,C57BL/6,4606,359673
3,GSM1433169,2-3 months old,long term hematopoietic stem cell,C57BL/6,3154,726849
4,GSM1433170,2-3 months old,long term hematopoietic stem cell,C57BL/6,4291,239890
...,...,...,...,...,...,...
1975,GSM1642235,2-3 months old,short term hematopoietic stem cell,DBA/2,4663,174897
1976,GSM1642236,2-3 months old,short term hematopoietic stem cell,DBA/2,5048,101714
1977,GSM1642237,2-3 months old,short term hematopoietic stem cell,DBA/2,5402,63962
1978,GSM1642238,2-3 months old,short term hematopoietic stem cell,DBA/2,5814,62491


In [21]:
label_data['Age'].value_counts() 

# 22 months old → old
# 2-3 months old → young
# 20 months old → old

22 months old     914
2-3 months old    801
20 months old     265
Name: Age, dtype: int64

In [22]:
label_data['Cell_type'].value_counts()

# long term hematopoietic stem cell → LT-HSC
# short term hematopoietic stem cell  → ST-HSC
# multipotent progenitor → MP

long term hematopoietic stem cell     768
short term hematopoietic stem cell    678
multipotent progenitor                534
Name: Cell_type, dtype: int64

In [23]:
label_data['strain'].value_counts() 

C57BL/6    1468
DBA/2       512
Name: strain, dtype: int64

In [24]:
label_data['y_o'] = label_data['Age'].replace('2-3 months old','young').replace('22 months old','old').replace('20 months old','old')
label_data['c_type'] = label_data['Cell_type'].replace(
                        'long term hematopoietic stem cell', 'LT-HSC').replace(
                        'short term hematopoietic stem cell', 'ST-HSC').replace(
                        'multipotent progenitor', 'MPP')
                                                                                                              
label_data

Unnamed: 0,id,Age,Cell_type,strain,gene_type,read_counts,y_o,c_type
0,GSM1433165,2-3 months old,long term hematopoietic stem cell,C57BL/6,3511,566279,young,LT-HSC
1,GSM1433166,2-3 months old,long term hematopoietic stem cell,C57BL/6,3122,326142,young,LT-HSC
2,GSM1433167,2-3 months old,long term hematopoietic stem cell,C57BL/6,4606,359673,young,LT-HSC
3,GSM1433169,2-3 months old,long term hematopoietic stem cell,C57BL/6,3154,726849,young,LT-HSC
4,GSM1433170,2-3 months old,long term hematopoietic stem cell,C57BL/6,4291,239890,young,LT-HSC
...,...,...,...,...,...,...,...,...
1975,GSM1642235,2-3 months old,short term hematopoietic stem cell,DBA/2,4663,174897,young,ST-HSC
1976,GSM1642236,2-3 months old,short term hematopoietic stem cell,DBA/2,5048,101714,young,ST-HSC
1977,GSM1642237,2-3 months old,short term hematopoietic stem cell,DBA/2,5402,63962,young,ST-HSC
1978,GSM1642238,2-3 months old,short term hematopoietic stem cell,DBA/2,5814,62491,young,ST-HSC


In [25]:
qc1_young = qc1_df_log2_tpm.loc[:, label_data[label_data['y_o'] == 'young'].id.to_list()]
qc1_old = qc1_df_log2_tpm.loc[:, label_data[label_data['y_o'] == 'old'].id.to_list()]

qc1_young

Unnamed: 0_level_0,GSM1433165,GSM1433166,GSM1433167,GSM1433169,GSM1433170,GSM1433171,GSM1433172,GSM1433173,GSM1433174,GSM1433175,...,GSM1642229,GSM1642230,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,6.895936,0.0,0.000000
Rp1,0.000000,0.000000,2.192956,0.00000,0.000000,5.217816,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,1.819310,0.000000,0.00000,0.0,2.322679,0.0,0.351097
Sox17,2.041130,0.000000,1.433294,0.00000,1.855086,1.257204,0.000000,0.000000,0.00000,0.973099,...,3.985923,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.523275
Mrpl15,1.008884,1.589459,3.215458,0.00000,1.828502,8.319588,6.613750,6.399435,6.74431,0.000000,...,3.951409,7.471808,4.186887,1.975993,0.000000,0.00000,0.0,8.028057,0.0,6.260165
Lypla1,1.458385,2.160119,8.404101,8.86715,0.000000,3.910766,1.475388,0.000000,0.00000,0.000000,...,8.107669,5.364279,0.000000,8.345302,8.920311,3.54389,0.0,6.603041,0.0,5.663111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gm20816,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20867,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20806,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Gm20854,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000


In [26]:
def get_target_list(data, label_data, y_o, target_name, type_name):
    l_data = label_data[label_data['y_o'] == y_o]
    t_data = data.loc[:, l_data[l_data[target_name] == type_name].id.to_list()]

    gene_list = t_data.sum(axis=1)
    return gene_list[gene_list < 4].index.to_list()

In [27]:
qc1_young_LT = get_target_list(qc1_young, label_data, 'young', 'c_type', 'LT-HSC')
qc1_young_ST = get_target_list(qc1_young, label_data, 'young','c_type', 'ST-HSC')
qc1_young_MP = get_target_list(qc1_young, label_data, 'young','c_type', 'MP')

qc2_old_LT = get_target_list(qc1_old, label_data, 'old','c_type', 'LT-HSC')
qc2_old_ST = get_target_list(qc1_old, label_data, 'old','c_type', 'ST-HSC')
qc2_old_MP = get_target_list(qc1_old, label_data, 'old','c_type', 'MP')

In [28]:
qc2_exclude_list = list(set(qc1_young_LT)&set(qc1_young_ST)&set(qc1_young_MP)&set(qc2_old_LT)&set(qc2_old_ST)&set(qc2_old_MP))
qc2_exclude_list[:10]

['Gm4303',
 'Sly',
 'Pira4',
 'Vmn1r132',
 'Gm11710',
 'Gm3701',
 'Gm10487',
 'Zfp91Cntf',
 'Gm20823',
 'Mup1']

In [29]:
qc2_df_log2_tpm = qc1_df_log2_tpm.drop(qc2_exclude_list, axis=0)
qc2_df_log2_tpm

Unnamed: 0_level_0,GSM1433165,GSM1433166,GSM1433167,GSM1433169,GSM1433170,GSM1433171,GSM1433172,GSM1433173,GSM1433174,GSM1433175,...,GSM1642229,GSM1642230,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,6.895936,0.0,0.000000
Rp1,0.000000,0.000000,2.192956,0.00000,0.000000,5.217816,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,1.819310,0.000000,0.00000,0.0,2.322679,0.0,0.351097
Sox17,2.041130,0.000000,1.433294,0.00000,1.855086,1.257204,0.000000,0.000000,0.000000,0.973099,...,3.985923,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.523275
Mrpl15,1.008884,1.589459,3.215458,0.00000,1.828502,8.319588,6.613750,6.399435,6.744310,0.000000,...,3.951409,7.471808,4.186887,1.975993,0.000000,0.00000,0.0,8.028057,0.0,6.260165
Lypla1,1.458385,2.160119,8.404101,8.86715,0.000000,3.910766,1.475388,0.000000,0.000000,0.000000,...,8.107669,5.364279,0.000000,8.345302,8.920311,3.54389,0.0,6.603041,0.0,5.663111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ddx3y,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.951331,0.995469,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Usp9y,0.609224,1.030628,0.000000,0.00000,0.000000,0.000000,0.618456,0.000000,0.427112,0.572635,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.288299
Zfy2,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Sry,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000


In [30]:
# qc3, centered the data by subtracting for each gene its average expression [log2(TPM + 1)] 
qc2_df_log2_tpm_mean = qc2_df_log2_tpm.mean(axis=1)
qc2_df_log2_tpm_mean

Geneid
Xkr4      0.267355
Rp1       0.277023
Sox17     0.431912
Mrpl15    3.521763
Lypla1    3.748191
            ...   
Ddx3y     0.141538
Usp9y     0.164080
Zfy2      0.047394
Sry       0.061879
Erdr1     1.947615
Length: 23726, dtype: float64

In [31]:
qc2_df_log2_tpm

Unnamed: 0_level_0,GSM1433165,GSM1433166,GSM1433167,GSM1433169,GSM1433170,GSM1433171,GSM1433172,GSM1433173,GSM1433174,GSM1433175,...,GSM1642229,GSM1642230,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,6.895936,0.0,0.000000
Rp1,0.000000,0.000000,2.192956,0.00000,0.000000,5.217816,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,1.819310,0.000000,0.00000,0.0,2.322679,0.0,0.351097
Sox17,2.041130,0.000000,1.433294,0.00000,1.855086,1.257204,0.000000,0.000000,0.000000,0.973099,...,3.985923,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.523275
Mrpl15,1.008884,1.589459,3.215458,0.00000,1.828502,8.319588,6.613750,6.399435,6.744310,0.000000,...,3.951409,7.471808,4.186887,1.975993,0.000000,0.00000,0.0,8.028057,0.0,6.260165
Lypla1,1.458385,2.160119,8.404101,8.86715,0.000000,3.910766,1.475388,0.000000,0.000000,0.000000,...,8.107669,5.364279,0.000000,8.345302,8.920311,3.54389,0.0,6.603041,0.0,5.663111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ddx3y,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.951331,0.995469,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Usp9y,0.609224,1.030628,0.000000,0.00000,0.000000,0.000000,0.618456,0.000000,0.427112,0.572635,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.288299
Zfy2,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000
Sry,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.0,0.000000


In [32]:
pre_qc3_df_log2_tpm = qc2_df_log2_tpm.copy()
qc3_df_log2_tpm =pre_qc3_df_log2_tpm.add(-1*qc2_df_log2_tpm_mean, axis=0)
qc3_df_log2_tpm

Unnamed: 0_level_0,GSM1433165,GSM1433166,GSM1433167,GSM1433169,GSM1433170,GSM1433171,GSM1433172,GSM1433173,GSM1433174,GSM1433175,...,GSM1642229,GSM1642230,GSM1642232,GSM1642233,GSM1642234,GSM1642235,GSM1642236,GSM1642237,GSM1642238,GSM1642239
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Xkr4,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,...,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,-0.267355,6.628581,-0.267355,-0.267355
Rp1,-0.277023,-0.277023,1.915933,-0.277023,-0.277023,4.940792,-0.277023,-0.277023,-0.277023,-0.277023,...,-0.277023,-0.277023,-0.277023,1.542287,-0.277023,-0.277023,-0.277023,2.045656,-0.277023,0.074073
Sox17,1.609218,-0.431912,1.001381,-0.431912,1.423173,0.825292,-0.431912,-0.431912,-0.431912,0.541187,...,3.554011,-0.431912,-0.431912,-0.431912,-0.431912,-0.431912,-0.431912,-0.431912,-0.431912,0.091363
Mrpl15,-2.512879,-1.932305,-0.306306,-3.521763,-1.693261,4.797824,3.091987,2.877672,3.222547,-3.521763,...,0.429646,3.950045,0.665124,-1.545771,-3.521763,-3.521763,-3.521763,4.506294,-3.521763,2.738401
Lypla1,-2.289806,-1.588072,4.655910,5.118959,-3.748191,0.162575,-2.272803,-3.748191,-3.748191,-3.748191,...,4.359478,1.616088,-3.748191,4.597111,5.172120,-0.204301,-3.748191,2.854850,-3.748191,1.914920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ddx3y,-0.141538,-0.141538,-0.141538,-0.141538,-0.141538,-0.141538,0.809793,0.853931,-0.141538,-0.141538,...,-0.141538,-0.141538,-0.141538,-0.141538,-0.141538,-0.141538,-0.141538,-0.141538,-0.141538,-0.141538
Usp9y,0.445144,0.866548,-0.164080,-0.164080,-0.164080,-0.164080,0.454376,-0.164080,0.263032,0.408555,...,-0.164080,-0.164080,-0.164080,-0.164080,-0.164080,-0.164080,-0.164080,-0.164080,-0.164080,0.124219
Zfy2,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,...,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394,-0.047394
Sry,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,...,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879,-0.061879


In [33]:
qc3_df_log2_tpm.T.to_csv('data/gene_cell_matrix_log2_tpm_after_qc.csv')

In [34]:
adata = sc.read_csv('data/gene_cell_matrix_log2_tpm_after_qc.csv')

In [35]:
adata

AnnData object with n_obs × n_vars = 1980 × 23726

In [36]:
label_data

Unnamed: 0,id,Age,Cell_type,strain,gene_type,read_counts,y_o,c_type
0,GSM1433165,2-3 months old,long term hematopoietic stem cell,C57BL/6,3511,566279,young,LT-HSC
1,GSM1433166,2-3 months old,long term hematopoietic stem cell,C57BL/6,3122,326142,young,LT-HSC
2,GSM1433167,2-3 months old,long term hematopoietic stem cell,C57BL/6,4606,359673,young,LT-HSC
3,GSM1433169,2-3 months old,long term hematopoietic stem cell,C57BL/6,3154,726849,young,LT-HSC
4,GSM1433170,2-3 months old,long term hematopoietic stem cell,C57BL/6,4291,239890,young,LT-HSC
...,...,...,...,...,...,...,...,...
1975,GSM1642235,2-3 months old,short term hematopoietic stem cell,DBA/2,4663,174897,young,ST-HSC
1976,GSM1642236,2-3 months old,short term hematopoietic stem cell,DBA/2,5048,101714,young,ST-HSC
1977,GSM1642237,2-3 months old,short term hematopoietic stem cell,DBA/2,5402,63962,young,ST-HSC
1978,GSM1642238,2-3 months old,short term hematopoietic stem cell,DBA/2,5814,62491,young,ST-HSC


In [37]:
adata.obs['id'] = label_data['id'].to_list()
adata.obs['Age'] = label_data['Age'].to_list()
adata.obs['Cell_type'] = label_data['Cell_type'].to_list()
adata.obs['strain'] = label_data['strain'].to_list()
adata.obs['y_o'] = label_data['y_o'].to_list()
adata.obs['c_type'] = label_data['c_type'].to_list()
adata.obs['gene_type'] = label_data['gene_type'].to_list()
adata.obs['read_counts'] = label_data['read_counts'].to_list()

adata.obs

Unnamed: 0,id,Age,Cell_type,strain,y_o,c_type,gene_type,read_counts
GSM1433165,GSM1433165,2-3 months old,long term hematopoietic stem cell,C57BL/6,young,LT-HSC,3511,566279
GSM1433166,GSM1433166,2-3 months old,long term hematopoietic stem cell,C57BL/6,young,LT-HSC,3122,326142
GSM1433167,GSM1433167,2-3 months old,long term hematopoietic stem cell,C57BL/6,young,LT-HSC,4606,359673
GSM1433169,GSM1433169,2-3 months old,long term hematopoietic stem cell,C57BL/6,young,LT-HSC,3154,726849
GSM1433170,GSM1433170,2-3 months old,long term hematopoietic stem cell,C57BL/6,young,LT-HSC,4291,239890
...,...,...,...,...,...,...,...,...
GSM1642235,GSM1642235,2-3 months old,short term hematopoietic stem cell,DBA/2,young,ST-HSC,4663,174897
GSM1642236,GSM1642236,2-3 months old,short term hematopoietic stem cell,DBA/2,young,ST-HSC,5048,101714
GSM1642237,GSM1642237,2-3 months old,short term hematopoietic stem cell,DBA/2,young,ST-HSC,5402,63962
GSM1642238,GSM1642238,2-3 months old,short term hematopoietic stem cell,DBA/2,young,ST-HSC,5814,62491


In [38]:
adata.write('base.h5ad')

In [39]:
cell_cycle_genes = [x[0] + x[1:].strip().lower() for x in open('regev_lab_cell_cycle_genes.txt')]
print(cell_cycle_genes)

cell_cycle_genes = [x for x in cell_cycle_genes if x in qc3_df_log2_tpm.index]

['Mcm5', 'Pcna', 'Tyms', 'Fen1', 'Mcm2', 'Mcm4', 'Rrm1', 'Ung', 'Gins2', 'Mcm6', 'Cdca7', 'Dtl', 'Prim1', 'Uhrf1', 'Mlf1ip', 'Hells', 'Rfc2', 'Rpa2', 'Nasp', 'Rad51ap1', 'Gmnn', 'Wdr76', 'Slbp', 'Ccne2', 'Ubr7', 'Pold3', 'Msh2', 'Atad2', 'Rad51', 'Rrm2', 'Cdc45', 'Cdc6', 'Exo1', 'Tipin', 'Dscc1', 'Blm', 'Casp8ap2', 'Usp1', 'Clspn', 'Pola1', 'Chaf1b', 'Brip1', 'E2f8', 'Hmgb2', 'Cdk1', 'Nusap1', 'Ube2c', 'Birc5', 'Tpx2', 'Top2a', 'Ndc80', 'Cks2', 'Nuf2', 'Cks1b', 'Mki67', 'Tmpo', 'Cenpf', 'Tacc3', 'Fam64a', 'Smc4', 'Ccnb2', 'Ckap2l', 'Ckap2', 'Aurkb', 'Bub1', 'Kif11', 'Anp32e', 'Tubb4b', 'Gtse1', 'Kif20b', 'Hjurp', 'Cdca3', 'Hn1', 'Cdc20', 'Ttk', 'Cdc25c', 'Kif2c', 'Rangap1', 'Ncapd2', 'Dlgap5', 'Cdca2', 'Cdca8', 'Ect2', 'Kif23', 'Hmmr', 'Aurka', 'Psrc1', 'Anln', 'Lbr', 'Ckap5', 'Cenpe', 'Ctcf', 'Nek2', 'G2e3', 'Gas2l3', 'Cbx5', 'Cenpa']


In [40]:
qc3_df_log2_tpm.T.drop(cell_cycle_genes, axis=1).to_csv('data/gene_cell_matrix_log2_tpm_after_qc_cut_gene.csv')

In [41]:
adata_2 = sc.read_csv('data/gene_cell_matrix_log2_tpm_after_qc_cut_gene.csv')

In [42]:
adata_2

AnnData object with n_obs × n_vars = 1980 × 23630

In [43]:
adata_2.obs['id'] = label_data['id'].to_list()
adata_2.obs['Age'] = label_data['Age'].to_list()
adata_2.obs['Cell_type'] = label_data['Cell_type'].to_list()
adata.obs['strain'] = label_data['strain'].to_list()
adata_2.obs['y_o'] = label_data['y_o'].to_list()
adata_2.obs['c_type'] = label_data['c_type'].to_list()
adata_2.obs['gene_type'] = label_data['gene_type'].to_list()
adata_2.obs['read_counts'] = label_data['read_counts'].to_list()

adata_2.obs

Unnamed: 0,id,Age,Cell_type,y_o,c_type,gene_type,read_counts
GSM1433165,GSM1433165,2-3 months old,long term hematopoietic stem cell,young,LT-HSC,3511,566279
GSM1433166,GSM1433166,2-3 months old,long term hematopoietic stem cell,young,LT-HSC,3122,326142
GSM1433167,GSM1433167,2-3 months old,long term hematopoietic stem cell,young,LT-HSC,4606,359673
GSM1433169,GSM1433169,2-3 months old,long term hematopoietic stem cell,young,LT-HSC,3154,726849
GSM1433170,GSM1433170,2-3 months old,long term hematopoietic stem cell,young,LT-HSC,4291,239890
...,...,...,...,...,...,...,...
GSM1642235,GSM1642235,2-3 months old,short term hematopoietic stem cell,young,ST-HSC,4663,174897
GSM1642236,GSM1642236,2-3 months old,short term hematopoietic stem cell,young,ST-HSC,5048,101714
GSM1642237,GSM1642237,2-3 months old,short term hematopoietic stem cell,young,ST-HSC,5402,63962
GSM1642238,GSM1642238,2-3 months old,short term hematopoietic stem cell,young,ST-HSC,5814,62491


In [45]:
adata_2.write('cut_gene.h5ad')