# Python notebook for processing single-cell data for downstream analysis

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.decomposition import PCA
from sklearn import preprocessing

### SVZ aging clock

In [2]:
# Load the SVZ aging-clock tables produced by another project:
# `data` is the counts table and `meta` the accompanying metadata
# (its "Celltype.LowRes" and "Age" columns are used further down).
data, meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        "sc_data/mouse_svz_aging_clock/svz_data.csv",
        "sc_data/mouse_svz_aging_clock/svz_meta.csv",
    )
)

In [65]:
# Project the full counts table onto its first 50 principal components; the
# PC scores (one row per row of `data`) serve as the ground-truth features.
SVZ_X = PCmat = PCA(n_components=50).fit_transform(data.values)

# Matching annotations pulled from the metadata table
# (assumes `meta` rows are in the same order as `data` rows -- TODO confirm)
SVZ_y = label = meta["Celltype.LowRes"].values
SVZ_age = meta["Age"].values

# # save X and Y
# np.savetxt("SVZ_X.txt", SVZ_X)
# np.savetxt("SVZ_y.txt", SVZ_y, fmt="%s")
# np.savetxt("SVZ_age.txt", SVZ_age, fmt="%s")

In [6]:
# Save the same three outputs (PC scores, cell-type labels, ages) for subsets
# of increasing size n = 100, 200, ..., 900.
# Each subset is the FIRST `size` rows of the table (not a random draw), and
# the PCA is refit from scratch on that subset.
for size in range(100, 1000, 100):
    SVZ_X = PCmat = PCA(n_components=50).fit_transform(data.values[:size, :])

    # annotations for the same leading rows
    SVZ_y = label = meta["Celltype.LowRes"].values[:size]
    SVZ_age = meta["Age"].values[:size]

    # labels and ages are written with "%s" so they are saved as text
    np.savetxt(f"SVZ_X_n{size}.txt", SVZ_X)
    np.savetxt(f"SVZ_y_n{size}.txt", SVZ_y, fmt="%s")
    np.savetxt(f"SVZ_age_n{size}.txt", SVZ_age, fmt="%s")

### MERFISH mouse MOp

In [4]:
# # save as csv
# merfish_data = sc.read_h5ad("sc_data/mouse_merfish_MOp/counts.h5ad")
# np.savetxt("sc_data/mouse_merfish_MOp/counts.csv", merfish_data.X)
# print(merfish_data.X.shape)

(280327, 254)


In [2]:
# Load the MERFISH counts as an AnnData object, then apply scanpy's total-count
# normalisation followed by a log(1 + x) transform. Both steps modify
# `merfish_data` in place (the defaults, spelled out here for clarity).
merfish_data = sc.read_h5ad("sc_data/mouse_merfish_MOp/counts.h5ad")
sc.pp.normalize_total(merfish_data, inplace=True)
sc.pp.log1p(merfish_data, copy=False)

In [3]:
# Sanity check: dimensions of the loaded AnnData as (n_obs, n_vars)
merfish_data.shape

(280327, 254)

In [4]:
# Per-cell annotation table; its 'sample_id' column is used below to group cells.
# NOTE(review): later cells index this table with the same row positions as
# merfish_data, which assumes both are in the same cell order -- TODO confirm
merfish_metadata = pd.read_csv("sc_data/mouse_merfish_MOp/cell_metadata.csv")

In [6]:
# Print the number of cells belonging to each sample, one line per unique
# sample_id, in the order np.unique returns them.
for sid in np.unique(merfish_metadata['sample_id']):
    print(int((merfish_metadata['sample_id'] == sid).sum()))

18516
28195
23898
33884
31899
25133
13864
22221
18881
20616
20429
22791


In [37]:
# Draw 1000 distinct row positions uniformly from all cells in merfish_data.
# The global NumPy RNG is used without a seed, so every run picks a new subset.
rand_idxs = np.random.choice(merfish_data.shape[0], size=1000, replace=False)

In [38]:
# Expression values of the sampled cells as a labelled table:
# rows carry the AnnData obs names, columns the var names.
df_mer = pd.DataFrame(
    data=merfish_data.X[rand_idxs],
    index=merfish_data.obs_names[rand_idxs],
    columns=merfish_data.var_names,
)
#df_mer.to_csv("merfish_X.csv")

# Metadata rows at the same positions as the sampled cells
sub_merfish_metadata = merfish_metadata.iloc[rand_idxs, :]
#sub_merfish_metadata.to_csv("merfish_Y.csv")

#### Extracting replicates...

In [2]:
# Reload the MERFISH counts from disk and preprocess them: total-count
# normalisation, then log(1 + x). Both scanpy calls update `merfish_data`
# in place (the defaults, written out explicitly).
merfish_data = sc.read_h5ad("sc_data/mouse_merfish_MOp/counts.h5ad")
sc.pp.normalize_total(merfish_data, inplace=True)
sc.pp.log1p(merfish_data, copy=False)

# Per-cell annotation table that goes with the counts
merfish_metadata = pd.read_csv("sc_data/mouse_merfish_MOp/cell_metadata.csv")

In [3]:
# Sanity check: dimensions of the reloaded AnnData as (n_obs, n_vars)
merfish_data.shape

(280327, 254)

In [4]:
# For every MERFISH sample, draw 3000 of its cells at random (without
# replacement) and write two CSVs: the expression values of those cells
# ("..._X_3k.csv") and the matching metadata rows ("..._Y_3k.csv").
# The loop runs once per unique sample_id; the per-sample counts printed
# earlier in this notebook list 12 samples (not 6).
# An earlier variant drew 1000 cells per sample and wrote the same file names
# without the "_3k" suffix.
# NOTE(review): np.random is not seeded, so each run writes a different subset.
# NOTE(review): np.random.choice raises ValueError if a sample has fewer than
# 3000 cells.
for sample_id in np.unique(merfish_metadata['sample_id']):
    # Row POSITIONS of this sample's cells. Positions (not index labels) are
    # needed because they are used below with merfish_data.X[...],
    # merfish_data.obs_names[...] and merfish_metadata.iloc[...], all of which
    # are positional; index labels only coincide with positions when the
    # metadata has a default 0..n-1 index.
    sample_idxs = np.flatnonzero((merfish_metadata['sample_id'] == sample_id).to_numpy())
    rand_idxs = np.random.choice(sample_idxs, 3000, replace=False)

    df_mer = pd.DataFrame(
        merfish_data.X[rand_idxs, :],
        index=merfish_data.obs_names[rand_idxs],
        columns=merfish_data.var_names,
    )
    # f-string so that a non-string sample_id (e.g. an integer) still works
    df_mer.to_csv(f"merfish_{sample_id}_X_3k.csv")

    # assumes metadata rows are in the same order as merfish_data's cells -- TODO confirm
    sub_merfish_metadata = merfish_metadata.iloc[rand_idxs]
    sub_merfish_metadata.to_csv(f"merfish_{sample_id}_Y_3k.csv")

### mESC differentiation

In [3]:
# Load the mESC differentiation time-course table (tab-separated; TPM values,
# judging by the file name). The first column becomes the row index.
# Note: this rebinds `data`, replacing the SVZ table loaded earlier.
data = pd.read_csv("sc_data/mESC-differentiation_hayashi/GSE98664_tpm_sailfish_mergedGTF_RamDA_mESC_differentiation_time_course.txt", sep="\t", index_col=0)

In [4]:
# Inspect the table's dimensions as loaded (before it is transposed below)
data.shape

(157717, 421)

In [67]:
# Transpose so that the 421 original columns become the rows, which is the
# orientation the PCA below treats as samples.
# NOTE: this is not idempotent -- re-running the cell flips the table back.
data = data.T

In [71]:
# Reduce the transposed table to its first 50 principal components; the PC
# scores are kept as the ground-truth feature matrix.
mESC_X = PCmat = PCA(n_components=50).fit_transform(data.values)

# The row index of the table doubles as the label vector
mESC_y = label = data.index

# # save X and Y
# np.savetxt("mESC_X.txt", mESC_X)
# np.savetxt("mESC_y.txt", mESC_y, fmt="%s")

### E-MTAB-2805 cell-cycle

In [2]:
# Load the three per-phase count tables (G1, G2M, S) in the same way:
# use the third column as the row index, discard the remaining annotation
# columns, and transpose so that the former columns become rows.
data1, data2, data3 = (
    pd.read_csv(counts_path, sep="\t", index_col=2)
    .drop(columns=["EnsemblGeneID", "EnsemblTranscriptID", "GeneLength"])
    .T
    for counts_path in (
        "sc_data/E-MTAB-2805/G1_singlecells_counts.txt",
        "sc_data/E-MTAB-2805/G2M_singlecells_counts.txt",
        "sc_data/E-MTAB-2805/S_singlecells_counts.txt",
    )
)

In [35]:
# Keep only the columns whose label is not null, in each of the three tables
data1, data2, data3 = (
    frame.loc[:, frame.columns.notnull()]
    for frame in (data1, data2, data3)
)

In [36]:
# One phase label per row of the corresponding table
labels1, labels2, labels3 = (
    ["G1"] * len(data1),
    ["G2M"] * len(data2),
    ["S"] * len(data3),
)

In [37]:
# Stack the three tables row-wise into one plain array (G1, then G2M, then S)
# and build the label vector in the same order.
# NOTE(review): stacking is by position, so this presumes the three tables
# have identical columns in identical order -- verify.
data = np.vstack([frame.to_numpy() for frame in (data1, data2, data3)])
labels = np.concatenate([labels1, labels2, labels3])

In [38]:
# Scale with sklearn's l1 normalisation (default axis), then apply log(1 + x)
data = np.log1p(preprocessing.normalize(data, norm="l1"))

In [40]:
# Reduce the normalised matrix to its first 50 principal components; the PC
# scores are the ground-truth features and the phase labels the targets.
EMTAB_X = PCmat = PCA(n_components=50).fit_transform(data)
EMTAB_y = labels

In [41]:
# np.savetxt("EMTAB_X.txt", EMTAB_X)
# np.savetxt("EMTAB_y.txt", EMTAB_y, fmt="%s")