# Aggregate Samples
This Python notebook creates a single .h5ad (AnnData) file by concatenating multiple single-cell sequencing datasets.

In [2]:
import pandas as pd
import umap
import numpy as np
import matplotlib.pyplot as plt
import scipy as sc
import anndata

In [3]:
# --- Configuration -------------------------------------------------------
# Number of samples to aggregate; samples are indexed 1..NUM_FILES.
NUM_FILES = 1

# Stems of the three per-sample input files (the sample index and the file
# extension are appended when the files are loaded).
barcode_filename = "barcodes"
features_filename = "features"
matrix_filename = "matrix"

# Directory holding the raw per-sample files, and directory that receives
# the aggregated output file.
dataset_dir = "../../data/Huang/GSE214411_RAW/"
save_dir = "../../data/Huang/"

In [4]:
# Load the barcode table, feature table and count matrix of every sample.
# Each dict is keyed by the 1-based sample index that appears in the file name.
barcodes, features, matrixes = {}, {}, {}

for sample in range(1, NUM_FILES + 1):
    # Files are named "<stem> <index>.<ext>" -- note the space before the index.
    barcode_path = f"{dataset_dir}{barcode_filename} {sample}.tsv"
    features_path = f"{dataset_dir}{features_filename} {sample}.tsv"
    matrix_path = f"{dataset_dir}{matrix_filename} {sample}.mtx"

    # The .tsv files carry no header row, so columns are addressed by position.
    barcodes[sample] = pd.read_csv(barcode_path, sep="\t", header=None)
    features[sample] = pd.read_csv(features_path, sep="\t", header=None)
    # Matrix Market file, read with SciPy's mmread.
    matrixes[sample] = sc.io.mmread(matrix_path)

In [5]:
# Every sample must share exactly the same feature table as sample 1;
# otherwise the per-sample matrices cannot be combined under one set of
# variable names.
reference_features = features[1]
for i in range(2, NUM_FILES + 1):
    if not features[i].equals(reference_features):
        # Raise explicitly instead of using `assert`: assertions are removed
        # when Python runs with -O, and this message names the offending sample.
        raise ValueError(
            f"Feature table of sample {i} differs from that of sample 1; "
            "samples cannot be aggregated."
        )

# NOTE: `features` is rebound from the per-sample dict to the single shared
# table. Re-run the loading cell before re-running this one, since
# `features[1]` would otherwise select column 1 of the table.
features = reference_features

In [16]:
# Create a single matrix covering every sample.
#
# Each loaded matrix is transposed, then the transposed matrices are stacked
# along the ROW axis (vstack). The rows of the result are later labelled with
# the concatenated barcodes and the columns with the single shared feature
# table, so samples must add rows, not columns. hstack would instead place
# the samples side by side along the column axis, which only yields the right
# shape when NUM_FILES == 1.
# Assumes each .mtx file is laid out features x barcodes -- TODO confirm.
matrix = sc.sparse.vstack([matrixes[i].T for i in range(1, NUM_FILES + 1)])

# Convert to CSR with float32 values.
matrix = matrix.tocsr().astype(np.float32)

In [8]:
# Stack the per-sample barcode tables, in sample order, into one table with a
# fresh 0..n-1 index. `barcodes` is rebound from the per-sample dict to that
# combined table.
per_sample_barcodes = [barcodes[sample] for sample in range(1, NUM_FILES + 1)]
barcodes = pd.concat(per_sample_barcodes, ignore_index=True)

In [17]:
# Wrap the combined sparse matrix in an AnnData container; observation and
# variable labels are attached separately.
adata = anndata.AnnData(matrix)

In [20]:
# Label the AnnData axes from the loaded tables.
# Column 0 of the feature table is kept as the "gene_id" annotation and
# column 1 becomes the variable names (presumably gene symbols -- confirm
# against the input files).
gene_ids, gene_labels = features[0], features[1]

# Observations are named by the first (position 0) barcode column.
adata.obs_names = barcodes[0].values
adata.var_names = gene_labels.values
adata.var["gene_id"] = gene_ids.values

In [22]:
# Write the aggregated AnnData object to a single .h5ad file in save_dir.
output_path = save_dir + "GSE214411_All.h5ad"
adata.write(output_path)