In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

In [None]:
#Load data

def _read_lines(path):
    """Return the non-empty lines of a text file as a 1-D array of strings.

    Used for files that hold one entry (gene name / cell label) per line.
    np.loadtxt(..., delimiter='\n') is not used because NumPy >= 1.23 rejects
    a newline as the delimiter. Unlike np.loadtxt, '#' is not treated as a
    comment marker here, and a single-line file still yields a 1-D array.
    """
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return np.array([line for line in stripped if line], dtype=str)

# The expression matrix is transposed right after loading; the later cells
# index its rows with 'labels' and its columns with 'genes'.
X = np.loadtxt('Zeisel_expr.txt').T
genes = _read_lines('Zeisel_genes.txt')
labels = _read_lines('Zeisel_labels.txt')


In [None]:
#Filter and log-transform count matrix

# Keep only the columns of X whose sum over axis 0 is at least 25.
keep_genes = X.sum(axis=0) >= 25
# The same boolean mask is applied to X's columns and to 'genes' so that
# the two stay aligned.
X, genes = X[:, keep_genes], genes[keep_genes]

# Log transform: log2 with a pseudocount of 1, so zero entries map to 0.
# NOTE: X is overwritten here, so re-running this cell without reloading
# the data applies the filter and the transform a second time.
X = np.log2(X + 1)


In [None]:
#Useful Python-style tricks for working with the data:

#Trick 1
#A frequent need is to pull out of X only the cells that belong to one cell type (label),
#for instance to plot every cell type in its own color.
#Comparing 'labels' against a name yields a boolean array, and indexing X with that array
#keeps exactly the rows where it is True (the trailing ', :' for the columns is implied).
#The line below keeps only the rows labelled 'Astrocytes'.

X_Astrocytes = X[labels == 'Astrocytes']

#Trick 2
#Sometimes you need to filter multiple arrays based on the same condition.
#Then you typically store the boolean condition as a separate variable and reuse it.
#For example, the code below filters both 'X' and 'genes' to keep only genes whose name starts with a T.
#str.startswith is used instead of gene[0] so that an empty name gives False rather than an IndexError,
#and dtype=bool keeps the mask a valid boolean index even when 'genes' is empty.

T_filter = np.array([gene.startswith('T') for gene in genes], dtype=bool)

genes_T = genes[T_filter]
X_T = X[:, T_filter]

#Trick 3
#To reorder one array according to the values of another, use numpy's argsort:
#it returns the indices that would put its input in ascending order.
#Below, 'genes' is reordered by each gene's column sum of X, smallest first.
#Note: X was log-transformed in an earlier cell, so these are sums of log2(1 + count) values,
#not raw read totals.

gene_sort_index = X.sum(axis=0).argsort()
genes_sorted = genes[gene_sort_index]


In [None]:
#Problem 1.1 Differential Gene Expression

#TODO: For each cell type (use the cell type labels from the data), identify the 3 most significant genes, using t-tests.
