# Method

In [None]:
!pip install activeSVC==4.0.1

In [None]:
import numpy as np
import time
import random
import os

from sklearn.preprocessing import normalize 
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from matplotlib import gridspec

from activeSVC import min_complexity, min_acquisition
import pandas as pd
import pickle
import os, psutil
import resource

import csv
from scipy.io import mmread
from scipy import io as sio
import copy

def text_create(path, name, msg):
    """Pickle ``msg`` into the file ``<path>/<name>.pickle``.

    Parameters
    ----------
    path : str
        Directory that receives the file; it is not created here.
    name : str
        File name without the ``.pickle`` extension.
    msg : object
        Object handed to ``pickle.dump``.
    """
    full_path = path + "/" + name + '.pickle'
    # The context manager closes the handle even when pickle.dump raises.
    with open(full_path, 'wb') as f:
        pickle.dump(msg, f)

class TimerError(Exception):
    """Raised by Timer when start() or stop() is called in the wrong state."""

class Timer:
    """Simple stopwatch built on ``time.perf_counter``."""

    def __init__(self):
        # None means "not running"; otherwise the perf_counter value at start.
        self._start_time = None

    def start(self):
        """Start timing; raise TimerError if the timer is already running."""
        already_running = self._start_time is not None
        if already_running:
            raise TimerError("Timer is running. Use .stop() to stop it")
        self._start_time = time.perf_counter()

    def stop(self):
        """Stop timing, print the elapsed seconds and return them.

        Raises TimerError if the timer was never started.
        """
        began_at = self._start_time
        if began_at is None:
            raise TimerError("Timer is not running. Use .start() to start it")

        elapsed_time = time.perf_counter() - began_at
        # Reset so the same instance can be started again.
        self._start_time = None
        print("Total run time: {:0.4f} seconds".format(elapsed_time))
        return elapsed_time
        

def load_genes(genes):
    '''
    Load the gene names from a tab-separated file, upper-cased

    The second column is used when every row has one (10X format);
    otherwise the first column of every row is used (one gene name per row).

    Parameters
    ----------
    genes : str
        Path to a gene file

    Returns
    -------
    numpy.ndarray
        Upper-cased gene names, one entry per row of the file
    '''
    # Read the rows once and close the file deterministically; the previous
    # version opened the file up to twice and never closed either handle.
    with open(genes, newline='') as handle:
        rows = list(csv.reader(handle, delimiter="\t"))

    try:
        names = [row[1].upper() for row in rows]  # 10X format
    except IndexError:
        # At least one row has no second column: fall back to the base
        # format. Only IndexError is expected here, so other errors are no
        # longer swallowed by a bare except.
        names = [row[0].upper() for row in rows]
    return np.array(names)

## Load in the data and create a metadata table
def load_multiple_samples(samples, barcodes, genefile):
    '''
    Load several mtx samples and stack them into one matrix with metadata

    Parameters
    ----------
    samples : dict
        dictionary of sample names and path to matrix file
    barcodes : dict
        dictionary of sample names and path to barcodes file
    genefile : str
        Path to a gene file

    Returns
    -------
    M : sparse matrix
        per-sample matrices stacked horizontally, in sorted sample-name order
    genes : array
        gene names as returned by load_genes
    meta : DataFrame
        one row per loaded barcode with a 'sample' column, indexed by
        '<barcode>_<sample>'
    '''
    # scipy.sparse is not imported at file level (``ss`` used to be an
    # undefined name here), so bring it into scope locally.
    from scipy import sparse as ss

    genes = load_genes(genefile)

    sampleorder = sorted(samples)

    Mlist = []
    metas = []
    for x in sampleorder:
        print('Loading in sample: ' + x)
        currM = sio.mmread(samples[x]).tocsc()
        # First column of the header-less barcodes file.
        currbc = list(pd.read_csv(barcodes[x], header=None)[0])
        Mlist.append(currM)
        currmeta = pd.DataFrame({'sample': [x] * currM.shape[1]})
        # Suffix the sample name so barcodes stay distinguishable per sample.
        currmeta.index = [bc + '_' + x for bc in currbc]
        metas.append(currmeta)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    meta = pd.concat(metas) if metas else pd.DataFrame()

    M = ss.hstack(Mlist)

    return M, genes, meta

def load_consolidated_data(matrixfile, metafile, genefile):
    '''
    Read one consolidated dataset: matrix, gene names and metadata table

    Parameters
    ----------
    matrixfile : string
        path to matrix file (read with mmread, then converted with tocsc)
    metafile : string
        path to metadata file (csv whose first row is the header)
    genefile : str
        Path to a gene file

    Returns
    -------
    tuple
        (M, genes, meta): the matrix, the gene names from load_genes and
        the metadata DataFrame
    '''
    gene_names = load_genes(genefile)
    metadata = pd.read_csv(metafile, header=0)
    matrix = sio.mmread(matrixfile).tocsc()
    return matrix, gene_names, metadata

def normalization(M, meta):
    '''
    Normalize data by dividing each column by its column sum

    Parameters
    ----------
    M : sparse matrix
        gene expression matrix with one column per cell; any scipy sparse
        format is accepted and M itself is left untouched
    meta : table
        metadata table with one row per column of M; a 'transcript_total'
        column is added to it in place

    Returns
    -------
    M2 : sparse matrix (CSC)
        float copy of M whose stored values are divided by their column sum
    meta : table
        the same metadata object, now carrying 'transcript_total'
    '''
    # Work on a CSC copy: the division below relies on indptr delimiting
    # columns, which would be silently wrong for e.g. CSR input.
    M2 = M.tocsc(copy=True)
    sums = np.array(M2.sum(axis=0)).flatten() # compute sums of all columns (cells)
    M2.data = M2.data.astype(float) # convert type from int to float prior to division

    # np.diff(indptr) is the number of stored entries in each column, so
    # repeating every column sum that many times lines it up with M2.data.
    M2.data /= np.repeat(sums, np.diff(M2.indptr))

    # add transcript totals to the metadata table
    meta['transcript_total'] = sums

    return M2, meta

def filter_genes(M, offset):
    '''
    Find indices of genes which have suprapoisson coefficient of variation

    Parameters
    ----------
    M : sparse matrix
        genes-cells matrix (passed to sklearn's mean_variance_axis)
    offset : float (between 1 and 1.3)
        log10(offset) is added to the regression line when filtering genes. 
        Genes above the line are kept
        Values between 1 (no offset) and 1.3 work well

    Returns
    -------
    array
        row indices of M for the genes lying above the shifted regression
        line; a scatter plot of the selection is drawn as a side effect
    '''
    # Neither helper is imported at file level, so this function used to fail
    # with NameError on first use; import them where they are needed.
    from scipy.stats import linregress
    from sklearn.utils.sparsefuncs import mean_variance_axis

    # select genes above the line 
    mean, var = mean_variance_axis(M, axis=1) # get genes means and variances from M
    std = np.sqrt(var) # compute standard deviations from variances
    cv = np.divide(std, mean, out=np.zeros_like(std), where=mean!=0) # compute coefficient of variation

    # indices of genes that are present in more than .1% of the cells
    # (differences of the row pointer = number of stored entries per gene)
    presence = np.diff(M.tocsr().indptr)
    presence_idx = np.where(presence>M.shape[1]*0.001)[0]

    nzidx = np.nonzero(mean)[0] # indices of genes with non-zero mean
    nzidx = np.intersect1d(nzidx, presence_idx) # get intersection of genes with non-zero mean and genes present in more than .1% of the cells

    nzcv = cv[nzidx] # select the matching cvs
    nzmean = mean[nzidx] # select the matching means
    lognzcv = np.log10(nzcv) # log10
    lognzmean = np.log10(nzmean) # log10

    # Only slope and intercept of the fit are used.
    fit = linregress(lognzmean, lognzcv)
    slope, intercept = fit.slope, fit.intercept

    adjusted_intercept = intercept+np.log10(offset) # slide filtering line with offset
    selection_idx = np.where(lognzcv>lognzmean*slope+adjusted_intercept)[0] # get indices of genes above the filtering line
    final_idx = nzidx[selection_idx] # map back to row indices of M

    # Plot the data just to see the selected genes
    plt.figure(figsize=(8,6))
    plt.scatter(lognzmean, lognzcv, s=1) # plot all genes
    plt.scatter(lognzmean[selection_idx], lognzcv[selection_idx], c='maroon', s=1) # plot genes above line with different color
    plt.plot(lognzmean,lognzmean*slope+adjusted_intercept, c='darkorange') # plot filtering line
    plt.xlabel('log10(mean)')
    plt.ylabel('log10(cv)')
    plt.title('%d genes selected' % len(selection_idx))

    return final_idx





# Load Data

In [None]:
import urllib.request
import shutil

# Create the download folder. os.mkdir raises OSError when it cannot create
# the folder; that case is only reported, not treated as fatal.
try:
    os.mkdir('data')
except OSError:
    print("Creation of the directory %s failed" % 'data')
else:
    print("Successfully created the directory %s " % 'data')

# (source URL, local destination) for the three input files, fetched in order.
downloads = (
    ("https://caltech.box.com/shared/static/zywfv43qpeq0jydq80jgas016a3k7pc1.tsv",
     './data/genes.tsv'),
    ("https://caltech.box.com/shared/static/0hlu0xanenjet6klf4zk74quk4o26oyw.csv",
     './data/MM_all_metadata.csv'),
    ("https://caltech.box.com/shared/static/6iam30ygfutxehoy43xuowwhm77y2nxw.mtx",
     './data/MM_all_matrix.mtx'),
)
for url, destination in downloads:
    # Stream the response straight into the destination file.
    with urllib.request.urlopen(url) as response, open(destination, "wb") as out_file:
        shutil.copyfileobj(response, out_file)


In [None]:
# Input files written by the download step.
data_file = './data/MM_all_matrix.mtx'
gene_file = './data/genes.tsv'
label_file = './data/MM_all_metadata.csv'

M, raw_gene, meta = load_consolidated_data(data_file, label_file, gene_file)
M2, meta = normalization(M, meta)
# Re-read the metadata from disk (this discards the column added by normalization).
meta = pd.read_csv(label_file, header=0)

# Keep the positions of genes whose name does not start with RPL, RPS or MT.
# NOTE(review): the 'MT' prefix also matches names such as MTOR — confirm intended.
l = [i for i, name in enumerate(raw_gene)
     if not name.startswith(('RPL', 'RPS', 'MT'))]
gene = raw_gene[l]

# Cells as rows, kept genes as columns, each row scaled to unit L2 norm.
data = normalize(M2.tocsr().transpose()[:, l], axis=1, norm='l2')

# Binary labels: 1 where the "disease" column equals 'MM', 0 otherwise.
raw_target = meta["disease"].values
target = (raw_target == 'MM').astype(np.uint8)
classes = range(len(np.unique(target)))
keys = ['healthy', 'MM']
del M, M2, raw_gene, meta, raw_target

# Shuffled 4/5 train, 1/5 test split over the rows of data.
n_rows = np.shape(data)[0]
n_train = int(n_rows*4/5)
idx = np.arange(n_rows)
random.shuffle(idx)
train_rows, test_rows = idx[:n_train], idx[n_train:]
X_train = data[train_rows, :]
y_train = target[train_rows]
X_test = data[test_rows, :]
y_test = target[test_rows]

print(type(data))
print(np.shape(data), np.shape(target), len(np.unique(target)))
print(np.shape(X_train))
print(np.shape(X_test))
for i in np.unique(target):
    print('class ' + keys[i] + ': ' + str(np.count_nonzero(target == i)))

# Select Genes and Save Results

In [None]:
'''
min-complexity random
'''
# Settings for this run; all four are forwarded to min_complexity below.
num_features = 40
num_samples = 20
init_samples = 20
balance = False

path = 'results/random_40genes_20cells'

# Create the parent folder first, then the per-run folder. os.mkdir raises
# OSError when it cannot create a folder; that case is only reported.
for folder in ('results', path):
    try:
        os.mkdir(folder)
    except OSError:
        print("Creation of the directory %s failed" % folder)
    else:
        print("Successfully created the directory %s " % folder)


if __name__ == '__main__':

    # Time the whole selection run.
    t = Timer()
    t.start()
    (feature_selected, num_samples_list, train_errors, test_errors,
     train_scores, test_scores, step_times) = min_complexity(
        X_train, y_train, X_test, y_test,
        num_features=num_features, num_samples=num_samples,
        init_samples=init_samples, balance=balance)
    elapsed_time = t.stop()

    # Two memory readings, each divided by 1024**2: psutil's rss and ru_maxrss.
    # NOTE(review): ru_maxrss units are platform-dependent (kilobytes on
    # Linux), so the two entries may not share a unit — confirm.
    memorys = [
        psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2,
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 ** 2,
    ]

    # Persist the results, one pickle per quantity, inside the run folder.
    text_create(path, 'feature_selected', feature_selected)
    text_create(path, 'error', train_errors + test_errors)
    text_create(path, 'accuracy', train_scores + test_scores)
    text_create(path, 'num_samples_list', num_samples_list)
    text_create(path, 'genes_name', gene[feature_selected])
    text_create(path, 'elapsed_time', elapsed_time)
    text_create(path, 'memory', memorys)

    # Train and test accuracy curves.
    label_font = {'weight': 'normal', 'size': 18}
    plt.figure(figsize=(8, 8))
    for scores in (train_scores, test_scores):
        plt.plot(scores, linewidth=2)
    plt.legend(['train acc', 'test acc'], prop={'size': 18})
    plt.xlabel('number of genes', fontdict=label_font)
    plt.ylabel('accuracy', fontdict=label_font)
    plt.tick_params(labelsize=18)
    plt.savefig(path + '/acc.pdf', bbox_inches="tight")

# Keep this run's selection; feature_selected is reassigned by later runs.
feature1 = np.copy(feature_selected)

In [None]:
'''
min-complexity balance
'''
# Settings for this run; all four are forwarded to min_complexity below.
num_features = 40
num_samples = 20
init_samples = 20
balance = True

path = 'results/balance_40genes_20cells'

# Create the parent folder first, then the per-run folder. os.mkdir raises
# OSError when it cannot create a folder; that case is only reported.
for folder in ('results', path):
    try:
        os.mkdir(folder)
    except OSError:
        print("Creation of the directory %s failed" % folder)
    else:
        print("Successfully created the directory %s " % folder)


if __name__ == '__main__':

    # Time the whole selection run.
    t = Timer()
    t.start()
    (feature_selected, num_samples_list, train_errors, test_errors,
     train_scores, test_scores, step_times) = min_complexity(
        X_train, y_train, X_test, y_test,
        num_features=num_features, num_samples=num_samples,
        init_samples=init_samples, balance=balance,
        class_weight='balanced')
    elapsed_time = t.stop()

    # Two memory readings, each divided by 1024**2: psutil's rss and ru_maxrss.
    # NOTE(review): ru_maxrss units are platform-dependent (kilobytes on
    # Linux), so the two entries may not share a unit — confirm.
    memorys = [
        psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2,
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 ** 2,
    ]

    # Persist the results, one pickle per quantity, inside the run folder.
    text_create(path, 'feature_selected', feature_selected)
    text_create(path, 'error', train_errors + test_errors)
    text_create(path, 'accuracy', train_scores + test_scores)
    text_create(path, 'num_samples_list', num_samples_list)
    text_create(path, 'genes_name', gene[feature_selected])
    text_create(path, 'elapsed_time', elapsed_time)
    text_create(path, 'memory', memorys)

    # Train and test accuracy curves.
    label_font = {'weight': 'normal', 'size': 18}
    plt.figure(figsize=(8, 8))
    for scores in (train_scores, test_scores):
        plt.plot(scores, linewidth=2)
    plt.legend(['train acc', 'test acc'], prop={'size': 18})
    plt.xlabel('number of genes', fontdict=label_font)
    plt.ylabel('accuracy', fontdict=label_font)
    plt.tick_params(labelsize=18)
    plt.savefig(path + '/acc.pdf', bbox_inches="tight")

# Keep this run's selection; feature_selected is reassigned by later runs.
feature2 = np.copy(feature_selected)


In [None]:
'''
min-cell
'''
# Settings for this run. num_features, num_samples and init_samples are
# forwarded to min_acquisition; balance is set but not passed to it.
num_features = 150
num_samples = 100
init_samples = 100
balance = False

path = 'results/mincell_150genes_100cells'

# Create the parent folder first, then the per-run folder. os.mkdir raises
# OSError when it cannot create a folder; that case is only reported.
for folder in ('results', path):
    try:
        os.mkdir(folder)
    except OSError:
        print("Creation of the directory %s failed" % folder)
    else:
        print("Successfully created the directory %s " % folder)


if __name__ == '__main__':

    # Time the whole selection run.
    t = Timer()
    t.start()
    (feature_selected, num_samples_list, samples_global, train_errors,
     test_errors, train_scores, test_scores, step_times) = min_acquisition(
        X_train, y_train, X_test, y_test,
        num_features=num_features, num_samples=num_samples,
        init_samples=init_samples)
    elapsed_time = t.stop()

    # Two memory readings, each divided by 1024**2: psutil's rss and ru_maxrss.
    # NOTE(review): ru_maxrss units are platform-dependent (kilobytes on
    # Linux), so the two entries may not share a unit — confirm.
    memorys = [
        psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2,
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 ** 2,
    ]

    # Persist the results, one pickle per quantity, inside the run folder.
    text_create(path, 'feature_selected', feature_selected)
    text_create(path, 'error', train_errors + test_errors)
    text_create(path, 'accuracy', train_scores + test_scores)
    text_create(path, 'num_samples_list', num_samples_list)
    text_create(path, 'samples_global', samples_global)
    text_create(path, 'genes_name', gene[feature_selected])
    text_create(path, 'elapsed_time', elapsed_time)
    text_create(path, 'memory', memorys)

    label_font = {'weight': 'normal', 'size': 18}

    # Train and test accuracy curves.
    plt.figure(figsize=(8, 8))
    for scores in (train_scores, test_scores):
        plt.plot(scores, linewidth=2)
    plt.legend(['train acc', 'test acc'], prop={'size': 18})
    plt.xlabel('number of genes', fontdict=label_font)
    plt.ylabel('accuracy', fontdict=label_font)
    plt.tick_params(labelsize=18)
    plt.savefig(path + '/acc.pdf', bbox_inches="tight")

    # Curve of num_samples_list, saved as cells.pdf.
    plt.figure(figsize=(8, 5))
    plt.plot(num_samples_list, linewidth=5)
    plt.xlabel('number of genes', fontdict=label_font)
    plt.ylabel('number of cells acquired', fontdict=label_font)
    plt.tick_params(labelsize=18)
    plt.savefig(path + '/cells.pdf', bbox_inches="tight")

# Keep this run's selection under its own name.
feature3 = np.copy(feature_selected)
    

# Plots

In [None]:
# Dense log1p-transformed copy of data after scaling every value by alpha.
alpha = 1000
data_show = np.log1p(alpha * data).toarray()


In [None]:
# 2-D embedding of the original dataset, taken from the UMAP_1 / UMAP_2
# columns of the metadata file (the figure is still saved as tsne.pdf).
# The metadata file is parsed once instead of once per column.
embedding = pd.read_csv(label_file)
Y1 = embedding["UMAP_1"].values
Y2 = embedding["UMAP_2"].values
print(np.shape(Y1),np.shape(Y2))

# Scatter of the embedding coloured by class, with one colorbar tick per class.
plt.figure(figsize=(12,8))
c_cmap = plt.get_cmap('Dark2', len(classes))
plt.scatter(Y1,Y2,c=target,s=0.5,cmap=c_cmap)
cbar=plt.colorbar(ticks=np.arange(len(classes)))
plt.xticks([])
plt.yticks([])
cbar.ax.set_yticklabels(keys,fontdict={'weight':'normal','size': 28})
plt.savefig(path+'/tsne.pdf',bbox_inches="tight")


In [None]:
# tsne of selected genes: PCA projection of the selected columns of
# data_show, followed by a 2-D t-SNE of that projection
plt.figure(figsize=(12,8))
data_selected=data_show[:,feature_selected]

# PCA cannot return more components than min(n_samples, n_features), so the
# 40 requested components are capped when fewer genes (or cells) are available.
n_components = min(40, *data_selected.shape)
pca=PCA(n_components=n_components,
       copy=True,
       whiten=False,
       svd_solver='auto',
       tol=0.0,
       iterated_power='auto',
       random_state=None)
pcaproj=pca.fit_transform(data_selected)

Y_selected=TSNE(n_components=2).fit_transform(pcaproj)

# Scatter of the t-SNE coordinates coloured by class.
c_cmap = plt.get_cmap('Dark2', len(classes))
plt.scatter(Y_selected[:,0],Y_selected[:,1],c=target,s=0.5,cmap=c_cmap)
cbar=plt.colorbar(ticks=np.arange(len(classes)))
plt.xticks([])
plt.yticks([])
cbar.ax.set_yticklabels(keys,fontdict={'weight':'normal','size': 28})
plt.savefig(path+'/tsne_selected.pdf',bbox_inches="tight")
import pickle
# Save the coordinates next to the figure; the context manager closes the
# file even if pickling fails.
with open(path+'/tsne_selected.pickle','wb') as f:
    pickle.dump(Y_selected,f)

In [None]:
# Expression of every gene picked by any of the three runs, drawn on the
# Y1/Y2 embedding in a grid that is ten panels wide.
features = list(set(feature1) | set(feature2) | set(feature3))
size = len(features)
full_rows = size // 10
plt.figure(figsize=(30, 3 * full_rows + 3))
gs = gridspec.GridSpec(full_rows + 1, 10, width_ratios=[7] * 10)

for panel, gene_idx in enumerate(features):
    # One panel per gene, titled with the gene name and coloured by its
    # values in data_show.
    ax0 = plt.subplot(gs[panel])
    ax0.set_title(gene[gene_idx], fontdict={'weight': 'normal', 'size': 18})
    plt.scatter(Y1, Y2, c=data_show[:, gene_idx], s=0.001)
    ax0.set_xticks([])
    ax0.set_yticks([])
plt.savefig(path + '/gene.png', dpi=300, bbox_inches="tight")