In [33]:
import numpy as np
from sklearn import svm
import pandas as pd
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import random
from scipy.io import mmread
import math
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import os
from matplotlib import gridspec
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import chi2,f_classif,mutual_info_classif,mutual_info_regression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from scipy.stats import chisquare
from scipy.stats import pearsonr
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [14]:
from scipy import io as sio
from scipy import sparse as ss
from scipy import optimize as so
from sklearn.utils.sparsefuncs import mean_variance_axis
from scipy.stats import linregress
import csv
import umap.umap_ as umap
import seaborn as sns
import copy
from sklearn.cluster import KMeans
import pickle
import numpy as np
import pandas as pd


In [15]:
def load_genes(genes):
    '''
    Load the gene names from a tab-delimited gene file.

    Two layouts are supported:

    * 10X format  -- the gene name is the second column of every row.
    * base format -- one gene name per row, in the first column.

    The second column is used when every non-blank row has one; otherwise
    the first column is used for all rows. Names are upper-cased.

    Parameters
    ----------
    genes : str
        Path to a gene file

    Returns
    -------
    numpy.ndarray
        1-D array of upper-cased gene names, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. FileNotFoundError).
    '''
    # Read the file once, inside a context manager, so the handle is always
    # closed. newline='' is what the csv module expects for file objects.
    with open(genes, newline='') as handle:
        # Blank lines parse as empty rows; drop them so a trailing newline
        # does not break either layout.
        rows = [row for row in csv.reader(handle, delimiter="\t") if row]

    try:
        names = [row[1].upper() for row in rows]  # 10X format
    except IndexError:
        # At least one row has a single column: base format with one gene name per row
        names = [row[0].upper() for row in rows]
    return np.array(names)

## Load in the data and create a metadata table
def load_multiple_samples(samples, barcodes, genefile):
    '''
    Load several per-sample mtx files into one matrix plus a metadata table.

    Samples are processed in sorted order of their names. Each matrix is read
    with scipy.io.mmread and converted to CSC, and the matrices are stacked
    horizontally in that order. The metadata table gets one row per matrix
    column, labelled '<barcode>_<sample name>'.

    Parameters
    ----------
    samples : dict
        dictionary of sample names and path to matrix file
    barcodes : dict
        dictionary of sample names and path to barcodes file
    genefile : str
        Path to a gene file

    Returns
    -------
    M : sparse matrix
        Horizontal stack of all sample matrices.
    genes : numpy.ndarray
        Gene names as returned by load_genes.
    meta : pandas.DataFrame
        One row per column of M with a single 'sample' column.

    Raises
    ------
    ValueError
        If `samples` is empty, or if a sample's barcode count does not match
        the number of columns in its matrix.
    '''
    if not samples:
        raise ValueError('samples is empty: nothing to load')

    genes = load_genes(genefile)

    Mlist = []
    metalist = []
    for x in sorted(samples):
        print('Loading in sample: ' + x)
        currM = sio.mmread(samples[x]).tocsc()
        # The barcodes file has no header; its first column holds the barcodes
        currbc = list(pd.read_csv(barcodes[x], header=None)[0])
        if len(currbc) != currM.shape[1]:
            raise ValueError(
                'Sample %s: %d barcodes but %d matrix columns'
                % (x, len(currbc), currM.shape[1])
            )
        Mlist.append(currM)
        metalist.append(pd.DataFrame(
            {'sample': [x] * currM.shape[1]},
            index=[bc + '_' + x for bc in currbc],
        ))

    M = ss.hstack(Mlist)
    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole table on every iteration.
    meta = pd.concat(metalist)

    return M, genes, meta

def load_consolidated_data(matrixfile, metafile, genefile):
    '''
    Load a consolidated dataset: expression matrix, metadata table and genes.

    Parameters
    ----------
    matrixfile : string
        path to the matrix file; read with scipy.io.mmread and converted to CSC
    metafile : string
        path to the metadata CSV file; its first row is used as the header
    genefile : str
        path to a gene file, read with load_genes

    Returns
    -------
    tuple
        (matrix, gene names, metadata table), in that order
    '''
    gene_names = load_genes(genefile)
    metadata = pd.read_csv(metafile, header=0)
    matrix = sio.mmread(matrixfile).tocsc()
    return matrix, gene_names, metadata

def normalize(M, meta):
    '''
    Normalize data by dividing each column by its column sum.

    Note: this definition rebinds the name `normalize`, which the import
    cell at the top of the notebook also imports from sklearn.preprocessing.

    Parameters
    ----------
    M : sparse matrix
        gene expression matrix. A CSC matrix is modified in place; any other
        sparse format is converted to CSC first and the converted matrix is
        the one normalized and returned.
    meta : table
        metadata table; a 'transcript_total' column holding the column sums
        is added to it in place.

    Returns
    -------
    M : sparse matrix
        the column-normalized matrix (float data).
    meta : table
        the metadata table with 'transcript_total' added.
    '''
    # The scaling below relies on indptr delimiting columns, which is only
    # true for CSC storage.
    if M.format != 'csc':
        M = M.tocsc()

    # normalize by dividing by column sum
    sums = np.array(M.sum(axis=0)).flatten() # compute sums of all columns (cells)
    M.data = M.data.astype(float) # convert type from int to float prior to division

    # Columns whose sum is zero are divided by 1 instead, so explicitly stored
    # zeros stay 0 rather than turning into nan.
    divisors = np.where(sums == 0, 1, sums)

    # np.diff(indptr) is the number of stored entries per column; repeating
    # each divisor that many times lines it up with M.data.
    M.data /= np.repeat(divisors, np.diff(M.indptr))

    # add transcript totals to the metadata table
    meta['transcript_total'] = sums

    return M, meta



In [16]:
# Define paths for files
# These are relative paths, so they resolve against the notebook's working directory.
# They are passed to load_consolidated_data in the next cell:
#   matrixfile -> read with scipy.io.mmread
#   metafile   -> read with pandas.read_csv (first row is the header)
#   genefile   -> read with load_genes (tab-delimited)
matrixfile = 'data/consolidated/MM_all_matrix.mtx' 
metafile = 'data/consolidated/MM_all_metadata.csv'
genefile = 'data/consolidated/genes.tsv'

In [17]:
# Load the consolidated dataset: M is the expression matrix (CSC sparse),
# genes the array of upper-cased gene names, meta the metadata table.
M, genes, meta = load_consolidated_data(matrixfile, metafile, genefile)

In [None]:
# normalize data by dividing by transcript sum
# NOTE: normalize() rewrites M.data in place and writes meta['transcript_total'].
# Running this cell a second time would therefore divide the already-normalized
# columns by their new sums and overwrite 'transcript_total' with those sums;
# reload M and meta before re-running.
M, meta = normalize (M, meta)

In [25]:
# Re-read the metadata table from metafile.
# NOTE(review): this replaces the `meta` returned by normalize(), so the
# 'transcript_total' column added there is gone afterwards (unless the CSV
# itself contains one) -- confirm this is intended.
meta = pd.read_csv(metafile, header=0)

### Filter out genes by Poisson

In [34]:
# NOTE(review): filter_genes is not defined in the visible part of this
# notebook; it must be defined or imported before this cell runs on a fresh
# kernel. The meaning of the 1.1 argument is not shown here -- TODO document.
g_idx = filter_genes(M,1.1)

### Now rescale, log and filter the data
# alpha is the factor the selected rows are multiplied by before log1p.
alpha = 10000
# Keep only the rows of M selected by g_idx, scale by alpha, then apply log(1 + x).
M2 = np.log1p(M[g_idx,:]*alpha)
# Gene names matching the rows kept in M2.
genes2 = genes[g_idx]

### Define variables for active feature selection

In [35]:
# `data` is another name for M2 (no copy is made).
data = M2
# Labels are taken from the 'celltype' column of the metadata table;
# swap in 'celltype_2' to use the alternative labelling.
raw_target = meta['celltype'] # or meta['celltype_2']