# Import what we need

In [1]:
import sys
import numpy as np
import pandas as pd
import scanpy.api as sc
import anndata as ad
from scipy.sparse import csr_matrix
import logging
import os
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import scipy.stats as stats
import statsmodels.formula.api as smf
import statsmodels.api as sm
import math
import gc
import random

The current input file is /ye/yelabstore3/Richard/PlottingScripts/ez_scanpy/CLUESImmVar_processed.h5ad with a time stamp of 01/03/2019

In [2]:
%matplotlib inline

##################
# Configure file #
##################
sc.settings.verbosity = 2
sc.settings.autoshow = False
##logging.basicConfig(filename="CLUESImmVar_processed.V4.1.log", level=logging.ERROR)
sc.settings.set_figure_params(dpi=100, dpi_save=600, format='png')

plt.rcParams["image.aspect"] = "equal"
##%matplotlib inline

# Read in the data, this could take a while

In [3]:
dirpath = os.getcwd()
name = "CLUESImmVar_nonorm.V4.1"
##processed_path = "/Users/yechun/Downloads/"+name+".h5ad"
processed_path = os.readlink(name+'.h5ad');
# processed_path = os.path.normpath(processed_path)
print(processed_path)
adata = sc.read(processed_path, cache=True)
##adata = sc.read(processed_path)
adata.shape

../../../../../../../Downloads/CLUESImmVar_nonorm.V4.1.h5ad
Only considering the two last: ['.1', '.h5ad'].
Only considering the two last: ['.1', '.h5ad'].


(834096, 32738)

# Make some diagnostic plots

In [4]:
adata.obs['well'] = adata.obs['well'].astype('category')
adata.var_names_make_unique()
logging.info(str('Data structure details: ' + str(adata)))
logging.info('Removing Erythrocytes.')
mat = csr_matrix(adata.X)
mat = mat[:, adata.var_names.isin(['HBB'])].todense()
adata = adata[np.ravel(mat <= 1)]
logging.info(str('Data structure details: ' + str(adata)))
# Extract list of genes                                                                                                                                                          
genelist = adata.var_names.tolist()
# Find mitochondrial genes                                                                                                                                                       
mito_genes_names = [gn for gn in genelist if gn.startswith('MT-')]
logging.info(str('Mito genes: ' + str(mito_genes_names)))
# Find indices of mitochondrial genes                                                                                                                                            
mito_genes = [genelist.index(gn) for gn in mito_genes_names]
# For each cell compute fraction of counts in mito genes vs. all genes                                                                                                           
adata.obs['percent_mito'] = np.ravel(np.sum(adata[:, mito_genes].X, axis=1)) / np.ravel(np.sum(adata.X, axis=1))
# Add the total counts per cell as observations-annotation to adata                                                                                                              
adata.obs['n_counts'] = np.ravel(adata.X.sum(axis=1))
# Clinical variates                                                                                                                                                              
diseasecovpath = 'v2.clinical.data.txt'
clinic_cov = pd.read_csv(diseasecovpath, sep="\t")
# Fix name to make it compatible with clinical variates                                                                                                                          
adata.obs['ind_cov'] = adata.obs['ind_cov'].astype('object')
adata.obs['ind_cov'][adata.obs['ind_cov'].isin(['1221'])] = '1221_1221'
adata.obs['ind_cov'][adata.obs['ind_cov'].isin(['1251'])] = '1251_1251'
adata.obs['ind_cov'][adata.obs['ind_cov'].isin(['1891'])] = '1891_1891'
indlist = np.unique(np.asarray(adata.obs['ind_cov'].tolist()))
# Add SLEDAI scores as covariate                                                                                                                                                 
logging.info('Add SLEDAI scores as a covariate.')
adata.obs['SLEDAI'] = adata.obs['ind_cov'].tolist()
for ii in range(len(indlist)):
    if adata.obs['disease_cov'][adata.obs['ind_cov'] == indlist[ii]][0] == 'healthy':
        adata.obs['SLEDAI'][adata.obs['ind_cov'] == indlist[ii]] = '0'
    else:
        score = clinic_cov['sledaiscore'][clinic_cov['genotypeid'].isin([indlist[ii]])].values.tolist()[0]
        adata.obs['SLEDAI'][adata.obs['ind_cov'] == indlist[ii]] = str(score)

logging.info('Add whether or not sequencing was performed at the Broad Institute as a covariate.')
# Get list of batches                                                                                                                                                            
batch_list = adata.obs['batch_cov'].tolist()
# Preallocate int array w/ size of batch_cov                                                                                                                                     
broad = np.zeros_like(np.asarray(batch_list))
# if immvar is in batch name, assign 1.                                                                                                                                          
for ii in range(len(batch_list)):
    if 'immvar' in batch_list[ii]:
        broad[ii] = 1
    else:
        broad[ii] = 0
# Make obs for Broad indication                                                                                                                                                  
adata.obs['Broad'] = np.asarray(broad, dtype=np.float32)

logging.info('Add gender as a covariate.')
individuals_list = adata.obs['ind_cov'].tolist()
female = np.zeros_like(np.asarray(individuals_list))
for ii in range(len(individuals_list)):
    if 'IGT' in individuals_list[ii]:
        female[ii] = 1
    else:
        if clinic_cov['female'][clinic_cov['genotypeid'].isin([individuals_list[ii]])].values.tolist()[0] == 1:
            female[ii] = 1
        else:
            female[ii] = 0
# Make obs for Broad indication                                                                                                                                                  
adata.obs['Female'] = np.asarray(female, dtype=np.float32)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
logging.info('Filtering cells')
# Filter cells that have more than 10% of counts coming from mitochondrial genes.                                                                                                
adata = adata[adata.obs['percent_mito'] < 0.10]
logging.info(str('Data structure details: ' + str(adata)))
# Filter cells with abnormally low gene counts, high gene counts.                                                                                                                
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_cells(adata, max_genes=2500)
##sc.pp.filter_genes(adata, min_cells=3)                                                                                                                                         
logging.info(str('Data structure details: ' + str(adata)))
logging.info('Saving raw counts')
adata.uns['barcodes'] = adata.obs_names.tolist()
adata.uns['genes'] = adata.var_names.tolist()
adata.uns['raw_counts'] = adata.X
logging.info('Normalizing total counts to 10,000')
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
logging.info('Log transforming data')
sc.pp.log1p(adata)
logging.info('Saving log(counts)+1 in .raw')
adata.raw = adata
# Add platelet genes for purposes of regressing out their signature                                                                                                              
adata.obs['PF4'] = adata.raw[:,"PF4"].X
adata.obs['SDPR'] = adata.raw[:,"SDPR"].X
adata.obs['GNG11'] = adata.raw[:,"GNG11"].X
adata.obs['PPBP'] = adata.raw[:,"PPBP"].X
logging.info('Making .obs into categories')
adata.strings_to_categoricals()
logging.info('Filtering genes')
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var['highly_variable']]


filtered out 1828 cells that have more than  2500 genes expressed


... storing 'ind_cov' as categorical
... storing 'SLEDAI' as categorical


In [6]:
sc.pp.scale(adata, max_value=10)

In [7]:
adata.shape

(819878, 1441)

In [8]:
# Set parameters                                                                                                                                                                 
intialization = 1
n_components = 20
resolution = 1.5
# Run louvain clustering on theoretical future gene expression per cell                                                                                                          
logging.info('Estimating louvain cluster identities for gene expression values.')
sc.pp.pca(adata, random_state=intialization, svd_solver='arpack')
logging.info('PCA complete.')

In [94]:
range(len(adata.obsm.X_pca[1,:]))

range(0, 50)

In [119]:
to_remove = pd.Series()
for ii in range(len(adata.obsm.X_pca[1,:])) :
        print(np.corrcoef(adata.obsm.X_pca[:,ii], adata.obs["PF4"])[0,1])
        if(np.corrcoef(adata.obsm.X_pca[:,ii], adata.obs["PF4"])[0,1] > 0.2):
            adata.obs["PC"+str(ii)] = adata.obsm.X_pca[:,ii];
            to_remove = to_remove.append(pd.Series(["PC"+str(ii)]))


0.0043551612816462425
-0.0395589601939156
0.20890597830186022
0.579017579454677
-0.021620188679262825
-0.06664892388102102
-0.013575431049236926
0.02903849116504073
-0.01623915444221783
-0.007037487899329796
-0.14412965931015334
0.0505164052491921
-0.1885876268246031
0.001604263454384873
-0.13461067179368505
0.014686972343506686
-0.006914694820190294
0.02207047955482304
0.032353061686525145
-0.021350897030112212
0.0019244482146279448
0.0013385091395880014
0.020077449327232714
0.0313720099901557
-0.0043613682837919335
-0.09429075415715553
0.002683958834981519
-0.137769737980755
-0.13634979088351595
0.021091566680917002
0.027503962511749327
-0.05576058362921118
-0.013783504727268152
0.01050430014974276
0.014220158061359057
-0.03905411364785942
0.009826017810805249
-0.022870620548632176
-0.008646063947974871
0.0015313947500116669
-0.0038085694224131174
-0.0161096559936687
-0.030789993922859728
-0.017105646548635087
0.017002356403954207
0.02507583761497021
0.018742055024368903
-0.015351813

In [136]:
to_remove.values

AttributeError: 'numpy.ndarray' object has no attribute 'list'

In [76]:
adata.obs['IFI44'] = adata.raw[:,"IFI44"].X

In [88]:
np.corrcoef(adata.obsm.X_pca[:,2], adata.obs["PF4"])

array([[1.        , 0.20890598],
       [0.20890598, 1.        ]])

In [139]:
to_remove.tolist()

['PC2', 'PC3']

In [137]:
blah = ['n_counts', 'percent_mito', 'Female', 'Broad']

In [138]:
type(blah)

list

In [144]:
blah+to_remove.tolist()

['n_counts', 'percent_mito', 'Female', 'Broad', ['PC2', 'PC3'], 'PC2', 'PC3']

In [141]:
blah

['n_counts', 'percent_mito', 'Female', 'Broad', ['PC2', 'PC3']]