This is an example notebook walking through the construction of the atlas.

In [1]:
import gc
import os

import numpy as np
import pandas as pd
import scipy
import sklearn
import sklearn.decomposition

import functions
from general_processing.processing_functions import convert_symbols_to_ensembl, transform_to_percentile

In [2]:
# Load the colour table and key it by its 'Sample Source' column.
_colour_table = pd.read_csv(
    '/Users/pwangel/Data/Metadata_dumps/blood_atlas_colours.tsv', sep='\t'
).set_index('Sample Source')

# Map every index label to the value held in the first remaining column of its row.
blood_atlas_colours = dict(zip(_colour_table.index.values, _colour_table.values[:, 0]))

Reading in the data, including Nadia's annotations (an Excel spreadsheet with multiple tabs).

In [2]:
def _read_indexed_tsv(path):
    """Read a tab-separated file, using its first column as the row index."""
    return pd.read_csv(path, sep='\t', index_col=0)


# Atlas expression table and its annotation table.
data = _read_indexed_tsv('/Users/pwangel/Downloads/blood_atlas_expression_v7.1.tsv')
annotations = _read_indexed_tsv('/Users/pwangel/PlotlyWorkspace/combine_data/blood/outputs_for_front_end/blood_atlas_annotations.tsv')
# Disabled: reading a gene table from the myeloid atlas download.
#genes = _read_indexed_tsv('/Users/pwangel/Downloads/myeloid_atlas_genes.tsv')

# External DC expression table and its sample annotations.
ext_data = _read_indexed_tsv('/Users/pwangel/Downloads/DC_expression_matrix.txt')
ext_annotations = _read_indexed_tsv('/Users/pwangel/Downloads/dc_atlas_samples.txt')

In [3]:
# Keep exactly the annotation rows named by the columns of `data`, in column order
# (raises KeyError if a column has no annotation row).
annotations = annotations.loc[data.columns, :]

In [4]:
# Show both column sets so the naming differences between the two tables are visible.
for frame in (ext_annotations, annotations):
    print(frame.columns)

# Give the external table the same column names the atlas annotations use.
column_renames = {"Platform Category": "Platform_Category", "Cell Type": "celltype"}
ext_annotations.rename(columns=column_renames, inplace=True)

Index(['Cell Type', 'Activation Status', 'Sample Source', 'Tissue Type',
       'Platform Category'],
      dtype='object')
Index(['Dataset', 'dataset_name', 'chip_id', 'replicate_group_id', 'organism',
       'sample_type', 'generic_sample_type', 'final_cell_type',
       'parental_cell_type', 'sex', 'labelling', 'name', 'description',
       'number of samples', 'sample types', 'platform', 'project', 'species',
       'Detection threshold', 'Platform_Category', 'Platform_Colour',
       'Platform_Symbol', 'display_metadata', 'tier1', 'tier2', 'tier3',
       'celltype', 'Blood_Cluster'],
      dtype='object')


In [5]:
def _to_atlas_sample_id(raw_id):
    """Turn '<prefix>_<rest>' into '<rest>;<prefix>', splitting at the first underscore."""
    prefix, _, rest = raw_id.partition("_")
    return rest + ";" + prefix


# Rewrite the external sample ids, then relabel the expression columns with them.
# The relabelling is positional: it relies on ext_data's columns being in the same
# order as ext_annotations' rows.
ext_annotations.index = [_to_atlas_sample_id(raw_id) for raw_id in ext_annotations.index.values]
ext_data.columns = ext_annotations.index

# Keep only the external samples whose id is not already in the atlas annotations.
new_sample_ids = np.setdiff1d(ext_annotations.index, annotations.index)
ext_annotations = ext_annotations.loc[new_sample_ids]
ext_data = ext_data.loc[:, ext_annotations.index]

# Sanity check: the number of ids shared by both tables (expected to be zero).
shared_ids = np.intersect1d(annotations.index.values, ext_annotations.index.values)
print(shared_ids.shape)

ext_annotations.to_csv('/Users/pwangel/Downloads/new_dc_blood.tsv', sep='\t')

(0,)


In [47]:
# Display the external annotation table for inspection.
ext_annotations

Unnamed: 0,celltype,Activation Status,Sample Source,Tissue Type,Platform_Category
GSM1217747;2494,CD141+ dendritic cell,normal,in vivo,small intestine,HuGene
GSM1217748;2494,CD141+ dendritic cell,normal,in vivo,small intestine,HuGene
GSM1217749;2494,CD141+ dendritic cell,normal,in vivo,small intestine,HuGene
GSM1217750;2494,CD1c+ dendritic cell,normal,in vivo,small intestine,HuGene
GSM1217751;2494,CD1c+ dendritic cell,normal,in vivo,small intestine,HuGene
...,...,...,...,...,...
GSM868917;6638,CD1c+ dendritic cell,normal,in vivo,skin,Illumina V4
GSM868922;6638,CD141+ dendritic cell,normal,in vivo,skin,Illumina V4
GSM868923;6638,CD141+ dendritic cell,normal,in vivo,skin,Illumina V4
GSM868924;6638,CD141+ dendritic cell,normal,in vivo,skin,Illumina V4


In [48]:
# Append the external samples (columns) to the atlas table. The join is on the row
# index with how='inner', so only rows present in both tables are kept.
data = data.merge(ext_data, how='inner', left_index=True, right_index=True)

# Stack the external annotation rows under the atlas ones. The two tables do not have
# the same columns, so `sort` is passed explicitly; leaving it unset makes pandas emit
# the "Sorting because non-concatenation axis is not aligned" FutureWarning.
# sort=False keeps the atlas column order and appends any new columns after it.
annotations = pd.concat([annotations, ext_annotations], sort=False)

# Later cells pair the columns of `data` with the rows of `annotations` by position.
# This also fails loudly if the cell is run twice (the second merge would suffix the
# duplicated column names).
assert annotations.index.equals(data.columns), \
    "annotations rows are not aligned with data columns - restart the kernel and run all cells"


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [49]:
# Report the table dimensions, annotations first, one per line.
print(annotations.shape, data.shape, sep='\n')

(1057, 31)
(13087, 1057)


In [50]:
# Replace `data` with the output of transform_to_percentile (helper imported from
# general_processing.processing_functions; presumably converts the values to
# percentiles - confirm against its definition).
# NOTE: this overwrites `data`, so re-running the cell applies the helper again.
data = transform_to_percentile(data)

The gene variance fraction only needs to be computed if that has not been done already; the cell below reads a previously calculated version into the `genes` dataframe.

In [51]:
# Location of the cached gene table (includes the Platform_VarFraction column used later).
GENES_CACHE = '/Users/pwangel/Downloads/temp_ext_blood_atlas_genes.tsv'

# Compute and store the table only when no cached copy exists, instead of toggling
# commented-out code by hand. Delete the file to force a recomputation, e.g. after
# the samples in `data` / `annotations` change.
if not os.path.exists(GENES_CACHE):
    functions.calculate_platform_dependence(data, annotations).to_csv(GENES_CACHE, sep='\t')

# Always load from the file so `genes` is the same whether or not it was just computed.
genes = pd.read_csv(GENES_CACHE, sep='\t', index_col=0)

In [52]:
# Rows of `data` whose Platform_VarFraction is above this cutoff are left out of the PCA.
PLATFORM_VARFRACTION_CUTOFF = 0.2

# The mask is a plain array, so it is applied to `data` by position, not by label.
# NOTE(review): this assumes the rows of `genes` are in the same order as the rows
# of `data` - confirm.
low_platform_genes = genes.Platform_VarFraction.values <= PLATFORM_VARFRACTION_CUTOFF

# Build the PCA input once (previously the same filtered, re-transformed matrix was
# computed separately for fit and for transform). It is transposed so that the
# columns of `data` become the rows handed to the PCA.
pca_input = transform_to_percentile(data.loc[low_platform_genes]).transpose()

# Fit the 10-component PCA and project the same matrix in one step.
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
pca_coords = pca.fit_transform(pca_input)

Plot the PCA.

In [54]:
# Plot the PCA coordinates using three annotation columns as labels; the out_file
# argument points at an HTML file.
functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=['celltype', 'Platform_Category', 'Dataset'],
    colour_dict=blood_atlas_colours,
    out_file='/Users/pwangel/Downloads/blood_atlas_with_ext_dc.html',
)

In [7]:
# Project helper; going by its name it plots the distribution of per-gene platform
# dependence - confirm against functions.py.
functions.plot_gene_platform_dependence_distribution(data, annotations, genes)

Make a graph of the threshold-lowering process using the Kruskal-Wallis H test.

In [None]:
# Project helper; its recorded output reports the number of genes analysed at each
# threshold from 0.02 upwards. Presumably slow for large tables - confirm.
functions.plot_KW_Htest(data, annotations, genes)

Assessing platform dependence for principal components with varying threshold.
Analysing threshold of 0.020000 (71 genes)
Analysing threshold of 0.040000 (279 genes)
Analysing threshold of 0.060000 (533 genes)
Analysing threshold of 0.080000 (878 genes)
Analysing threshold of 0.100000 (1253 genes)
Analysing threshold of 0.120000 (1690 genes)
Analysing threshold of 0.140000 (2179 genes)
Analysing threshold of 0.160000 (2648 genes)
