In [1]:
'''
# Merge the following data packages into consolidated data packages based on assay type.

+ KINOMEscan kinase small molecule binding assay
+ KiNativ kinase small molecule binding assay
+ Fluorescence imaging apoptosis assay
+ Fluorescence imaging cell growth inhibition assay

'''
__author__ = 'Zichen Wang (zichen.wang@mssm.edu)'

import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so the
# process-wide default string encoding can be switched to UTF-8.
# NOTE(review): presumably needed so non-ASCII ReadMe text can be written with a
# plain open() further down -- confirm before removing.
reload(sys)  
sys.setdefaultencoding('utf8')
# print sys.getdefaultencoding()

import os, tarfile, codecs
from collections import OrderedDict
import pandas as pd

def read_manifest(fn):
	"""Load a tab-separated manifest file into a DataFrame.

	The result is indexed by the `datasetid` column, and every cell of the
	`metadata_contents` column is split on tab characters into a list of
	strings.
	"""
	def _split_on_tabs(cell):
		return cell.split('\t')

	manifest = pd.read_csv(fn, sep='\t').set_index('datasetid')
	manifest['metadata_contents'] = manifest['metadata_contents'].map(_split_on_tabs)
	return manifest

def read_data(fn):
	"""Read a delimited text file into a DataFrame.

	The delimiter is chosen from the file name: names ending in 'txt' are
	read as tab-separated, names ending in '.csv' as comma-separated.

	Raises:
		ValueError: if the file name has neither ending (previously this
			surfaced as an UnboundLocalError on `sep`).
	"""
	if fn.endswith('txt'):
		sep = '\t'
	elif fn.endswith('.csv'):
		sep = ','
	else:
		raise ValueError(
			'Unsupported file extension for %r; expected a name ending in txt or .csv' % (fn,))
	return pd.read_csv(fn, sep=sep)

def read_readme(fn):
	"""Parse a ReadMe text file (decoded as UTF-8) into nested OrderedDicts.

	Lines containing '####' are treated as rules that toggle a "section
	header" state. On an opening rule the next line (stripped) becomes the
	section name and is skipped on its own iteration. Within a section, a
	line whose stripped text ends with ':' starts a subsection; every other
	line is appended verbatim to the current subsection. Text that appears
	before the first subsection of a section is stored under the '' key.

	Returns:
		OrderedDict mapping section name -> OrderedDict mapping subsection
		name -> accumulated text.
	"""
	out = OrderedDict()
	with codecs.open(fn, 'r', 'utf-8') as f:
		lines = f.readlines()

	is_section = False
	section = None  # no section seen yet
	subsection = ''
	title_idx = set()  # indices of section-title lines, already consumed
	for i, line in enumerate(lines):
		if i in title_idx:
			continue

		if '####' in line:
			is_section = not is_section
			# Guard against an opening rule on the very last line, which
			# has no title line after it.
			if is_section and i + 1 < len(lines):
				section = lines[i + 1].strip()
				# Start every section with a fresh '' subsection so the
				# previous section's last subsection name does not leak in.
				subsection = ''
				out[section] = OrderedDict()
				out[section][subsection] = ''
				title_idx.add(i + 1)
		elif section is None:
			# Text before the first section header belongs to no section.
			continue
		elif line.strip().endswith(':'):
			subsection = line.strip()
			out[section][subsection] = ''
		else:
			out[section][subsection] += line
	return out

def write_readme(d, fn):
	"""Write a nested section/subsection mapping to `fn` as a ReadMe text file.

	Each section is written as a 64-character '#' rule, the section name,
	another rule and a blank line, followed by every subsection name on its
	own line and then that subsection's text.

	The file is written as UTF-8 through codecs.open, matching how
	read_readme decodes it, so non-ASCII text does not depend on the
	interpreter's default encoding.

	Args:
		d: mapping of section name -> mapping of subsection name -> text.
		fn: path of the file to create or overwrite.
	"""
	rule = '#' * 64 + '\n'
	with codecs.open(fn, 'w', 'utf-8') as out:
		for section in d:
			out.write(rule)
			out.write(section + '\n')
			out.write(rule)
			out.write('\n')
			for subsection in d[section]:
				out.write(subsection + '\n')
				out.write(d[section][subsection])

def merge_readme(meta1, meta2):
	"""Keep only the ReadMe content that two parsed ReadMe mappings share.

	Sections are taken in the order of `meta1`. A section that is equal in
	both inputs is reused as-is. Otherwise its subsections are compared one
	by one: identical text is kept, differing text is replaced by ''.
	Sections or subsections absent from `meta2` are left out of the result
	(a missing section previously raised KeyError).

	Args:
		meta1, meta2: mappings of section -> mapping of subsection -> text.

	Returns:
		OrderedDict with the same nesting.
	"""
	meta = OrderedDict()
	for section in meta1.keys():
		if section not in meta2:
			# Treated like a missing subsection: nothing in common to keep.
			continue
		subsections1 = meta1[section]
		subsections2 = meta2[section]
		if subsections1 == subsections2:
			meta[section] = subsections1
			continue

		merged = OrderedDict()
		for subsection in subsections1.keys():
			if subsection not in subsections2:
				continue
			text = subsections1[subsection]
			merged[subsection] = text if text == subsections2[subsection] else ''
		meta[section] = merged
	return meta

def extract_tgz(fn):
	"""Extract a .tar.gz archive into the current working directory.

	The archive is expected to unpack into a directory named after the part
	of the archive's file name before the first '.'; the contents of that
	directory are returned.

	Args:
		fn: path to the archive.

	Returns:
		List of entry names in the extracted directory (os.listdir order).
	"""
	# Use the base name only: extractall() unpacks relative to the current
	# working directory, and a path such as './x.tar.gz' would otherwise
	# split to an empty directory name.
	dirname = os.path.basename(fn).split('.')[0]
	# The context manager closes the archive handle even if extraction fails.
	# NOTE(review): extractall() trusts member paths inside the archive; only
	# use this on archives from a trusted source.
	with tarfile.open(fn, 'r') as tf:
		tf.extractall()
	return os.listdir(dirname)


In [2]:
def _merge(manifest_fn, dataset_ids_to_exclude=None):
	"""Merge the data files of every dataset listed in a manifest.

	For each dataset id in the manifest, the archive '<id>.tar.gz' is
	extracted, its metadata files and ReadMe.txt are read, and the single
	remaining file is read as the dataset's data table. All data tables are
	concatenated and exact duplicate rows are dropped.

	Args:
		manifest_fn: path to the manifest file (see read_manifest).
		dataset_ids_to_exclude: optional dataset ids (manifest index labels)
			to skip. Defaults to excluding nothing.

	Returns:
		DataFrame with the concatenated, de-duplicated data rows.

	Raises:
		AssertionError: if an archive does not contain exactly one file that
			is neither a listed metadata file nor a readme.
	"""
	manifest = read_manifest(manifest_fn)
	# Drop unwanted rows in manifest
	if dataset_ids_to_exclude is not None:
		manifest = manifest.drop(dataset_ids_to_exclude, axis=0)

	# Unique metadata filenames across all datasets
	unique_metadata_fns = set(
		fn for fns in manifest['metadata_contents'] for fn in fns)

	# Collect frames in lists and concatenate once at the end; appending to
	# a DataFrame inside the loop copies all accumulated rows every time.
	metadata_frames = dict((fn, []) for fn in unique_metadata_fns)
	data_frames = []
	merged_readme = OrderedDict()

	for dataset_id in manifest.index:
		tgz_fn = '%s.tar.gz' % dataset_id
		filenames = extract_tgz(tgz_fn)

		# Read metadata tables
		metadata_fns = manifest.loc[dataset_id]['metadata_contents']
		for metadata_fn in metadata_fns:
			meta_df = read_data('%s/%s' % (dataset_id, metadata_fn))
			metadata_frames[metadata_fn].append(meta_df)

		# The data file is whatever is left once metadata files and the
		# readme are excluded.
		dataset_fns = [
			name for name in filenames
			if name not in unique_metadata_fns and 'readme' not in name.lower()]
		print(dataset_id)

		assert len(dataset_fns) == 1, \
			'Expected exactly one data file in %s, found %r' % (dataset_id, dataset_fns)
		dataset_fn = dataset_fns[0]

		# Read data
		df = read_data('%s/%s' % (dataset_id, dataset_fn))
		print(df.columns)
		data_frames.append(df)

		# Read ReadMe.txt; a ReadMe that is not valid UTF-8 is skipped
		# (best effort), but say so instead of failing silently.
		try:
			readme = read_readme('%s/ReadMe.txt' % dataset_id)
		except UnicodeDecodeError:
			print('Skipping ReadMe.txt of %s: not valid UTF-8' % dataset_id)
		else:
			if len(merged_readme) == 0:
				merged_readme = readme
			else:
				merged_readme = merge_readme(merged_readme, readme)

	# NOTE(review): merged_metadata and merged_readme are built but neither
	# returned nor written anywhere in this function -- confirm whether they
	# should be exported.
	merged_metadata = dict(
		(fn, pd.concat(frames) if frames else pd.DataFrame())
		for fn, frames in metadata_frames.items())

	merged_data_df = pd.concat(data_frames) if data_frames else pd.DataFrame()

	# Drop duplicated rows
	merged_data_df = merged_data_df.drop_duplicates()
	print(merged_data_df.shape)
	return merged_data_df

In [3]:
# KINOMEscan: merge every dataset listed in this manifest into one table
# (expects the '<datasetid>.tar.gz' archives in the working directory),
# then preview the first 10 rows.
merged_data_df = _merge('Manifest_1458120753563.txt')
merged_data_df.head(10)

Unnamed: 0,datarecordID,hmsDatasetID,smCenterCompoundID,smSalt,smCenterSampleID,smLincsID,smName,clName,clCenterSpecificID,ppName,ppLincsID,recordedPlate,recordedWell,controlType,datapointName,datapointUnit,datapointValue
0,50583,20040,10061,101,1,LSM-1061,NU7441,,,AAK1,200001,,,,percentControl,,87
1,50583,20040,10061,101,1,LSM-1061,NU7441,,,AAK1,200001,,,,assayCompoundConcentration,,10
2,50583,20040,10061,101,1,LSM-1061,NU7441,,,AAK1,200001,,,,concUnit,,uM
3,50584,20040,10061,101,1,LSM-1061,NU7441,,,ABL1(E255K)-phosphorylated,200004,,,,percentControl,,100
4,50584,20040,10061,101,1,LSM-1061,NU7441,,,ABL1(E255K)-phosphorylated,200004,,,,assayCompoundConcentration,,10
5,50584,20040,10061,101,1,LSM-1061,NU7441,,,ABL1(E255K)-phosphorylated,200004,,,,concUnit,,uM
6,50585,20040,10061,101,1,LSM-1061,NU7441,,,ABL1(F317I)-nonphosphorylated,200006,,,,percentControl,,100
7,50585,20040,10061,101,1,LSM-1061,NU7441,,,ABL1(F317I)-nonphosphorylated,200006,,,,assayCompoundConcentration,,10
8,50585,20040,10061,101,1,LSM-1061,NU7441,,,ABL1(F317I)-nonphosphorylated,200006,,,,concUnit,,uM
9,50586,20040,10061,101,1,LSM-1061,NU7441,,,ABL1(F317I)-phosphorylated,200007,,,,percentControl,,99


In [4]:
# (rows, columns) of the merged table before any columns are dropped
merged_data_df.shape

(130113, 17)

In [5]:
# Non-null count per column; a count of 0 marks a column with no data at all
merged_data_df.count()

datarecordID          130113
hmsDatasetID          130113
smCenterCompoundID    130113
smSalt                130113
smCenterSampleID      130113
smLincsID             130113
smName                130113
clName                     0
clCenterSpecificID         0
ppName                130113
ppLincsID             130113
recordedPlate              0
recordedWell               0
controlType                0
datapointName         130113
datapointUnit              0
datapointValue        130113
dtype: int64

In [6]:
# drop columns with 0 data: count() is the number of non-null values per
# column, so `count() == 0` selects the columns that are entirely empty
cols_to_drop = merged_data_df.columns[merged_data_df.count() == 0]
merged_data_df = merged_data_df.drop(cols_to_drop, axis=1)
merged_data_df.shape

(130113, 11)

In [7]:
# groupby datarecordID and get the first rows in each group except for datapointValue and datapointName
# (those two columns vary within a record and are pivoted separately below;
# the remaining columns are taken from the first row of each record and
# indexed by datarecordID)
# NOTE(review): assumes the remaining columns are constant within a datarecordID -- confirm
grouped_df_meta = merged_data_df.groupby('datarecordID').head(1)\
    .drop(['datapointName', 'datapointValue'],axis=1)\
    .set_index('datarecordID')
grouped_df_meta.head()

Unnamed: 0_level_0,hmsDatasetID,smCenterCompoundID,smSalt,smCenterSampleID,smLincsID,smName,ppName,ppLincsID
datarecordID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
50583,20040,10061,101,1,LSM-1061,NU7441,AAK1,200001
50584,20040,10061,101,1,LSM-1061,NU7441,ABL1(E255K)-phosphorylated,200004
50585,20040,10061,101,1,LSM-1061,NU7441,ABL1(F317I)-nonphosphorylated,200006
50586,20040,10061,101,1,LSM-1061,NU7441,ABL1(F317I)-phosphorylated,200007
50587,20040,10061,101,1,LSM-1061,NU7441,ABL1(F317L)-nonphosphorylated,200009


In [8]:
# one row per datarecordID
grouped_df_meta.shape

(43371, 8)

In [9]:
# pivot_table to get datapointName on the columns
# (long -> wide: one row per datarecordID, one column per datapointName)
# NOTE(review): the identity aggfunc passes values through unaggregated, which
# presumably relies on exactly one value per (datarecordID, datapointName)
# pair -- confirm. Because string values such as concUnit are pivoted together
# with the numeric ones, the resulting columns come out as object dtype.
grouped_df = pd.pivot_table(merged_data_df[['datarecordID', 'datapointName', 'datapointValue']],
                            values='datapointValue', 
                            index=['datarecordID'], 
                            columns=['datapointName'],
                            aggfunc=lambda x: x,
                            dropna=False)

grouped_df.head()

Unnamed: 0_level_0,assayCompoundConcentration,concUnit,percentControl
datarecordID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
41743,10,uM,100.0
41744,10,uM,47.0
41745,10,uM,5.6
41746,10,uM,41.0
41747,10,uM,5.4


In [10]:
# should have the same number of rows as grouped_df_meta
grouped_df.shape

(43371, 3)

In [11]:
# non-null count per pivoted column; equal to the row count means no record is missing a datapoint
grouped_df.count()

assayCompoundConcentration    43371
concUnit                      43371
percentControl                43371
dtype: int64

In [12]:
# join grouped_df and grouped_df_meta on datarecordID
# (inner join on the index of both frames; grouped_df is overwritten with the joined table)
grouped_df = grouped_df.merge(grouped_df_meta, left_index=True, right_index=True, how='inner')
# export this 
# NOTE(review): the output directory is presumably expected to exist already -- confirm
grouped_df.to_csv('KINOMEscan_kinase_small_molecule_binding_assay/HMS_LINCS-KINOMEscan_kinase_small_molecule_binding_assay.csv')
grouped_df.shape

(43371, 11)

In [13]:
## create experimentID to uniquely identify each experiment
## (string of the form '<hmsDatasetID>-<smCenterCompoundID>-<assayCompoundConcentration>',
## built row by row by joining the three values with '-')
grouped_df['experimentID'] = grouped_df[[
        'hmsDatasetID',        
        'smCenterCompoundID',        
        'assayCompoundConcentration']].apply(lambda x: '-'.join(map(str, x)), axis=1)


In [14]:
# check dtypes
# (the recorded output shows the three pivoted columns, including
# percentControl, as object rather than numeric)
grouped_df.dtypes

assayCompoundConcentration    object
concUnit                      object
percentControl                object
hmsDatasetID                   int64
smCenterCompoundID             int64
smSalt                         int64
smCenterSampleID               int64
smLincsID                     object
smName                        object
ppName                        object
ppLincsID                      int64
experimentID                  object
dtype: object

In [15]:
# separate row meta (kinases) and column meta (experiments)
# row meta: unique (ppName, ppLincsID) pairs, indexed and sorted by ppLincsID
row_meta_df = grouped_df.reset_index()[['ppName', 'ppLincsID']].drop_duplicates().set_index('ppLincsID').sort_index()
# column meta: unique combinations of the experiment-level columns, indexed and sorted by experimentID
col_meta_df = grouped_df.reset_index()[['experimentID', 'assayCompoundConcentration', 'concUnit', 
                          'hmsDatasetID', 'smCenterCompoundID', 'smSalt',
                         'smCenterSampleID', 'smLincsID', 'smName']].drop_duplicates().set_index('experimentID').sort_index()
row_meta_df.head()

Unnamed: 0_level_0,ppName
ppLincsID,Unnamed: 1_level_1
200001,AAK1
200002,ABL1
200003,ABL1(E255K)
200004,ABL1(E255K)-phosphorylated
200005,ABL1(F317I)


In [16]:
# number of unique (ppName, ppLincsID) pairs
row_meta_df.shape

(484, 1)

In [17]:
# distinct values per row-meta column
# NOTE(review): the recorded output has 483 distinct ppName values for 484 rows,
# i.e. one ppName appears under two different ppLincsID values
row_meta_df.apply(lambda x: x.nunique())

ppName    483
dtype: int64

In [18]:
# distinct values per column-meta column
col_meta_df.apply(lambda x: x.nunique())

assayCompoundConcentration      2
concUnit                        1
hmsDatasetID                  102
smCenterCompoundID            102
smSalt                          3
smCenterSampleID                1
smLincsID                     102
smName                        102
dtype: int64

In [19]:
# number of experiments (one more than the number of distinct hmsDatasetID
# values in the recorded output, since one dataset was run at two concentrations)
col_meta_df.shape

(103, 8)

In [20]:
# preview the experiment (column) metadata
col_meta_df.head()

Unnamed: 0_level_0,assayCompoundConcentration,concUnit,hmsDatasetID,smCenterCompoundID,smSalt,smCenterSampleID,smLincsID,smName
experimentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20020-10008-10,10,uM,20020,10008,101,1,LSM-1008,Sorafenib
20021-10017-10,10,uM,20021,10017,101,1,LSM-6015,HG-6-64-01
20022-10029-10,10,uM,20022,10029,101,1,LSM-1029,GW-5074
20023-10046-10,10,uM,20023,10046,101,1,LSM-1046,SB590885
20024-10049-10,10,uM,20024,10049,101,1,LSM-1049,PLX-4720


In [21]:
# pivot_table to make a matrix of percentControl values
# (rows: ppLincsID, columns: experimentID; pairs with no measurement stay NaN)
# NOTE(review): exact duplicate (percentControl, ppLincsID, experimentID) rows are
# removed first, and the identity aggfunc presumably relies on at most one value
# remaining per cell -- confirm.
value_matrix = pd.pivot_table(grouped_df.reset_index()[['percentControl', 'ppLincsID', 'experimentID']].drop_duplicates(),
                            values='percentControl', 
                            columns='experimentID',
                            index='ppLincsID', 
                            aggfunc=lambda x: x,
                            dropna=False)

value_matrix.head()

Unnamed: 0_level_0,20020-10008-10,20021-10017-10,20022-10029-10,20023-10046-10,20024-10049-10,20025-10050-10,20026-10068-10,20027-10006-10,20028-10009-10,20029-10010-10,...,20211-10356-1,20211-10356-10,20220-10053-1,20221-10105-1,20222-10129-1,20223-10171-1,20224-10183-1,20225-10212-1,20227-10354-10,20228-10364-10
ppLincsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200001,100.0,26.0,5.2,76.0,84.0,100.0,100.0,1.3,6.9,100.0,...,61.0,13.0,100.0,100.0,92.0,100.0,76.0,100.0,7.6,3.4
200002,,,,,,,,,,,...,,,100.0,100.0,,100.0,,100.0,,
200003,,,,,,,,,,,...,,,100.0,100.0,,100.0,,100.0,,
200004,47.0,1.0,54.0,8.6,46.0,3.4,58.0,0.0,0.4,95.0,...,81.0,85.0,,,65.0,,67.0,,6.0,19.0
200005,,,,,,,,,,,...,,,100.0,100.0,,100.0,,100.0,,


In [22]:
# (kinases, experiments); should match the row counts of row_meta_df and col_meta_df
value_matrix.shape

(484, 103)

In [23]:
# number of percentControl values before pivoting into the matrix
grouped_df['percentControl'].count()

43371

In [24]:
# number of non-null cells in the matrix
# NOTE(review): the recorded output (43370) is one less than the 43371
# percentControl values counted in grouped_df -- one value is lost between the
# drop_duplicates() and the pivot; investigate before relying on the matrix.
value_matrix.count().sum()

43370

In [25]:
## make gct object
## NOTE(review): the path below is an absolute path on the author's machine and
## must be changed to a local l1ktools checkout for this cell to run elsewhere.
sys.path.append('/Users/zichen/Documents/GitHub/l1ktools/python')
import cmap.io.gct as gct
g = gct.GCT()
# print the three shapes so they can be compared before the GCT object is built
print value_matrix.shape
print row_meta_df.shape
print col_meta_df.shape
# presumably rdesc/cdesc are the row and column annotation tables -- confirm against l1ktools
g.build_from_DataFrame(value_matrix, rdesc=row_meta_df, cdesc=col_meta_df)


In [26]:
## export to gct and gctx
## NOTE(review): only the mode='gct' export is performed here; no gctx file is written in this cell
g.write('KINOMEscan_kinase_small_molecule_binding_assay/HMS_LINCS-KINOMEscan_kinase_small_molecule_binding_assay'
        , mode='gct')