In [1]:
import numpy as np
import pandas as pd

import sys
import re

from os.path import expanduser
sys.path.append(expanduser('~') + '/Lab/Utils/Python/')

from Conversions.translate import *
from Strings.is_a import *

# Initial setup

In [2]:
paper_pmid = 24360837
paper_name = 'hoepfner_movva_2014' 

In [3]:
# Load the list of datasets for this paper (tab-separated file without a header row).
# NOTE(review): the first column is labelled 'pmid', but later cells reindex this table
# by the data's dataset IDs and expose its index as 'dataset_id' -- it presumably holds
# dataset IDs rather than PubMed IDs; confirm.
datasets = pd.read_csv('extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt', sep='\t', header=None, names=['pmid', 'name'])

In [4]:
datasets.set_index('pmid', inplace=True)

# Load & process the data - Benomyl

In [5]:
original_data1 = pd.read_csv('large_files/raw_data/HOP_scores-benomyl.txt', sep='\t')
original_data2 = pd.read_csv('large_files/raw_data/HIP_scores-benomyl.txt', sep='\t')

In [6]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 195
Original data dimensions: 6681 x 189


In [7]:
# Keep the sensitivity scores and drop the z-scores (the z-scores normalize each strain's phenotype against its phenotypes in all other compounds in the dataset)

In [8]:
def _without_zscore(labels):
    """Return, in their original order, the labels that do not contain 'z-score'."""
    return [label for label in labels if 'z-score' not in label]


# Column labels to keep for the HOP and HIP tables, respectively
cols1 = _without_zscore(original_data1.columns.values)
cols2 = _without_zscore(original_data2.columns.values)

In [9]:
original_data1 = original_data1.loc[:, cols1]
original_data2 = original_data2.loc[:, cols2]

In [10]:
orf_col = 'Systematic Name'

In [11]:
original_data1[orf_col] = original_data1[orf_col].astype(str)
original_data2[orf_col] = original_data2[orf_col].astype(str)

In [12]:
# Eliminate all white spaces & capitalize
original_data1[orf_col] = clean_orf(original_data1[orf_col])
original_data2[orf_col] = clean_orf(original_data2[orf_col])

In [13]:
# Translate to ORFs 
original_data1['orfs'] = translate_sc(original_data1[orf_col], to='orf')
original_data2['orfs'] = translate_sc(original_data2[orf_col], to='orf')

In [14]:
original_data1.loc[original_data1['orfs'] == 'YBR160WAS','orfs'] = 'YBR160W'
original_data2.loc[original_data2['orfs'] == 'YBR160WAS','orfs'] = 'YBR160W'

In [15]:
# Make sure everything translated ok: print the HOP rows whose translated name
# does not look like an ORF.
# NOTE: `t` is reassigned by the equivalent HIP check in the next cell, so after
# both checks have run it only holds the HIP mask.
t = looks_like_orf(original_data1['orfs'])
print(original_data1.loc[~t,])

            Systematic Name  Ad. scores for Exp. 991_26.84_HOP_0018A  \
index_input                                                            
28                   R0010W                                      NaN   
29                   R0020C                                      NaN   
30                   R0030W                                      NaN   
31                   R0040C                                      NaN   

             Ad. scores for Exp. 991_26.84_HOP_0018B  \
index_input                                            
28                                               NaN   
29                                               NaN   
30                                               NaN   
31                                               NaN   

             Ad. scores for Exp. 991_26.84_HOP_0019A  \
index_input                                            
28                                               NaN   
29                                               NaN   
30    

In [16]:
# Make sure everything translated ok
t = looks_like_orf(original_data2['orfs'])
print(original_data2.loc[~t,])

            Systematic Name  Ad. scores for Exp. 991_26.84_HIP_0018A  \
index_input                                                            
28                   R0010W                                      NaN   
29                   R0020C                                      NaN   
30                   R0030W                                      NaN   
31                   R0040C                                      NaN   

             MADL scores for Exp. 991_26.84_HIP_0018B  \
index_input                                             
28                                                NaN   
29                                                NaN   
30                                                NaN   
31                                                NaN   

             Ad. scores for Exp. 991_26.84_HIP_0019A  \
index_input                                            
28                                               NaN   
29                                               NaN   


In [17]:
# Keep only the rows whose translated name looks like an ORF.
# Each table is filtered with a mask computed from its own 'orfs' column: the shared
# variable `t` only holds the HIP mask at this point (the HIP check overwrote the HOP
# one), so reusing it for the HOP table is only correct if both tables happen to have
# identical row order and identical invalid rows.
original_data1 = original_data1.loc[looks_like_orf(original_data1['orfs']), :]
original_data2 = original_data2.loc[looks_like_orf(original_data2['orfs']), :]

In [18]:
original_data1.set_index('orfs', inplace=True)
original_data2.set_index('orfs', inplace=True)

In [19]:
# Collapse all score columns into a single per-strain average stored in 'data'.
# The text column 'Systematic Name' has not been dropped from these frames, so the
# mean is explicitly restricted to numeric columns: pandas >= 2.0 no longer skips
# non-numeric columns silently and raises a TypeError instead.
original_data1['data'] = original_data1.mean(axis=1, numeric_only=True)
original_data2['data'] = original_data2.mean(axis=1, numeric_only=True)

In [20]:
original_data = original_data1[['data']].join(original_data2[['data']], how='outer', lsuffix='_hop', rsuffix='_hip')

In [21]:
dataset_ids = [1087, 16622]

In [22]:
data_benomyl = original_data[['data_hop','data_hip']].copy()

In [23]:
data_benomyl.columns = dataset_ids

In [24]:
data_benomyl = data_benomyl.groupby(data_benomyl.index).mean()

In [25]:
# Create row index
data_benomyl.index.name='orf'

In [26]:
num_vals = data_benomyl.notnull().sum(axis=1)

In [27]:
data_benomyl = data_benomyl.loc[num_vals > 0]

In [28]:
print('Final data dimensions: %d x %d' % (data_benomyl.shape))

Final data dimensions: 5867 x 2


In [29]:
data_benomyl.head()

Unnamed: 0_level_0,1087,16622
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
YAL001C,0.153577,-0.039887
YAL002W,-0.093247,0.261198
YAL003W,,0.126743
YAL004W,0.195749,0.303275
YAL005C,0.105597,-1.476444


# Load & process the data - all other compounds

In [30]:
original_data1 = pd.read_csv('large_files/raw_data/HOP_scores.txt', sep='\t')
original_data2 = pd.read_csv('large_files/raw_data/HIP_scores.txt', sep='\t')

In [31]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 5847
Original data dimensions: 6681 x 5913


In [32]:
# Keep every column whose label does not contain 'z-score' (HOP table, then HIP table)
cols1 = list(filter(lambda label: 'z-score' not in label, original_data1.columns.values))
cols2 = list(filter(lambda label: 'z-score' not in label, original_data2.columns.values))

In [33]:
original_data1 = original_data1.loc[:, cols1]
original_data2 = original_data2.loc[:, cols2]

In [34]:
# Shapes of the HOP and HIP tables after the z-score columns were removed
# (the printed label still reads "Original data dimensions").
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 2924
Original data dimensions: 6681 x 2957


In [35]:
orf_col = 'Systematic Name'

In [36]:
original_data1[orf_col] = original_data1[orf_col].astype(str)
original_data2[orf_col] = original_data2[orf_col].astype(str)

In [37]:
# Eliminate all white spaces & capitalize
original_data1[orf_col] = clean_orf(original_data1[orf_col])
original_data2[orf_col] = clean_orf(original_data2[orf_col])

In [38]:
# Translate to ORFs 
original_data1['orfs'] = translate_sc(original_data1[orf_col], to='orf')
original_data2['orfs'] = translate_sc(original_data2[orf_col], to='orf')

In [39]:
original_data1.loc[original_data1['orfs'] == 'YBR160WAS','orfs'] = 'YBR160W'
original_data2.loc[original_data2['orfs'] == 'YBR160WAS','orfs'] = 'YBR160W'

In [40]:
# Make sure everything translated ok
t = looks_like_orf(original_data1['orfs'])
print(original_data1.loc[~t,])

            Systematic Name  Ad. scores for Exp. 2_200_HOP_0020A  \
index_input                                                        
28                   R0010W                                  NaN   
29                   R0020C                                  NaN   
30                   R0030W                                  NaN   
31                   R0040C                                  NaN   

             Ad. scores for Exp. 3_50_HOP_0078  \
index_input                                      
28                                         NaN   
29                                         NaN   
30                                         NaN   
31                                         NaN   

             Ad. scores for Exp. 6_173.545_HOP_0090  \
index_input                                           
28                                              NaN   
29                                              NaN   
30                                              NaN   
31             

In [41]:
# Make sure everything translated ok
t = looks_like_orf(original_data2['orfs'])
print(original_data2.loc[~t,])

            Systematic Name  Ad. scores for Exp. 2_200_HIP_0020A  \
index_input                                                        
28                   R0010W                                  NaN   
29                   R0020C                                  NaN   
30                   R0030W                                  NaN   
31                   R0040C                                  NaN   

             Ad. scores for Exp. 3_50_HIP_0077  \
index_input                                      
28                                         NaN   
29                                         NaN   
30                                         NaN   
31                                         NaN   

             Ad. scores for Exp. 6_173.545_HIP_0089  \
index_input                                           
28                                              NaN   
29                                              NaN   
30                                              NaN   
31             

In [42]:
# Keep only the rows whose translated name looks like an ORF.
# Each table is filtered with a mask computed from its own 'orfs' column: the shared
# variable `t` only holds the HIP mask at this point (the HIP check overwrote the HOP
# one), so reusing it for the HOP table is only correct if both tables happen to have
# identical row order and identical invalid rows.
original_data1 = original_data1.loc[looks_like_orf(original_data1['orfs']), :]
original_data2 = original_data2.loc[looks_like_orf(original_data2['orfs']), :]

In [43]:
original_data1.set_index('orfs', inplace=True)
original_data2.set_index('orfs', inplace=True)

In [44]:
original_data1.drop(columns=['Systematic Name'], inplace=True)

In [45]:
original_data2.drop(columns=['Systematic Name'], inplace=True)

### Map dataset IDs to data columns

In [46]:
compound_map = pd.read_csv('extras/type_cmb_dose_dataset.txt', sep='\t')

In [47]:
compound_map.loc[compound_map.loc[:,'Dataset HOP']==1226]

Unnamed: 0,Type,CMB,Dose,Dataset HOP,Dataset HIP
188,Ad.,1084,0.0005,1226,12043


In [48]:
# Resolve every HOP score column to a dataset ID through the compound/dose table.
# Column labels look like "<type> scores for Exp. <CMB>_<dose>_HOP_<run>", so after
# splitting on spaces and underscores, token 4 is the compound number and token 5 the dose.
# NOTE(review): the score type (e.g. "Ad." vs "MADL") is not part of the lookup even
# though compound_map has a 'Type' column -- confirm that this is intended.

# Doses are compared at 4-decimal precision; the rounded table column does not depend
# on the score column being processed, so it is computed once instead of per iteration.
rounded_doses = round(compound_map['Dose'], 4)

dt_ids = []
for s in original_data1.columns.values:
    s_parts = re.split(' |_', s)
    cmb = int(s_parts[4])
    dose = float(s_parts[5])

    dt = compound_map.loc[(compound_map['CMB'] == cmb) & (rounded_doses == round(dose, 4))]
    if dt.shape[0] > 0:
        # First matching row wins
        dataset_id = dt['Dataset HOP'].values[0]
    else:
        # No match: flagged with NaN so that the column can be discarded afterwards
        dataset_id = np.nan

    dt_ids.append(dataset_id)


In [49]:
# Discard the HOP score columns for which no dataset ID was found (NaN entries in dt_ids)
unmapped = np.isnan(np.array(dt_ids))
t = original_data1.drop(columns=original_data1.columns[unmapped])

In [50]:
dt_ids = np.array(dt_ids)[~np.isnan(np.array(dt_ids))]

In [51]:
dt_ids = dt_ids.astype(int)

In [52]:
t.columns = dt_ids

In [53]:
# Columns that share a dataset ID are replicates: group the transposed frame on its
# index (the dataset IDs), average each group, and transpose back
t_by_dataset = t.T
t = t_by_dataset.groupby(t_by_dataset.index).mean().T

In [54]:
t.shape

(6677, 245)

In [55]:
original_data1 = t.copy()

In [56]:
# Resolve every HIP score column to a dataset ID through the compound/dose table.
# Column labels look like "<type> scores for Exp. <CMB>_<dose>_HIP_<run>", so after
# splitting on spaces and underscores, token 4 is the compound number and token 5 the dose.
# NOTE(review): the score type (e.g. "Ad." vs "MADL") is not part of the lookup even
# though compound_map has a 'Type' column -- confirm that this is intended.

# Doses are compared at 4-decimal precision; the rounded table column does not depend
# on the score column being processed, so it is computed once instead of per iteration.
rounded_doses = round(compound_map['Dose'], 4)

dt_ids = []
for s in original_data2.columns.values:
    s_parts = re.split(' |_', s)
    cmb = int(s_parts[4])
    dose = float(s_parts[5])

    dt = compound_map.loc[(compound_map['CMB'] == cmb) & (rounded_doses == round(dose, 4))]
    if dt.shape[0] > 0:
        # First matching row wins
        dataset_id = dt['Dataset HIP'].values[0]
    else:
        # No match: flagged with NaN so that the column can be discarded afterwards
        dataset_id = np.nan

    dt_ids.append(dataset_id)


In [57]:
# Discard the HIP score columns for which no dataset ID was found (NaN entries in dt_ids)
unmapped = np.isnan(np.array(dt_ids))
t = original_data2.drop(columns=original_data2.columns[unmapped])

In [58]:
dt_ids = np.array(dt_ids)[~np.isnan(np.array(dt_ids))]

In [59]:
dt_ids = dt_ids.astype(int)

In [60]:
t.columns = dt_ids

In [61]:
# Columns that share a dataset ID are replicates: group the transposed frame on its
# index (the dataset IDs), average each group, and transpose back
t_by_dataset = t.T
t = t_by_dataset.groupby(t_by_dataset.index).mean().T

In [62]:
t.shape

(6677, 224)

In [63]:
original_data2 = t.copy()

In [64]:
original_data2.shape

(6677, 224)

### Average and merge

In [65]:
original_data1 = original_data1.groupby(original_data1.index).mean()

In [66]:
original_data2 = original_data2.groupby(original_data2.index).mean()

In [67]:
# Put the HOP and HIP datasets side by side, keeping every ORF present in either table.
# NOTE(review): the suffixes only come into play for column labels present in both
# frames; the labels here are integer dataset IDs, so this presumably relies on the
# HOP and HIP IDs never overlapping -- confirm.
original_data = original_data1.join(original_data2, how='outer', lsuffix='_hop', rsuffix='_hip')

In [68]:
# Remove ORFs that are all NaNs
num_vals = original_data.notnull().sum(axis=1)

In [69]:
original_data = original_data.loc[num_vals>0,:]

In [70]:
original_data.head()

Unnamed: 0_level_0,456,1052,1053,1054,1055,1056,1057,1058,1059,1060,...,12118,12119,16666,16667,16668,16669,16670,16671,16672,16673
orfs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YAL001C,0.065251,-1.965097,-0.004813,-0.032265,-0.94114,-0.076135,0.072909,-1.365976,3.076409,0.248131,...,-0.12932,-0.062104,-0.577013,0.140638,-1.327481,-0.111386,0.666092,0.906919,0.890081,0.779373
YAL002W,-7.26461,-7.892758,0.107382,-10.748092,-0.228372,0.014193,-7.588114,0.001553,2.177835,0.476039,...,1.916211,-0.066242,-0.418985,-2.55495,2.632633,0.013553,-0.420891,0.055596,0.270976,1.123511
YAL003W,,,,,,,,,,,...,0.088859,-0.684067,-0.053482,1.291628,-0.654019,0.356714,0.877756,0.014747,1.142378,-0.021768
YAL004W,-0.440025,-0.445849,0.054191,0.328757,-0.044282,-0.161686,-0.006767,-0.019377,-0.02407,-0.078569,...,-0.479153,-0.057955,1.15924,1.278874,1.295541,2.606816,0.901954,-0.223452,-6.253787,-0.028462
YAL005C,0.886466,-0.108302,0.081409,-0.796944,-0.10844,-0.015199,0.045282,0.140557,0.01787,0.078094,...,-0.158417,0.22386,-3.966082,-10.466046,-2.659423,1.181533,-0.863032,-0.057493,-0.206393,-7.498802


In [71]:
data_benomyl.head()

Unnamed: 0_level_0,1087,16622
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
YAL001C,0.153577,-0.039887
YAL002W,-0.093247,0.261198
YAL003W,,0.126743
YAL004W,0.195749,0.303275
YAL005C,0.105597,-1.476444


In [72]:
data_final = data_benomyl.join(original_data, how='outer', lsuffix='_benomyl', rsuffix='_other')

In [73]:
data_final.shape

(5867, 471)

In [74]:
data_benomyl.shape

(5867, 2)

In [75]:
original_data.shape

(5864, 469)

# Prepare final dataset

In [77]:
# Align the dataset list with the data columns: same IDs, same order.
# reindex inserts an all-NaN row for any ID that is missing from the dataset list, so a
# missing entry shows up later as a NaN dataset name rather than as an error here.
dataset_ids = data_final.columns.values
datasets = datasets.reindex(index=dataset_ids)

In [78]:
data = data_final.copy()

In [79]:
data.columns = datasets['name'].values

In [80]:
data = data.groupby(data.index).mean()

In [81]:
# Create row index
data.index.name='orf'

In [82]:
print('Final data dimensions: %d x %d' % (data.shape))

Final data dimensions: 5867 x 471


# Write out to file

In [83]:
data.to_csv(paper_name + '.txt', sep='\t')

# Save to DB

In [30]:
from IO.save_data_to_db2 import *

In [31]:
# Create column index: a two-level header pairing each dataset ID with its name
tuples = list(zip(datasets.index.values, datasets['name'].values))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','dataset_name'])
data.columns = idx

In [33]:
save_data_to_db(data, paper_pmid, delete=False)