In [80]:
%run ../yp_utils.py

# Initial setup

In [81]:
paper_pmid = 24360837
paper_name = 'hoepfner_movva_2014' 

In [82]:
# Dataset list for this paper: headerless, tab-separated, two columns (named 'pmid' and 'name' here)
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(datasets_list_path, sep='\t', header=None, names=['pmid', 'name'])

In [83]:
# Key the dataset list by its first column (labelled 'pmid')
datasets = datasets.set_index('pmid')

# Load & process the data - Benomyl

In [84]:
original_data1 = pd.read_csv('large_files/raw_data/HOP_scores-benomyl.txt', sep='\t')
original_data2 = pd.read_csv('large_files/raw_data/HIP_scores-benomyl.txt', sep='\t')

In [85]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 195
Original data dimensions: 6681 x 189


In [86]:
# Keep the sensitivity scores and drop the z-score columns (the z-scores normalize each strain's phenotype against its phenotypes under all other compounds in the dataset)

In [87]:
# Names of the columns to keep: every header that does not mention 'z-score'
cols1 = list(filter(lambda header: 'z-score' not in header, original_data1.columns.values))
cols2 = list(filter(lambda header: 'z-score' not in header, original_data2.columns.values))

In [88]:
original_data1 = original_data1.loc[:, cols1]
original_data2 = original_data2.loc[:, cols2]

In [89]:
orf_col = 'Systematic Name'

In [90]:
original_data1[orf_col] = original_data1[orf_col].astype(str)
original_data2[orf_col] = original_data2[orf_col].astype(str)

In [91]:
# Eliminate all white spaces & capitalize
original_data1[orf_col] = clean_orf(original_data1[orf_col])
original_data2[orf_col] = clean_orf(original_data2[orf_col])

In [92]:
# Translate to ORFs 
original_data1['orfs'] = translate_sc(original_data1[orf_col], to='orf')
original_data2['orfs'] = translate_sc(original_data2[orf_col], to='orf')

In [93]:
# Relabel the 'YBR160WAS' entries as 'YBR160W' in both tables
for scores in (original_data1, original_data2):
    is_mislabelled = scores['orfs'] == 'YBR160WAS'
    scores.loc[is_mislabelled, 'orfs'] = 'YBR160W'

In [94]:
# Make sure everything translated ok
t = looks_like_orf(original_data1['orfs'])
print(original_data1.loc[~t,])

            Systematic Name  Ad. scores for Exp. 991_26.84_HOP_0018A  \
index_input                                                            
28                   R0010W                                      NaN   
29                   R0020C                                      NaN   
30                   R0030W                                      NaN   
31                   R0040C                                      NaN   

             Ad. scores for Exp. 991_26.84_HOP_0018B  \
index_input                                            
28                                               NaN   
29                                               NaN   
30                                               NaN   
31                                               NaN   

             Ad. scores for Exp. 991_26.84_HOP_0019A  \
index_input                                            
28                                               NaN   
29                                               NaN   
30    

In [95]:
# Make sure everything translated ok
t = looks_like_orf(original_data2['orfs'])
print(original_data2.loc[~t,])

            Systematic Name  Ad. scores for Exp. 991_26.84_HIP_0018A  \
index_input                                                            
28                   R0010W                                      NaN   
29                   R0020C                                      NaN   
30                   R0030W                                      NaN   
31                   R0040C                                      NaN   

             MADL scores for Exp. 991_26.84_HIP_0018B  \
index_input                                             
28                                                NaN   
29                                                NaN   
30                                                NaN   
31                                                NaN   

             Ad. scores for Exp. 991_26.84_HIP_0019A  \
index_input                                            
28                                               NaN   
29                                               NaN   


In [96]:
# Keep only the rows whose translated name looks like an ORF.
# Each table is filtered with a mask computed from its own 'orfs' column: the
# variable `t` left over from the previous cell only describes original_data2,
# so reusing it for original_data1 would silently apply the wrong table's mask.
original_data1 = original_data1.loc[looks_like_orf(original_data1['orfs']), :]
original_data2 = original_data2.loc[looks_like_orf(original_data2['orfs']), :]

In [97]:
# Use the translated ORF as the row index (named 'orf') of both tables
original_data1 = original_data1.set_index('orfs').rename_axis('orf')
original_data2 = original_data2.set_index('orfs').rename_axis('orf')

In [98]:
# Collapse all benomyl experiments into one score per row: the mean across the score columns.
# numeric_only=True is required because the string column 'Systematic Name' is still present
# in both tables at this point; without it, pandas >= 2.0 raises a TypeError instead of
# silently skipping the non-numeric column as older versions did.
# NOTE(review): the HIP table header printed above contains a 'MADL scores' column next to the
# 'Ad. scores' ones; both kinds are averaged together here -- confirm this is intended.
original_data1['data'] = original_data1.mean(axis=1, numeric_only=True)
original_data2['data'] = original_data2.mean(axis=1, numeric_only=True)

In [99]:
original_data = original_data1[['data']].join(original_data2[['data']], how='outer', lsuffix='_hop', rsuffix='_hip')

In [100]:
original_data = original_data.groupby(original_data.index).mean()

In [101]:
dataset_ids = [1087, 16622]
data_benomyl = original_data[['data_hop','data_hip']].copy()

In [102]:
data_benomyl.columns = dataset_ids

In [103]:
data_benomyl.head()

Unnamed: 0_level_0,1087,16622
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
Q0010,,
Q0017,,
Q0032,,
Q0045,,
Q0050,,


# Load and process data -- all others

In [104]:
original_data1 = pd.read_csv('large_files/raw_data/HOP_scores.txt', sep='\t')
original_data2 = pd.read_csv('large_files/raw_data/HIP_scores.txt', sep='\t')

In [105]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 5847
Original data dimensions: 6681 x 5913


In [106]:
# Names of the columns to keep: every header that does not mention 'z-score'
cols1 = list(filter(lambda header: 'z-score' not in header, original_data1.columns.values))
cols2 = list(filter(lambda header: 'z-score' not in header, original_data2.columns.values))

In [107]:
original_data1 = original_data1.loc[:, cols1]
original_data2 = original_data2.loc[:, cols2]

In [108]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 2924
Original data dimensions: 6681 x 2957


In [109]:
orf_col = 'Systematic Name'

In [110]:
original_data1[orf_col] = original_data1[orf_col].astype(str)
original_data2[orf_col] = original_data2[orf_col].astype(str)

In [111]:
# Eliminate all white spaces & capitalize
original_data1[orf_col] = clean_orf(original_data1[orf_col])
original_data2[orf_col] = clean_orf(original_data2[orf_col])

In [112]:
# Translate to ORFs 
original_data1['orfs'] = translate_sc(original_data1[orf_col], to='orf')
original_data2['orfs'] = translate_sc(original_data2[orf_col], to='orf')

In [113]:
# Relabel the 'YBR160WAS' entries as 'YBR160W' in both tables
for scores in (original_data1, original_data2):
    is_mislabelled = scores['orfs'] == 'YBR160WAS'
    scores.loc[is_mislabelled, 'orfs'] = 'YBR160W'

In [114]:
# Make sure everything translated ok
t = looks_like_orf(original_data1['orfs'])
print(original_data1.loc[~t,])

            Systematic Name  Ad. scores for Exp. 2_200_HOP_0020A  \
index_input                                                        
28                   R0010W                                  NaN   
29                   R0020C                                  NaN   
30                   R0030W                                  NaN   
31                   R0040C                                  NaN   

             Ad. scores for Exp. 3_50_HOP_0078  \
index_input                                      
28                                         NaN   
29                                         NaN   
30                                         NaN   
31                                         NaN   

             Ad. scores for Exp. 6_173.545_HOP_0090  \
index_input                                           
28                                              NaN   
29                                              NaN   
30                                              NaN   
31             

In [115]:
# Make sure everything translated ok
t = looks_like_orf(original_data2['orfs'])
print(original_data2.loc[~t,])

            Systematic Name  Ad. scores for Exp. 2_200_HIP_0020A  \
index_input                                                        
28                   R0010W                                  NaN   
29                   R0020C                                  NaN   
30                   R0030W                                  NaN   
31                   R0040C                                  NaN   

             Ad. scores for Exp. 3_50_HIP_0077  \
index_input                                      
28                                         NaN   
29                                         NaN   
30                                         NaN   
31                                         NaN   

             Ad. scores for Exp. 6_173.545_HIP_0089  \
index_input                                           
28                                              NaN   
29                                              NaN   
30                                              NaN   
31             

In [116]:
# Keep only the rows whose translated name looks like an ORF.
# Each table is filtered with a mask computed from its own 'orfs' column: the
# variable `t` left over from the previous cell only describes original_data2,
# so reusing it for original_data1 would silently apply the wrong table's mask.
original_data1 = original_data1.loc[looks_like_orf(original_data1['orfs']), :]
original_data2 = original_data2.loc[looks_like_orf(original_data2['orfs']), :]

In [117]:
# Use the translated ORF as the row index (named 'orf') of both tables
original_data1 = original_data1.set_index('orfs').rename_axis('orf')
original_data2 = original_data2.set_index('orfs').rename_axis('orf')

In [118]:
original_data1.drop(columns=['Systematic Name'], inplace=True)

In [119]:
original_data2.drop(columns=['Systematic Name'], inplace=True)

### Map dataset IDs to data columns

In [120]:
compound_map = pd.read_csv('extras/type_cmb_dose_dataset.txt', sep='\t')

In [121]:
compound_map.loc[compound_map.loc[:,'Dataset HOP']==1226]

Unnamed: 0,Type,CMB,Dose,Dataset HOP,Dataset HIP
188,Ad.,1084,0.0005,1226,12043


In [122]:
# Assign a HOP dataset ID to every score column by matching the compound (CMB) and dose
# parsed from the column header against the compound map.
# Headers look like 'Ad. scores for Exp. <CMB>_<dose>_HOP_<suffix>': after splitting on
# spaces and underscores, the CMB is token 4 and the dose is token 5.
dose_rounded = round(compound_map['Dose'], 4)  # loop-invariant: round the map's doses once, not once per column
dt_ids = []
for col_name in original_data1.columns.values:
    name_parts = re.split(' |_', col_name)
    cmb = int(name_parts[4])
    dose = float(name_parts[5])

    # Doses are compared after rounding both sides to 4 decimals
    matches = compound_map.loc[(compound_map['CMB'] == cmb) & (dose_rounded == round(dose, 4))]
    if matches.shape[0] > 0:
        # When several rows match, the first one is used
        dataset_id = matches['Dataset HOP'].values[0]
    else:
        # No entry in the map: record NaN so the column can be dropped in the next step
        dataset_id = np.nan

    dt_ids.append(dataset_id)


In [123]:
t = original_data1.drop(columns=original_data1.columns[np.isnan(np.array(dt_ids))])

In [124]:
dt_ids = np.array(dt_ids)[~np.isnan(np.array(dt_ids))]

In [125]:
dt_ids = dt_ids.astype(int)

In [126]:
t.columns = dt_ids

In [127]:
# Columns that share a dataset ID are replicates: group them (as rows of the
# transposed table) and replace them by their mean, then transpose back
by_dataset = t.transpose()
t = by_dataset.groupby(by_dataset.index).mean().transpose()

In [128]:
t.shape

(6677, 245)

In [129]:
original_data1 = t.copy()

In [130]:
# Assign a HIP dataset ID to every score column by matching the compound (CMB) and dose
# parsed from the column header against the compound map.
# Headers look like 'Ad. scores for Exp. <CMB>_<dose>_HIP_<suffix>': after splitting on
# spaces and underscores, the CMB is token 4 and the dose is token 5.
dose_rounded = round(compound_map['Dose'], 4)  # loop-invariant: round the map's doses once, not once per column
dt_ids = []
for col_name in original_data2.columns.values:
    name_parts = re.split(' |_', col_name)
    cmb = int(name_parts[4])
    dose = float(name_parts[5])

    # Doses are compared after rounding both sides to 4 decimals
    matches = compound_map.loc[(compound_map['CMB'] == cmb) & (dose_rounded == round(dose, 4))]
    if matches.shape[0] > 0:
        # When several rows match, the first one is used
        dataset_id = matches['Dataset HIP'].values[0]
    else:
        # No entry in the map: record NaN so the column can be dropped in the next step
        dataset_id = np.nan

    dt_ids.append(dataset_id)


In [131]:
t = original_data2.drop(columns=original_data2.columns[np.isnan(np.array(dt_ids))])

In [132]:
dt_ids = np.array(dt_ids)[~np.isnan(np.array(dt_ids))]

In [133]:
dt_ids = dt_ids.astype(int)

In [134]:
t.columns = dt_ids

In [135]:
# Columns that share a dataset ID are replicates: group them (as rows of the
# transposed table) and replace them by their mean, then transpose back
by_dataset = t.transpose()
t = by_dataset.groupby(by_dataset.index).mean().transpose()

In [136]:
t.shape

(6677, 224)

In [137]:
original_data2 = t.copy()

In [138]:
original_data2.shape

(6677, 224)

### Average and merge

In [139]:
original_data1 = original_data1.groupby(original_data1.index).mean()

In [140]:
original_data2 = original_data2.groupby(original_data2.index).mean()

In [141]:
original_data = original_data1.join(original_data2, how='outer', lsuffix='_hop', rsuffix='_hip')

In [146]:
data_final = data_benomyl.join(original_data, how='outer', lsuffix='_benomyl', rsuffix='_other')

In [147]:
data_final.shape

(6620, 471)

In [148]:
data_benomyl.shape

(6620, 2)

In [149]:
original_data.shape

(5864, 469)

In [156]:
# Remove ORFs that are all NaNs
num_vals = data_final.notnull().sum(axis=1)

In [157]:
data_final = data_final.loc[num_vals>0,:]

In [158]:
data_final.shape

(5867, 471)

# Prepare final dataset

In [159]:
data = data_final.copy()

In [160]:
# The column labels of the merged table are the dataset IDs
dataset_ids = data_final.columns.values
# Reorder the dataset list to follow the data columns; an ID that is absent from
# the list yields a row of NaN (and therefore a NaN name)
datasets = datasets.reindex(index=dataset_ids)

In [161]:
# Two-level column labels: (dataset ID, 'value')
value_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_labels, names=['dataset_id','data_type'])

In [162]:
data.head()

dataset_id,1087,16622,456,1052,1053,1054,1055,1056,1057,1058,...,12118,12119,16666,16667,16668,16669,16670,16671,16672,16673
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL001C,0.153577,-0.039887,0.065251,-1.965097,-0.004813,-0.032265,-0.94114,-0.076135,0.072909,-1.365976,...,-0.12932,-0.062104,-0.577013,0.140638,-1.327481,-0.111386,0.666092,0.906919,0.890081,0.779373
YAL002W,-0.093247,0.261198,-7.26461,-7.892758,0.107382,-10.748092,-0.228372,0.014193,-7.588114,0.001553,...,1.916211,-0.066242,-0.418985,-2.55495,2.632633,0.013553,-0.420891,0.055596,0.270976,1.123511
YAL003W,,0.126743,,,,,,,,,...,0.088859,-0.684067,-0.053482,1.291628,-0.654019,0.356714,0.877756,0.014747,1.142378,-0.021768
YAL004W,0.195749,0.303275,-0.440025,-0.445849,0.054191,0.328757,-0.044282,-0.161686,-0.006767,-0.019377,...,-0.479153,-0.057955,1.15924,1.278874,1.295541,2.606816,0.901954,-0.223452,-6.253787,-0.028462
YAL005C,0.105597,-1.476444,0.886466,-0.108302,0.081409,-0.796944,-0.10844,-0.015199,0.045282,0.140557,...,-0.158417,0.22386,-3.966082,-10.466046,-2.659423,1.181533,-0.863032,-0.057493,-0.206393,-7.498802


## Subset to the genes currently in SGD

In [163]:
# Gene table re-keyed by systematic name, with 'id' kept as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Gene ID for every ORF in the data, NaN where the ORF is not in the gene table
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [164]:
# Attach the gene IDs looked up for each ORF as an extra column
data['gene_id'] = gene_ids
# Drop the ORFs for which no gene ID was found
data = data.loc[data['gene_id'].notnull()]
# With the missing IDs gone, the column can be cast to integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,1087,16622,456,1052,1053,1054,1055,1056,1057,1058,...,12118,12119,16666,16667,16668,16669,16670,16671,16672,16673
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,0.153577,-0.039887,0.065251,-1.965097,-0.004813,-0.032265,-0.94114,-0.076135,0.072909,-1.365976,...,-0.12932,-0.062104,-0.577013,0.140638,-1.327481,-0.111386,0.666092,0.906919,0.890081,0.779373
2,YAL002W,-0.093247,0.261198,-7.26461,-7.892758,0.107382,-10.748092,-0.228372,0.014193,-7.588114,0.001553,...,1.916211,-0.066242,-0.418985,-2.55495,2.632633,0.013553,-0.420891,0.055596,0.270976,1.123511
3,YAL003W,,0.126743,,,,,,,,,...,0.088859,-0.684067,-0.053482,1.291628,-0.654019,0.356714,0.877756,0.014747,1.142378,-0.021768
1863,YAL004W,0.195749,0.303275,-0.440025,-0.445849,0.054191,0.328757,-0.044282,-0.161686,-0.006767,-0.019377,...,-0.479153,-0.057955,1.15924,1.278874,1.295541,2.606816,0.901954,-0.223452,-6.253787,-0.028462
4,YAL005C,0.105597,-1.476444,0.886466,-0.108302,0.081409,-0.796944,-0.10844,-0.015199,0.045282,0.140557,...,-0.158417,0.22386,-3.966082,-10.466046,-2.659423,1.181533,-0.863032,-0.057493,-0.206393,-7.498802


# Normalize

In [165]:
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [166]:
# Label the normalized columns as (dataset ID, 'valuez')
valuez_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_labels, names=['dataset_id','data_type'])

In [167]:
# Blank out the normalized score wherever the raw score is missing.
# Assigning through a boolean DataFrame aligns the mask to the target on row AND column
# labels. The raw table's columns are (dataset_id, 'value') while data_norm's are
# (dataset_id, 'valuez'), so an unmodified data.isnull() mask matches no column and the
# assignment silently does nothing. Relabel the mask with data_norm's columns first
# (both tables list the datasets in the same order) so it actually applies.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan
# Put the raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,1087,16622,456,1052,1053,1054,1055,1056,1057,1058,...,12118,12119,16666,16667,16668,16669,16670,16671,16672,16673
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,0.153577,-0.039887,0.065251,-1.965097,-0.004813,-0.032265,-0.94114,-0.076135,0.072909,-1.365976,...,-0.19724,-0.218056,-0.542188,-0.006149,-0.695214,-0.027648,0.391079,1.20911,0.426105,0.32692
2,YAL002W,-0.093247,0.261198,-7.26461,-7.892758,0.107382,-10.748092,-0.228372,0.014193,-7.588114,0.001553,...,2.30214,-0.222446,-0.42195,-1.140231,1.444715,0.032857,-0.528355,0.135516,0.128879,0.543422
3,YAL003W,,0.126743,,,,,,,,,...,0.069347,-0.877865,-0.14385,0.478092,-0.331295,0.199041,0.570116,0.084001,0.54723,-0.177089
1863,YAL004W,0.195749,0.303275,-0.440025,-0.445849,0.054191,0.328757,-0.044282,-0.161686,-0.006767,-0.019377,...,-0.624691,-0.213655,0.778871,0.472726,0.72219,1.288709,0.590584,-0.216388,-3.003589,-0.1813
4,YAL005C,0.105597,-1.476444,0.886466,-0.108302,0.081409,-0.796944,-0.10844,-0.015199,0.045282,0.140557,...,-0.232793,0.085308,-3.120819,-4.46857,-1.414956,0.59848,-0.902345,-0.0071,-0.100301,-4.880992


# Print out

In [168]:
# Write one tab-separated file per data type, with columns labelled by dataset name
# and rows labelled by ORF only
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [169]:
from IO.save_data_to_db3 import *

In [170]:
# Store the combined raw ('value') and normalized ('valuez') table under this paper's PMID.
# The recorded run below took about an hour for the 471 datasets.
# NOTE(review): save_data_to_db is defined outside this notebook; delete=False presumably
# leaves previously stored records in place -- confirm before re-running.
save_data_to_db(data_all, paper_pmid, delete=False)

  0%|          | 0/471 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 471/471 [1:02:29<00:00,  7.96s/it]

Updating the data_modified_on field...



