In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PMID of the paper; used below to locate the datasets list and to key the database save.
paper_pmid = 32919014
# Short label for the paper; used as the prefix of the exported .txt files.
paper_name = 'guan_zhang_2020' 

In [3]:
# Header-less, tab-separated list with two columns: dataset_id and name.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the list by dataset_id so it can later be reordered by id.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [34]:
# Do not "fix" the spelling in these strings: they have to match the
# file name and sheet name of the raw workbook exactly.
raw_counts_path = 'raw_data/Table S1 Normolized counts.xlsx'
original_data = pd.read_excel(
    raw_counts_path,
    sheet_name='69samples normolized bigger tha',
    skiprows=1,  # the first row of the sheet is skipped
)

In [35]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 3766 x 49


In [36]:
# Preview the first rows of the raw table.
original_data.head()

Unnamed: 0.1,Unnamed: 0,DMSO_1,DMSO_2,DMSO_3,Pro_max_1,Pro_max_2,Pro_max_3,Pro_med_1,Pro_med_2,Pro_med_3,...,Flu_min_3,Epo_max_1,Epo_max_2,Epo_max_3,Epo_med_1,Epo_med_2,Epo_med_3,Epo_min_1,Epo_min_2,Epo_min_3
0,YAL001C,243,246,214,301,249,253,300,309,358,...,266,228,252,327,305,228,282,331,380,282
1,YAL003W,283,336,376,391,323,392,369,347,328,...,270,303,423,437,442,413,474,376,477,312
2,YAL033W,122,212,167,204,175,193,202,165,222,...,198,21,13,173,172,178,213,192,234,156
3,YAL038W,173,163,154,103,172,140,101,109,117,...,145,170,101,144,162,119,130,145,195,112
4,YAL043C,122,112,130,50,118,112,135,93,112,...,109,99,65,133,104,164,140,108,90,112


In [37]:
# Copy the identifier column ('Unnamed: 0') into a string-typed 'orf' column.
orf_source = original_data['Unnamed: 0']
original_data['orf'] = orf_source.astype(str)

In [38]:
# Strip all white space from the identifiers and upper-case them
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [39]:
# Convert the identifiers to ORF names
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [40]:
# Sanity check: print every row whose identifier does not look like an ORF.
# An empty DataFrame means that all identifiers passed.
is_orf = looks_like_orf(original_data['orf'])
unrecognized_rows = original_data.loc[~is_orf, :]
print(unrecognized_rows)

Empty DataFrame
Columns: [Unnamed: 0, DMSO_1, DMSO_2, DMSO_3, Pro_max_1, Pro_max_2, Pro_max_3, Pro_med_1, Pro_med_2, Pro_med_3, Pro_min_1, Pro_min_2, Pro_min_3, Pen_max_1, Pen_max_2, Pen_max_3, Pen_med_1, Pen_med_2, Pen_med_3, Pen_min_1, Pen_min_2, Pen_min_3, Teb_max_1, Teb_max_2, Teb_max_3, Teb_med_1, Teb_med_2, Teb_med_3, Teb_min_1, Teb_min_2, Teb_min_3, Flu_max_1, Flu_max_2, Flu_max_3, Flu_med_1, Flu_med_2, Flu_med_3, Flu_min_1, Flu_min_2, Flu_min_3, Epo_max_1, Epo_max_2, Epo_max_3, Epo_med_1, Epo_med_2, Epo_med_3, Epo_min_1, Epo_min_2, Epo_min_3, orf]
Index: []

[0 rows x 50 columns]


In [41]:
# The raw identifier column is no longer needed once 'orf' has been derived from it.
original_data = original_data.drop(columns=['Unnamed: 0'])

In [42]:
# Index the table by ORF; every remaining column is a sample.
original_data = original_data.set_index('orf')

In [43]:
# Strip the trailing "_<replicate>" token from each sample name
# (e.g. 'Pro_max_1' -> 'Pro_max') so replicates share one label.
cols = [sample.rpartition('_')[0] for sample in original_data.columns]

In [44]:
# Relabel the columns with the replicate-stripped names held in `cols`;
# replicate columns of one condition now share the same label.
original_data.columns = cols

In [45]:
# Put samples on the rows so that rows sharing a condition label can be grouped.
original_data = original_data.transpose()

In [46]:
# Average the rows that share a condition label, then flip back to ORFs x conditions.
condition_labels = original_data.index.values
condition_means = original_data.groupby(condition_labels).mean()
original_data = condition_means.T

In [47]:
# Preview the per-condition means (one column per condition).
original_data.head()

Unnamed: 0_level_0,DMSO,Epo_max,Epo_med,Epo_min,Flu_max,Flu_med,Flu_min,Pen_max,Pen_med,Pen_min,Pro_max,Pro_med,Pro_min,Teb_max,Teb_med,Teb_min
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
YAL001C,234.333333,269.0,271.666667,331.0,248.666667,273.666667,224.666667,292.333333,298.0,287.0,267.666667,322.333333,273.333333,261.0,251.0,271.666667
YAL003W,331.666667,387.666667,443.0,388.333333,342.0,422.333333,333.0,361.333333,361.666667,364.0,368.666667,348.0,463.666667,296.666667,385.666667,389.0
YAL033W,167.0,69.0,187.666667,194.0,209.666667,184.333333,129.333333,127.666667,119.666667,122.333333,190.666667,196.333333,108.0,176.333333,93.666667,168.666667
YAL038W,163.333333,138.333333,137.0,150.666667,141.666667,132.0,151.666667,152.666667,146.666667,158.333333,138.333333,109.0,153.333333,146.666667,167.666667,131.0
YAL043C,121.333333,99.0,136.0,103.333333,100.666667,87.333333,136.0,119.666667,120.666667,119.666667,93.333333,113.333333,118.0,124.0,109.0,126.666667


In [48]:
# Row count before rows with a repeated ORF label are averaged together.
original_data.shape

(3766, 16)

In [49]:
# Average rows that carry the same ORF label so each ORF appears once.
# The grouping key is deliberately the raw array of index values: the exported
# table relies on the index being unnamed afterwards (reset_index() later yields
# a column called 'index').
orf_labels = original_data.index.values
original_data = original_data.groupby(orf_labels).mean()

In [50]:
# Row count after averaging rows with a repeated ORF label.
original_data.shape

(3748, 16)

In [51]:
# Split the ORFs into essential and non-essential sets.
# The essentiality mask is computed once and reused for both selections:
# calling is_essential twice on the same input repeats the same lookup
# (and reloads the SGD features) for no benefit.
orf_ids = original_data.index.values
essential_mask = is_essential(orf_ids)
orfs_essential = orf_ids[essential_mask]
orfs_nonessential = orf_ids[~essential_mask]

Loading SGD features from 2017-04-03.
Loading SGD features from 2017-04-03.


In [52]:
# Independent copies of the non-essential (1) and essential (2) rows,
# so that the two subsets can be modified separately.
nonessential_rows = original_data.loc[orfs_nonessential]
essential_rows = original_data.loc[orfs_essential]
original_data1 = nonessential_rows.copy()
original_data2 = essential_rows.copy()

In [53]:
# Express every non-DMSO column as a ratio to the DMSO column, row by row.
# The DMSO column itself is left unchanged.
treated_cols1 = [col for col in original_data1.columns if 'DMSO' not in col]
original_data1[treated_cols1] = original_data1[treated_cols1].div(original_data1['DMSO'], axis=0)

In [54]:
# Express every non-DMSO column as a ratio to the DMSO column, row by row.
# The DMSO column itself is left unchanged.
treated_cols2 = [col for col in original_data2.columns if 'DMSO' not in col]
original_data2[treated_cols2] = original_data2[treated_cols2].div(original_data2['DMSO'], axis=0)

In [55]:
# Place the two subsets side by side: every condition gets a '_noness' and an
# '_ess' column, and a row is NaN in the columns of the subset it does not belong to.
original_data = original_data1.join(
    original_data2,
    how='outer',
    lsuffix='_noness',
    rsuffix='_ess',
)

In [56]:
# Size of the joined table (non-essential + essential column sets).
original_data.shape

(3748, 32)

In [57]:
# Preview the joined table.
original_data.head()

Unnamed: 0,DMSO_noness,Epo_max_noness,Epo_med_noness,Epo_min_noness,Flu_max_noness,Flu_med_noness,Flu_min_noness,Pen_max_noness,Pen_med_noness,Pen_min_noness,...,Flu_min_ess,Pen_max_ess,Pen_med_ess,Pen_min_ess,Pro_max_ess,Pro_med_ess,Pro_min_ess,Teb_max_ess,Teb_med_ess,Teb_min_ess
YAL001C,,,,,,,,,,,...,0.958748,1.247511,1.271693,1.224751,1.142248,1.375533,1.16643,1.113798,1.071124,1.159317
YAL003W,,,,,,,,,,,...,1.00402,1.089447,1.090452,1.097487,1.111558,1.049246,1.39799,0.894472,1.162814,1.172864
YAL024C,137.666667,1.099274,0.789346,0.79661,0.825666,1.094431,0.876513,0.905569,0.978208,0.859564,...,,,,,,,,,,
YAL033W,,,,,,,,,,,...,0.774451,0.764471,0.716567,0.732535,1.141717,1.175649,0.646707,1.055888,0.560878,1.00998
YAL034C,37.333333,0.705357,0.544643,0.410714,0.267857,0.5625,0.571429,0.732143,0.678571,0.75,...,,,,,,,,,,


# Prepare the final dataset

In [70]:
# Work on an independent copy so that original_data keeps its readable column names.
data = original_data.copy(deep=True)

In [71]:
# Show the column order; dataset ids are matched to these columns by position.
original_data.columns

Index(['DMSO_noness', 'Epo_max_noness', 'Epo_med_noness', 'Epo_min_noness',
       'Flu_max_noness', 'Flu_med_noness', 'Flu_min_noness', 'Pen_max_noness',
       'Pen_med_noness', 'Pen_min_noness', 'Pro_max_noness', 'Pro_med_noness',
       'Pro_min_noness', 'Teb_max_noness', 'Teb_med_noness', 'Teb_min_noness',
       'DMSO_ess', 'Epo_max_ess', 'Epo_med_ess', 'Epo_min_ess', 'Flu_max_ess',
       'Flu_med_ess', 'Flu_min_ess', 'Pen_max_ess', 'Pen_med_ess',
       'Pen_min_ess', 'Pro_max_ess', 'Pro_med_ess', 'Pro_min_ess',
       'Teb_max_ess', 'Teb_med_ess', 'Teb_min_ess'],
      dtype='object')

In [72]:
# Dataset ids, listed in the same order as the columns of the joined table
# (first the '_noness' columns, then the '_ess' columns). The ids are attached
# to the columns purely BY POSITION, so the order of these lists is significant.
dataset_ids_noness = [21931, 21886, 21921, 21922, 21885, 21927, 21928, 21883, 21923, 21924, 21882, 21925, 21926, 21884, 21929, 21930]
# NOTE(review): in this list every drug follows max > med > min in id order
# (e.g. Epo: 21937, 21936, 21935) except the Flu triple (21940, 21938, 21939).
# Verify against the datasets list that Flu med/min are not swapped.
dataset_ids_ess = [21934, 21937, 21936, 21935, 21940, 21938, 21939, 21943, 21942, 21941, 21946, 21945, 21944, 21949, 21948, 21947]
dataset_ids = dataset_ids_noness + dataset_ids_ess

# reindex() silently creates NaN-named rows for ids it cannot find and repeats
# rows for duplicated ids; fail loudly instead of exporting mislabelled data.
assert len(set(dataset_ids)) == len(dataset_ids), 'Duplicate dataset ids in the id lists'
unknown_ids = [dataset_id for dataset_id in dataset_ids if dataset_id not in datasets.index]
assert not unknown_ids, 'Dataset ids missing from the datasets list: %s' % unknown_ids

# Reorder the dataset list to match the column order of the data.
datasets = datasets.reindex(index=dataset_ids)

In [73]:
# Relabel the columns as (dataset_id, 'value') pairs, in the order of the dataset list.
value_tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_tuples, names=['dataset_id','data_type'])

In [74]:
# Preview the table with its (dataset_id, data_type) column labels.
data.head()

dataset_id,21931,21886,21921,21922,21885,21927,21928,21883,21923,21924,...,21939,21943,21942,21941,21946,21945,21944,21949,21948,21947
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
YAL001C,,,,,,,,,,,...,0.958748,1.247511,1.271693,1.224751,1.142248,1.375533,1.16643,1.113798,1.071124,1.159317
YAL003W,,,,,,,,,,,...,1.00402,1.089447,1.090452,1.097487,1.111558,1.049246,1.39799,0.894472,1.162814,1.172864
YAL024C,137.666667,1.099274,0.789346,0.79661,0.825666,1.094431,0.876513,0.905569,0.978208,0.859564,...,,,,,,,,,,
YAL033W,,,,,,,,,,,...,0.774451,0.764471,0.716567,0.732535,1.141717,1.175649,0.646707,1.055888,0.560878,1.00998
YAL034C,37.333333,0.705357,0.544643,0.410714,0.267857,0.5625,0.571429,0.732143,0.678571,0.75,...,,,,,,,,,,


## Subset to the genes currently in SGD

In [75]:
# Look up the gene id of every ORF in the data through the gene table,
# re-keyed by systematic name. ORFs absent from the table come back as NaN.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 16


In [76]:
# Preview the table before gene ids are attached.
data.head()

dataset_id,21931,21886,21921,21922,21885,21927,21928,21883,21923,21924,...,21939,21943,21942,21941,21946,21945,21944,21949,21948,21947
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
YAL001C,,,,,,,,,,,...,0.958748,1.247511,1.271693,1.224751,1.142248,1.375533,1.16643,1.113798,1.071124,1.159317
YAL003W,,,,,,,,,,,...,1.00402,1.089447,1.090452,1.097487,1.111558,1.049246,1.39799,0.894472,1.162814,1.172864
YAL024C,137.666667,1.099274,0.789346,0.79661,0.825666,1.094431,0.876513,0.905569,0.978208,0.859564,...,,,,,,,,,,
YAL033W,,,,,,,,,,,...,0.774451,0.764471,0.716567,0.732535,1.141717,1.175649,0.646707,1.055888,0.560878,1.00998
YAL034C,37.333333,0.705357,0.544643,0.410714,0.267857,0.5625,0.571429,0.732143,0.678571,0.75,...,,,,,,,,,,


In [77]:
# Attach the gene id of every row, drop rows without one, and index the
# table by (gene_id, ORF).
data['gene_id'] = gene_ids
# .copy() detaches the filtered rows from the unfiltered frame, so the
# assignment on the next line writes to a standalone frame instead of a
# slice (this is the pattern that triggers pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were floats only because of the NaN placeholders removed above.
data['gene_id'] = data['gene_id'].astype(int)
# The ORF index is unnamed at this point, so reset_index() names its column 'index'.
data = data.reset_index().set_index(['gene_id','index'])

data.head()

Unnamed: 0_level_0,dataset_id,21931,21886,21921,21922,21885,21927,21928,21883,21923,21924,...,21939,21943,21942,21941,21946,21945,21944,21949,21948,21947
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,index,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,,,,,,,,,,,...,0.958748,1.247511,1.271693,1.224751,1.142248,1.375533,1.16643,1.113798,1.071124,1.159317
3,YAL003W,,,,,,,,,,,...,1.00402,1.089447,1.090452,1.097487,1.111558,1.049246,1.39799,0.894472,1.162814,1.172864
22,YAL024C,137.666667,1.099274,0.789346,0.79661,0.825666,1.094431,0.876513,0.905569,0.978208,0.859564,...,,,,,,,,,,
31,YAL033W,,,,,,,,,,,...,0.774451,0.764471,0.716567,0.732535,1.141717,1.175649,0.646707,1.055888,0.560878,1.00998
1861,YAL034C,37.333333,0.705357,0.544643,0.410714,0.267857,0.5625,0.571429,0.732143,0.678571,0.75,...,,,,,,,,,,


In [78]:
# Size of the table after dropping rows without a gene id.
data.shape

(3732, 32)

# Normalize

In [79]:
# Normalized version of the value table, computed by a helper that is not defined in this notebook.
# NOTE(review): presumably z-score-like values (the result is labelled 'valuez' below);
# the meaning of has_tested=True is not visible here — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [80]:
# Label the normalized columns as (dataset_id, 'valuez') pairs, in the order of the dataset list
valuez_tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_tuples, names=['dataset_id','data_type'])

In [81]:
# Blank out every normalized value whose raw value is missing.
# Assigning through a boolean DataFrame aligns the mask to data_norm BY LABEL.
# The raw columns are labelled (id, 'value') and the normalized ones
# (id, 'valuez'), so an unrelabelled mask matches no column and masks nothing.
# The mask is therefore given data_norm's column labels first; the two frames
# list their datasets in the same positional order.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Raw and normalized columns side by side, told apart by the 'data_type' level.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,21931,21886,21921,21922,21885,21927,21928,21883,21923,21924,...,21939,21943,21942,21941,21946,21945,21944,21949,21948,21947
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,index,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,,,,,,,,,,,...,-0.177183,1.37192,1.277788,1.295019,0.669826,1.352345,0.625279,0.718439,0.371797,0.967555
3,YAL003W,,,,,,,,,,,...,0.035661,0.634516,0.533655,0.765739,0.544199,0.015038,1.498149,-0.131007,0.734466,1.024561
22,YAL024C,137.666667,1.099274,0.789346,0.79661,0.825666,1.094431,0.876513,0.905569,0.978208,0.859564,...,,,,,,,,,,
31,YAL033W,,,,,,,,,,,...,-1.043652,-0.881577,-1.001436,-0.752074,0.667653,0.533106,-1.333824,0.494155,-1.646411,0.339142
1861,YAL034C,37.333333,0.705357,0.544643,0.410714,0.267857,0.5625,0.571429,0.732143,0.678571,0.75,...,,,,,,,,,,


# Print out

In [82]:
# Write one tab-separated table per data type, with dataset names as column
# headers and the ORF as the only row label.
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Names are attached by position: datasets is in the same order as the columns.
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [83]:
from IO.save_data_to_db3 import *

In [84]:
# Store the combined table in the database under this paper's PMID.
# NOTE(review): judging by the logged output, the helper first deletes all existing
# datasets for the PMID and then inserts the new data — re-running replaces prior data.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 32919014...


  0%|          | 0/32 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 32/32 [02:16<00:00,  4.27s/it]

Updating the data_modified_on field...



