In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used below to locate the dataset list file and when saving to the DB
paper_pmid = 21965291
# Short label for the paper; used as the prefix of the exported .txt files
paper_name = 'zakrzewska_smits_2011' 

In [3]:
# Tab-separated, header-less list of (dataset_id, name) pairs for this paper
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by dataset_id so it can be reindexed by id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data (dataset 1: growth rates)

In [28]:
# Read the 'growth rates' sheet of the raw-data workbook
original_data = pd.read_excel(
    'raw_data/mc-E10-08-0721-s06.xlsx',
    sheet_name='growth rates',
)

In [29]:
# Report the size of the sheet as loaded
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4066 x 3


In [30]:
# Preview the first rows to check how the sheet was parsed (ORF names are in column 'rc>0')
original_data.head()

Unnamed: 0,rc>0,mu 30,mu 38
0,YAL004W,0.456596,0.402838
1,YAL005C,0.42405,0.363141
2,YAL008W,0.383094,0.344194
3,YAL010C,0.348863,0.382532
4,YAL011W,0.400872,0.441868


In [31]:
# Keep the raw identifiers in 'rc>0' and work on a string copy in a new 'orf' column
original_data = original_data.assign(orf=original_data['rc>0'].astype(str))

In [32]:
# Normalize the identifiers: strip all white space and capitalize
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [33]:
# Convert every identifier to its ORF form
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [34]:
# List any rows whose identifier still fails the ORF check (expected: none)
is_orf = looks_like_orf(original_data['orf'])
print(original_data.loc[~is_orf,])

Empty DataFrame
Columns: [rc>0, mu 30, mu 38, orf]
Index: []


In [35]:
# Use the ORF as the row label from here on
original_data = original_data.set_index('orf')

In [36]:
# Keep only the two growth-rate measurement columns
growth_columns = ['mu 30', 'mu 38']
original_data = original_data.loc[:, growth_columns].copy()

In [37]:
# Force every measurement column to numeric; unparseable entries become NaN
original_data = original_data.apply(pd.to_numeric, errors='coerce')

In [38]:
# Collapse rows that share an ORF by averaging their values
original_data = original_data.groupby(level='orf').mean()

In [39]:
# Size after averaging duplicate ORFs (one row per unique ORF, two value columns)
original_data.shape

(4065, 2)

# Load & process the data (dataset 2: survival)

In [40]:
# Read the 'survival % 95% CI' sheet of the same raw-data workbook
original_data2 = pd.read_excel(
    'raw_data/mc-E10-08-0721-s06.xlsx',
    sheet_name='survival % 95% CI',
)

In [41]:
# Report the size of the survival sheet as loaded
shape2 = tuple(original_data2.shape)
print('Original data dimensions: %d x %d' % shape2)

Original data dimensions: 4066 x 24


In [42]:
# Preview the first rows; each condition column is followed by '<95%' / '>95%' columns and an empty spacer column
original_data2.head()

Unnamed: 0,rc>0,30oC oxi,<95%,>95%,Unnamed: 4,30oC acid,<95%.1,>95%.1,Unnamed: 8,30oC heat,...,<95%.3,>95%.3,Unnamed: 16,38oC acid,<95%.4,>95%.4,Unnamed: 20,38oC heat,<95%.5,>95%.5
0,YAL004W,9.856012,7.88978,11.8222,,59.95662,58.0034,61.9099,,0.014071,...,51.5178,53.9066,,67.199106,64.8805,69.5177,,65.112744,61.0327,69.1928
1,YAL005C,12.755347,10.212,15.2987,,61.756668,60.3117,63.2016,,0.018908,...,59.326,60.3164,,67.758209,66.0676,69.4489,,68.147767,65.5855,70.71
2,YAL008W,18.098631,14.9775,21.2197,,80.905708,79.7747,82.0367,,0.021217,...,57.7527,60.2097,,110.869236,109.48,112.259,,74.234596,73.5993,74.8699
3,YAL010C,24.721095,21.9863,27.4559,,109.654126,107.86,111.448,,0.080941,...,89.7102,107.269,,150.446431,141.439,159.454,,38.701433,36.4852,40.9177
4,YAL011W,24.443096,20.7042,28.182,,73.838673,72.5728,75.1046,,0.031438,...,62.5972,69.3505,,81.689552,79.1452,84.2339,,65.713649,62.3057,69.1216


In [43]:
# Work on a string copy of the raw identifiers in a new 'orf' column
raw_ids2 = original_data2['rc>0']
original_data2['orf'] = raw_ids2.astype(str)

In [44]:
# Normalize the identifiers: strip all white space and capitalize
cleaned_orfs2 = clean_orf(original_data2['orf'])
original_data2['orf'] = cleaned_orfs2

In [45]:
# Convert every identifier to its ORF form
translated_orfs2 = translate_sc(original_data2['orf'], to='orf')
original_data2['orf'] = translated_orfs2

In [46]:
# List any rows whose identifier still fails the ORF check (expected: none)
is_orf2 = looks_like_orf(original_data2['orf'])
print(original_data2.loc[~is_orf2,])

Empty DataFrame
Columns: [rc>0, 30oC oxi, <95%, >95%, Unnamed: 4, 30oC acid, <95%.1, >95%.1, Unnamed: 8, 30oC heat, <95%.2, >95%.2, Unnamed: 12, 38oC oxi, <95%.3, >95%.3, Unnamed: 16, 38oC acid, <95%.4, >95%.4, Unnamed: 20, 38oC heat, <95%.5, >95%.5, orf]
Index: []

[0 rows x 25 columns]


In [47]:
# Use the ORF as the row label from here on
original_data2 = original_data2.set_index('orf')

In [48]:
# Keep only the six survival value columns; the '<95%' / '>95%' and spacer columns are dropped
survival_columns = [
    '30oC oxi', '30oC acid', '30oC heat',
    '38oC oxi', '38oC acid', '38oC heat',
]
original_data2 = original_data2[survival_columns].copy()

In [50]:
# Force every measurement column to numeric; unparseable entries become NaN
original_data2 = original_data2.apply(pd.to_numeric, errors='coerce')

In [51]:
# Collapse rows that share an ORF by averaging their values
original_data2 = original_data2.groupby(level='orf').mean()

In [52]:
# Size after averaging duplicate ORFs (one row per unique ORF, six value columns)
original_data2.shape

(4065, 6)

# Merge

In [53]:
# Combine growth-rate and survival columns on the ORF index, keeping ORFs found in either table.
# The suffixes only take effect if the two tables share a column name.
original_data = original_data.join(
    original_data2,
    how='outer',
    lsuffix='_1',
    rsuffix='_2',
)

In [54]:
# Preview the merged table: growth-rate columns first, then the survival columns
original_data.head()

Unnamed: 0_level_0,mu 30,mu 38,30oC oxi,30oC acid,30oC heat,38oC oxi,38oC acid,38oC heat
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
YAL004W,0.456596,0.402838,9.856012,59.95662,0.014071,52.712209,67.199106,65.112744
YAL005C,0.42405,0.363141,12.755347,61.756668,0.018908,59.821186,67.758209,68.147767
YAL008W,0.383094,0.344194,18.098631,80.905708,0.021217,58.981233,110.869236,74.234596
YAL010C,0.348863,0.382532,24.721095,109.654126,0.080941,98.489747,150.446431,38.701433
YAL011W,0.400872,0.441868,24.443096,73.838673,0.031438,65.973874,81.689552,65.713649


# Prepare the final dataset

In [55]:
# Work on a copy so the merged source table stays untouched by the relabelling below
data = original_data.copy()

In [56]:
# Dataset ids for this paper, listed in the same order as the columns of `data`.
# reindex() yields NaN rows for any id that is absent from the dataset list.
first_dataset_id = 16128
last_dataset_id_exclusive = 16136
dataset_ids = list(np.arange(first_dataset_id, last_dataset_id_exclusive))
datasets = datasets.reindex(index=dataset_ids)

In [57]:
# Relabel the columns positionally with (dataset_id, 'value') pairs
value_labels = ['value'] * datasets.shape[0]
data.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, value_labels],
    names=['dataset_id', 'data_type'],
)

In [58]:
# Preview the table with its new (dataset_id, data_type) column labels
data.head()

dataset_id,16128,16129,16130,16131,16132,16133,16134,16135
data_type,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
YAL004W,0.456596,0.402838,9.856012,59.95662,0.014071,52.712209,67.199106,65.112744
YAL005C,0.42405,0.363141,12.755347,61.756668,0.018908,59.821186,67.758209,68.147767
YAL008W,0.383094,0.344194,18.098631,80.905708,0.021217,58.981233,110.869236,74.234596
YAL010C,0.348863,0.382532,24.721095,109.654126,0.080941,98.489747,150.446431,38.701433
YAL011W,0.400872,0.441868,24.443096,73.838673,0.031438,65.973874,81.689552,65.713649


## Subset to the genes currently in SGD

In [59]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in `data`, in row order; ORFs not in the gene table yield NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 3


In [60]:
# Attach the gene ids; gene_ids was built from data.index.values, so it lines up with the rows positionally
data['gene_id'] = gene_ids
# Drop the ORFs that had no match in the gene table (NaN gene_id)
# NOTE(review): assigning into this filtered frame on the next line may emit a SettingWithCopyWarning — confirm; a .copy() here would avoid it
data = data.loc[data['gene_id'].notnull()]
# With the NaNs gone the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16128,16129,16130,16131,16132,16133,16134,16135
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1863,YAL004W,0.456596,0.402838,9.856012,59.95662,0.014071,52.712209,67.199106,65.112744
4,YAL005C,0.42405,0.363141,12.755347,61.756668,0.018908,59.821186,67.758209,68.147767
6,YAL008W,0.383094,0.344194,18.098631,80.905708,0.021217,58.981233,110.869236,74.234596
8,YAL010C,0.348863,0.382532,24.721095,109.654126,0.080941,98.489747,150.446431,38.701433
9,YAL011W,0.400872,0.441868,24.443096,73.838673,0.031438,65.973874,81.689552,65.713649


# Normalize

In [61]:
# Normalize the raw scores with the shared helper loaded via yp_utils.py.
# NOTE(review): the exact transform and the meaning of has_tested=True are defined in the helper — confirm there.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [62]:
# Assign proper column names: (dataset_id, 'valuez') pairs, in dataset order
valuez_labels = ['valuez'] * datasets.shape[0]
data_norm.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, valuez_labels],
    names=['dataset_id', 'data_type'],
)

In [63]:
# Carry the missing values of the raw scores over to the normalized scores.
# Boolean-DataFrame assignment aligns the mask to the target by row AND column
# label. `data` is labelled (dataset_id, 'value') while `data_norm` is labelled
# (dataset_id, 'valuez'), so a mask taken straight from `data` matches no
# column and the assignment silently does nothing. Relabel the mask with
# data_norm's columns first; both frames hold the datasets in the same order.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw ('value') and normalized ('valuez') columns side by side, matched on the row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16128,16129,16130,16131,16132,16133,16134,16135,16128,16129,16130,16131,16132,16133,16134,16135
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
1863,YAL004W,0.456596,0.402838,9.856012,59.95662,0.014071,52.712209,67.199106,65.112744,2.024843,0.516808,-0.229673,-0.258965,0.019591,-0.277464,-0.675075,-0.308837
4,YAL005C,0.42405,0.363141,12.755347,61.756668,0.018908,59.821186,67.758209,68.147767,1.251848,0.072289,-0.028432,-0.197982,0.027009,0.001648,-0.661226,-0.219107
6,YAL008W,0.383094,0.344194,18.098631,80.905708,0.021217,58.981233,110.869236,74.234596,0.279134,-0.139876,0.342441,0.450752,0.03055,-0.03133,0.406635,-0.039153
8,YAL010C,0.348863,0.382532,24.721095,109.654126,0.080941,98.489747,150.446431,38.701433,-0.533893,0.289422,0.802101,1.424695,0.122136,1.519848,1.386963,-1.089676
9,YAL011W,0.400872,0.441868,24.443096,73.838673,0.031438,65.973874,81.689552,65.713649,0.701357,0.953851,0.782806,0.211334,0.046222,0.243214,-0.316147,-0.291071


# Print out

In [64]:
# Write one tab-separated file per data type, with columns named after the datasets and rows keyed by ORF only
for data_type in ['value', 'valuez']:
    out_path = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [65]:
from IO.save_data_to_db3 import *

In [66]:
# Write the combined table to the database for this PMID.
# Per its log output, the call deletes the datasets already stored for the PMID,
# inserts the new data, and then updates the data_modified_on field.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 21965291...


  0%|          | 0/8 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 8/8 [00:46<00:00,  5.79s/it]

Updating the data_modified_on field...



