In [1]:
import numpy as np
import pandas as pd

import sys

from os.path import expanduser
sys.path.append(expanduser('~') + '/Lab/Utils/Python/')

from Conversions.translate import *
from Strings.is_a import *

# Initial setup

In [61]:
# Identifiers of the source publication. `paper_name` becomes the stem of the
# tab-separated output file; `paper_pmid` is used to locate the datasets list
# and is passed to the database export.
paper_name = 'tun_wu_2014'
paper_pmid = 24926745

In [62]:
# Headerless, tab-separated list of the datasets belonging to this paper.
# NOTE(review): the first column is labelled 'pmid' here, but it is later
# matched against `dataset_ids` — it appears to hold dataset ids; confirm.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['pmid', 'name'],
)

In [63]:
# Key the datasets table by its first column so rows can be looked up by id.
datasets = datasets.set_index('pmid')

# Load & process the data

In [8]:
# Raw spreadsheet for this paper. The first row of the sheet is skipped, so
# the row after it supplies the column headers.
raw_data_path = 'raw_data/c4mt00116h1.xlsx'
original_data = pd.read_excel(raw_data_path, sheet_name='Sheet1', skiprows=1)

In [9]:
# Report how many rows and columns were read from the spreadsheet.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5088 x 5


In [10]:
# Preview the first rows to confirm which columns were parsed from the sheet.
original_data.head()

Unnamed: 0.1,Unnamed: 0,ORF,Control,Al 1.6 mM,Al 3.2 mM
0,,YOR061W,10.101,9.370869,6.319561
1,,YJL165C,10.8904,8.188172,4.158855
2,,YDR072C,10.0303,9.400706,3.892943
3,,YOR014W,10.2839,7.218623,3.6221
4,,YLR407W,10.8581,7.859366,3.563197


In [11]:
# Force the ORF column to plain strings before any string-based clean-up.
orf_as_text = original_data['ORF'].astype(str)
original_data['ORF'] = orf_as_text

In [12]:
# Normalize the ORF strings with the project helper `clean_orf`.
# Per the original author's note, this eliminates all white space and
# capitalizes the names (the helper's implementation is not shown here).
original_data['ORF'] = clean_orf(original_data['ORF'])

In [13]:
# Translate every identifier to an ORF name via the project helper
# `translate_sc` (called with to='orf').
# NOTE(review): how the helper treats names it cannot translate is not
# visible here — confirm before relying on it.
original_data['ORF'] = translate_sc(original_data['ORF'], to='orf')

In [15]:
# Manual patch: any entry whose name starts with 'YOR205CHOMDIP' is relabelled
# as the plain ORF 'YOR205C'.
is_yor205c_variant = original_data['ORF'].str.startswith('YOR205CHOMDIP')
original_data.loc[is_yor205c_variant, 'ORF'] = 'YOR205C'

In [16]:
# Make sure everything translated ok: `t` flags the entries that the project
# helper `looks_like_orf` accepts, and the rows it rejects are printed.
# The rows are only displayed, not removed. In the recorded output the single
# rejected row is 'BY4743', which is still needed further down as the
# reference row for normalization and is dropped after that.
t = looks_like_orf(original_data['ORF'])
print(original_data.loc[~t,])

             Unnamed: 0     ORF  Control  Al 1.6 mM  Al  3.2 mM
index_input                                                    
2086                NaN  BY4743  10.7087   6.333333    2.045045


In [43]:
# Work on an independent copy holding only the ORF label and the three
# measurement columns (note the double space in 'Al  3.2 mM').
kept_columns = ['ORF', 'Control', 'Al 1.6 mM', 'Al  3.2 mM']
data = original_data[kept_columns].copy()

In [44]:
# Use the ORF name as the row label.
data = data.set_index('ORF')

In [45]:
# Make every measurement column numeric; values that cannot be parsed
# become NaN (errors='coerce').
for condition in ['Control', 'Al 1.6 mM', 'Al  3.2 mM']:
    data[condition] = pd.to_numeric(data[condition], errors='coerce')

In [48]:
# Divide each column by the value in the 'BY4743' row for that column, so
# the 'BY4743' row itself becomes 1 in every column.
by4743_values = data.loc['BY4743', :]
data = data.div(by4743_values)

In [49]:
# Express the 1.6 mM value of each row relative to that row's Control value.
# Not idempotent: re-running this cell divides by Control a second time.
control_values = data['Control']
data['Al 1.6 mM'] = data['Al 1.6 mM'] / control_values

In [50]:
# Express the 3.2 mM value of each row relative to that row's Control value.
# Not idempotent: re-running this cell divides by Control a second time.
control_values = data['Control']
data['Al  3.2 mM'] = data['Al  3.2 mM'] / control_values

In [55]:
# The 'BY4743' reference row has served its purpose in the normalization
# and is removed from the table.
data = data.drop(index='BY4743')

In [54]:
# Show the rows with the largest 'Al  3.2 mM' values as a sanity check.
ranked_by_al_3_2 = data.sort_values(by='Al  3.2 mM', ascending=False)
ranked_by_al_3_2.head()

Unnamed: 0_level_0,Control,Al 1.6 mM,Al 3.2 mM
ORF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YAL016W,0.09234,9.529442,10.363998
YGL240W,0.097443,5.722825,8.226137
YHR073W,0.143808,7.111748,8.127296
YOR295W,0.103098,4.222646,7.93827
YGL064C,0.101086,8.143321,7.079032


In [56]:
# Collapse rows that share the same ORF label into a single row holding the
# mean of each column.
data = data.groupby(level=0).mean()

In [57]:
# Table dimensions (rows, columns) after averaging rows with the same ORF.
data.shape

(4873, 3)

# Prepare the final dataset

In [58]:
# Dataset ids for the three data columns. Names are later assigned to the
# columns by position, so this order must match the column order of `data`
# ('Control', 'Al 1.6 mM', 'Al  3.2 mM').
dataset_ids = [
    16509,
    16477,
    16478,
]

In [64]:
# Keep only the datasets listed in `dataset_ids`, in that exact order.
# reindex() would silently insert an all-NaN row for any id that is absent
# from the datasets list, and that NaN would later become a column name and
# be sent to the database. Fail loudly instead.
missing_ids = [dataset_id for dataset_id in dataset_ids if dataset_id not in datasets.index]
if missing_ids:
    raise KeyError('Dataset ids missing from the datasets list: %s' % missing_ids)
datasets = datasets.loc[dataset_ids]

In [66]:
# Replace the column labels with the dataset names. The assignment is
# positional, so `datasets` must be in the same order as the data columns.
dataset_names = datasets['name'].values
data.columns = dataset_names

In [67]:
# Create row index: label the index level 'orf' for the exported table
data.index = data.index.rename('orf')

In [68]:
# Report the size of the table that is about to be exported.
n_rows, n_cols = data.shape
print('Final data dimensions: %d x %d' % (n_rows, n_cols))

Final data dimensions: 4873 x 3


# Print out

In [69]:
# Write the final table as a tab-separated file named after the paper,
# in the current working directory.
output_path = paper_name + '.txt'
data.to_csv(output_path, sep='\t')

# Save to DB

In [70]:
from IO.save_data_to_db2 import *

In [71]:
# Create column index: a two-level header pairing each dataset id with its
# dataset name, in the order of the rows of `datasets`
dataset_keys = list(zip(datasets.index.values, datasets['name'].values))
data.columns = pd.MultiIndex.from_tuples(
    dataset_keys,
    names=['dataset_id', 'dataset_name'],
)

In [72]:
# Hand the table (columns labelled by dataset id and dataset name) and the
# paper's PMID to the project helper `save_data_to_db`.
# NOTE(review): what the helper writes, and whether re-running overwrites
# existing records, is not visible from this notebook — confirm before re-running.
save_data_to_db(data, paper_pmid)