In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed: paper_pmid is used to locate the
# datasets list and when saving to the DB; paper_name prefixes the output files.
paper_pmid = 24926745
paper_name = 'tun_wu_2014' 

In [3]:
# Read the tab-separated, header-less list of datasets registered for this paper
datasets_list_path = f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt'
datasets = pd.read_csv(datasets_list_path, sep='\t', header=None, names=['pmid', 'name'])

In [4]:
# Key the datasets table by its first column (later looked up by dataset id)
datasets = datasets.set_index('pmid')

# Load & process the data

In [13]:
# Load the raw spreadsheet, skipping the first row of the sheet
original_data = pd.read_excel(
    'raw_data/c4mt00116h1.xlsx',
    sheet_name='Sheet1',
    skiprows=1,
)

In [14]:
# Report the size of the table as loaded
n_rows, n_cols = original_data.shape
print(f'Original data dimensions: {n_rows:d} x {n_cols:d}')

Original data dimensions: 5088 x 5


In [15]:
# Preview the first rows of the table as loaded
original_data.head()

Unnamed: 0.1,Unnamed: 0,ORF,Control,Al 1.6 mM,Al 3.2 mM
0,,YOR061W,10.101,9.370869,6.319561
1,,YJL165C,10.8904,8.188172,4.158855
2,,YDR072C,10.0303,9.400706,3.892943
3,,YOR014W,10.2839,7.218623,3.6221
4,,YLR407W,10.8581,7.859366,3.563197


In [16]:
# Force the ORF column to plain strings before any text clean-up
original_data = original_data.astype({'ORF': str})

In [17]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably it comes
# from yp_utils.py); the description above is the original author's — confirm.
original_data['ORF'] = clean_orf(original_data['ORF'])

In [18]:
# Collapse any identifier starting with 'YOR205CHOMDIP' to the bare ORF name
is_homdip = original_data['ORF'].str.startswith('YOR205CHOMDIP')
original_data.loc[is_homdip, 'ORF'] = 'YOR205C'

In [19]:
# Translate to ORFs
# NOTE(review): translate_sc is defined outside this notebook; its handling of
# names it cannot translate is not visible here — the next cell checks the result.
original_data['ORF'] = translate_sc(original_data['ORF'], to='orf')

In [20]:
# Make sure everything translated ok: print the rows whose identifier
# is not recognised as an ORF
is_orf = looks_like_orf(original_data['ORF'])
print(original_data[~is_orf])

             Unnamed: 0     ORF  Control  Al 1.6 mM  Al  3.2 mM
index_input                                                    
2086                NaN  BY4743  10.7087   6.333333    2.045045


In [22]:
# Keep only the identifier and the three measurement columns
# (note the double space in 'Al  3.2 mM', as in the source sheet)
kept_columns = ['ORF', 'Control', 'Al 1.6 mM', 'Al  3.2 mM']
original_data = original_data.loc[:, kept_columns].copy()

In [24]:
# Index the table by ORF and name the index 'orf'
original_data = original_data.set_index('ORF').rename_axis('orf')

In [25]:
# Coerce the measurement columns to numbers; anything unparsable becomes NaN.
# The values must be read from original_data itself: `data` is only created
# further down the notebook, so referencing it here fails on a fresh kernel
# (or silently picks up a stale frame from a previous run).
for column in ['Control', 'Al 1.6 mM', 'Al  3.2 mM']:
    original_data[column] = pd.to_numeric(original_data[column], errors='coerce')

In [26]:
# Express every row relative to the BY4743 row, column by column
reference_row = original_data.loc['BY4743', :]
original_data = original_data / reference_row

In [27]:
# Relate the 1.6 mM Al measurement to the untreated control of the same row
original_data['Al 1.6 mM'] = original_data['Al 1.6 mM'].div(original_data['Control'])

In [28]:
# Relate the 3.2 mM Al measurement to the untreated control of the same row
original_data['Al  3.2 mM'] = original_data['Al  3.2 mM'].div(original_data['Control'])

In [29]:
# The BY4743 row has served as the reference; remove it from the data
original_data = original_data.drop(index='BY4743')

In [30]:
# Preview the table after the divisions above
original_data.head()

Unnamed: 0_level_0,Control,Al 1.6 mM,Al 3.2 mM
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
YOR061W,0.943255,1.568623,3.276084
YJL165C,1.016967,1.271299,1.999696
YDR072C,0.936649,1.584716,2.032349
YOR014W,0.960329,1.186867,1.844325
YLR407W,1.013949,1.223881,1.718387


In [31]:
# Average rows that share the same ORF so each ORF appears once
original_data = original_data.groupby(level=0).mean()

In [32]:
# Dimensions after averaging rows with the same ORF
original_data.shape

(4873, 3)

# Prepare the final dataset

In [33]:
# Work on a copy so original_data stays unchanged by the steps below
data = original_data.copy()

In [34]:
# Dataset ids listed in the same order as the data columns
# (Control, Al 1.6 mM, Al  3.2 mM); the column labels are assigned positionally.
dataset_ids = [16509, 16477, 16478]
datasets = datasets.reindex(dataset_ids)

In [35]:
# Label each data column with a (dataset_id, data_type) pair; raw scores are 'value'
n_datasets = datasets.shape[0]
data.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * n_datasets],
    names=['dataset_id', 'data_type'],
)

In [36]:
# Preview the data with the (dataset_id, data_type) column labels
data.head()

dataset_id,16509,16477,16478
data_type,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
YAL002W,0.829311,0.573854,1.109385
YAL004W,0.774919,1.678285,1.369677
YAL005C,0.822704,1.509077,1.263216
YAL007C,0.832042,1.028409,1.058241
YAL008W,0.747878,1.496867,1.21598


## Subset to the genes currently in SGD

In [37]:
# Load the gene table keyed by systematic name and look up the id of every ORF
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values

# ORFs absent from the gene table come back as NaN
num_missing = np.isnan(gene_ids).sum()
print(f'ORFs missing from SGD: {num_missing:d}')

ORFs missing from SGD: 23


In [38]:
# Attach the gene id found for every ORF and keep only the ORFs that have one
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below is not a chained assignment on a slice.
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were floats only because missing ones were NaN; none are left now
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16509,16477,16478
Unnamed: 0_level_1,data_type,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,YAL002W,0.829311,0.573854,1.109385
1863,YAL004W,0.774919,1.678285,1.369677
4,YAL005C,0.822704,1.509077,1.263216
5,YAL007C,0.832042,1.028409,1.058241
6,YAL008W,0.747878,1.496867,1.21598


# Normalize

In [39]:
# Compute the normalized scores for every dataset column.
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook;
# the meaning of has_tested=True and the treatment of NaNs are not visible here — confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [40]:
# Assign proper column names: normalized scores are tagged 'valuez'
n_datasets = datasets.shape[0]
data_norm.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * n_datasets],
    names=['dataset_id', 'data_type'],
)

In [41]:
# Blank out normalized scores wherever the raw score is missing.
# data and data_norm carry different column labels ('value' vs 'valuez') and a
# boolean-DataFrame mask is aligned on labels before it is applied, so the mask
# is relabelled to data_norm's columns first; with the raw labels nothing
# matches and the assignment leaves data_norm untouched.
# Both frames list the datasets in the same order, so a positional relabel is safe.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Put raw and normalized scores side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16509,16477,16478,16509,16477,16478
Unnamed: 0_level_1,data_type,value,value,value,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2,YAL002W,0.829311,0.573854,1.109385,-1.170176,-0.875988,0.105966
1863,YAL004W,0.774919,1.678285,1.369677,-1.622114,2.171384,0.752095
4,YAL005C,0.822704,1.509077,1.263216,-1.225075,1.704501,0.487825
5,YAL007C,0.832042,1.028409,1.058241,-1.147485,0.37823,-0.020991
6,YAL008W,0.747878,1.496867,1.21598,-1.846797,1.670809,0.370569


# Print out

In [42]:
# Write one tab-separated file per data type, with dataset names as column
# headers and rows labelled by ORF only
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(f'{paper_name}_{data_type}.txt', sep='\t')

# Save to DB

In [43]:
# Import only the name this notebook uses instead of star-importing the module,
# so the origin of save_data_to_db is explicit and the namespace is not polluted.
from IO.save_data_to_db3 import save_data_to_db

In [44]:
# Store the combined raw and normalized data under this paper's PMID.
# Per the log printed below, this first deletes all existing datasets for the PMID.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/3 [00:00<?, ?it/s]

Deleting all datasets for PMID 24926745...
Inserting the new data...


100%|██████████| 3/3 [00:23<00:00,  7.81s/it]

Updating the data_modified_on field...



