In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to build the datasets-list file name and passed to save_data_to_db.
paper_pmid = 14676322
# Short label for the paper; used below as the prefix of the exported .txt files.
paper_name = 'warringer_blomberg_2003' 

In [3]:
# Tab-separated, header-less list of (dataset_id, name) pairs for this paper
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by its id so rows can be looked up / reordered by dataset_id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read the 'LPI' sheet, skipping the first 3 rows of the workbook
original_data = pd.read_excel(
    'raw_data/LPI NaCl.xlsx',
    sheet_name='LPI',
    skiprows=3,
)

In [9]:
# Report the size of the table as loaded (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4712 x 7


In [10]:
# Preview the first rows to check how the sheet's columns were parsed
original_data.head()

Unnamed: 0.1,Unnamed: 0,Adaptation,GrowthRate,Stationary Phase,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,YAL068C,0.082697,-0.018907,0.069628,,YAL068C,strong similarity to subtelomeric encoded prot...
1,YAL067C,-0.031704,0.046266,0.025456,,SEO1,suppressor of sulfoxyde ethionine resistance
2,YAL066W,-0.029038,-0.010681,-0.080685,,YAL066W,weak similarity to membrane protein yybF - Bac...
3,YAL065C,-0.030004,0.05143,0.004502,,YAL065C,strong similarity to Flo1p and Flo9p - putativ...
4,YAL062W,-0.033322,0.027315,-0.113903,,GDH3,NADP-glutamate dehydrogenase


In [11]:
# Copy the identifier column ('Unnamed: 0' holds the ORF names in the preview above) into 'orf' as strings
original_data['orf'] = original_data['Unnamed: 0'].astype(str)

In [12]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably loaded by the %run of yp_utils.py) — confirm it does exactly this.
original_data['orf'] = clean_orf(original_data['orf'])

In [13]:
# Translate to ORFs
# NOTE(review): translate_sc is defined outside this notebook; presumably it maps gene names to systematic ORF names when to='orf' — confirm.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [14]:
# Sanity check on the translation: flag every row whose 'orf' passes looks_like_orf,
# then show the rows that do not so they can be reviewed by eye.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t])

            Unnamed: 0  Adaptation  GrowthRate  Stationary Phase  Unnamed: 4  \
index_input                                                                    
576             BY4741         NaN         NaN               NaN         NaN   

            Unnamed: 5 Unnamed: 6     orf  
index_input                                
576                NaN        NaN  BY4741  


In [15]:
# Keep only the rows flagged as valid ORFs by the check above
original_data = original_data.loc[t]

In [16]:
# Use the ORF name as the row index
original_data = original_data.set_index('orf')

In [17]:
# Keep the three phenotype columns and coerce them to numeric; anything non-numeric becomes NaN.
# to_numeric is applied per column (apply's default axis) rather than per row: the resulting
# values are the same, but it takes three vectorised calls instead of one Python call per row.
original_data = original_data[['Adaptation','GrowthRate','Stationary Phase']].apply(pd.to_numeric, errors='coerce')

In [18]:
# Collapse rows sharing the same ORF (the row index) into one row holding their mean
original_data = original_data.groupby(level=0).mean()

In [19]:
# Size of the table after filtering and averaging per ORF
original_data.shape

(4688, 3)

# Load control data

In [20]:
# Read the 'LSC' sheet of the reference workbook, skipping its first 3 rows
original_data2 = pd.read_excel(
    'raw_data/LSC Reference.xlsx',
    sheet_name='LSC',
    skiprows=3,
)

In [21]:
# Report the size of the reference table as loaded (rows x columns)
n_rows, n_cols = original_data2.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4712 x 16


In [22]:
# Preview the first rows to check how the sheet's columns were parsed
original_data2.head()

Unnamed: 0.1,Unnamed: 0,Replicate 1,Replicate 2,Average,CV,Replicate 1.1,Replicate 2.1,Average.1,CV.1,Replicate 1.2,Replicate 2.2,Average.2,CV.2,Unnamed: 13,Gene Name,Annotattion
0,YAL068C,-0.039933,0.007426,-0.016254,0.033488,-0.034754,-0.092148,-0.063451,0.040584,-0.011807,-0.151855,-0.081831,0.099029,,YAL068C,strong similarity to subtelomeric encoded prot...
1,YAL067C,-0.010405,-0.020715,-0.01556,0.007291,-0.012421,-0.045736,-0.029079,0.023557,0.015682,-0.151448,-0.067883,0.118179,,SEO1,suppressor of sulfoxyde ethionine resistance
2,YAL066W,-0.065153,0.024885,-0.020134,0.063667,0.023042,-0.005128,0.008957,0.019919,0.02785,-0.001593,0.013129,0.020819,,YAL066W,weak similarity to membrane protein yybF - Bac...
3,YAL065C,-0.05425,0.007751,-0.023249,0.043842,-0.02546,-0.039013,-0.032237,0.009583,0.009143,-0.135203,-0.06303,0.102068,,YAL065C,strong similarity to Flo1p and Flo9p - putativ...
4,YAL062W,-0.100666,-0.050238,-0.075452,0.035658,-0.011975,-0.033242,-0.022608,0.015038,0.023829,-0.112399,-0.044285,0.096328,,GDH3,NADP-glutamate dehydrogenase


In [23]:
# Copy the identifier column ('Unnamed: 0' holds the ORF names in the preview above) into 'orf' as strings
original_data2['orf'] = original_data2['Unnamed: 0'].astype(str)

In [24]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably loaded by the %run of yp_utils.py) — confirm it does exactly this.
original_data2['orf'] = clean_orf(original_data2['orf'])

In [25]:
# Translate to ORFs
# NOTE(review): translate_sc is defined outside this notebook; presumably it maps gene names to systematic ORF names when to='orf' — confirm.
original_data2['orf'] = translate_sc(original_data2['orf'], to='orf')

In [26]:
# Sanity check on the translation: flag every row whose 'orf' passes looks_like_orf,
# then show the rows that do not so they can be reviewed by eye.
t = looks_like_orf(original_data2['orf'])
print(original_data2.loc[~t])

            Unnamed: 0  Replicate 1  Replicate 2  Average  CV  Replicate 1.1  \
index_input                                                                    
576             BY4741          NaN          NaN      NaN NaN            NaN   

             Replicate 2.1  Average.1  CV.1  Replicate 1.2  Replicate 2.2  \
index_input                                                                 
576                    NaN        NaN   NaN            NaN            NaN   

             Average.2  CV.2  Unnamed: 13 Gene Name Annotattion     orf  
index_input                                                              
576                NaN   NaN          NaN       NaN         NaN  BY4741  


In [27]:
# Keep only the rows flagged as valid ORFs by the check above
original_data2 = original_data2.loc[t]

In [28]:
# Use the ORF name as the row index
original_data2 = original_data2.set_index('orf')

In [31]:
# Keep the three 'Average' columns (one per Replicate 1 / Replicate 2 / Average / CV group in the
# sheet) and coerce them to numeric; anything non-numeric becomes NaN.
# to_numeric is applied per column (apply's default axis) rather than per row: the resulting
# values are the same, but it takes three vectorised calls instead of one Python call per row.
original_data2 = original_data2[['Average','Average.1','Average.2']].apply(pd.to_numeric, errors='coerce')

In [32]:
# Collapse rows sharing the same ORF (the row index) into one row holding their mean
original_data2 = original_data2.groupby(level=0).mean()

In [33]:
# Size of the reference table after filtering and averaging per ORF
original_data2.shape

(4688, 3)

In [34]:
# Put the LPI and LSC columns side by side, matched on the ORF index; the outer join keeps ORFs present in either table
original_data = original_data.join(original_data2, how='outer')

# Prepare the final dataset

In [35]:
# Work on a copy so that relabelling the columns below leaves original_data as it is
data = original_data.copy()

In [36]:
# Dataset ids listed in the same order as the columns of `data`
# (the column labels are assigned from this order, by position, in the next step)
dataset_ids = [50, 49, 51, 16184, 16183, 16185]
datasets = datasets.reindex(dataset_ids)

In [37]:
# Label each column with a (dataset_id, 'value') pair, following the order of `datasets`
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [38]:
# Preview the relabelled table
data.head()

dataset_id,50,49,51,16184,16183,16185
data_type,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
YAL002W,-0.277892,0.100993,-0.119706,0.043742,-0.120684,-0.179924
YAL004W,0.100443,0.019982,-0.18873,0.085478,-0.016071,0.17722
YAL005C,1.098624,0.021619,-0.047312,-1.07939,0.016431,0.054458
YAL007C,0.098086,-0.066312,0.053256,0.065898,-0.004938,0.013076
YAL008W,0.030349,0.060066,0.114024,-0.01577,-0.025707,-0.057815


## Subset to the genes currently in SGD

In [39]:
# Gene table re-keyed by systematic name, with the original 'id' index kept as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in `data`; ORFs absent from the gene table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [40]:
# Attach the looked-up gene ids (one per row, in the order of data.index)
data['gene_id'] = gene_ids
# Drop the ORFs for which no gene id was found
data = data.loc[data['gene_id'].notnull()]
# With the NaN rows gone the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Switch to a two-level (gene_id, orf) row index
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,50,49,51,16184,16183,16185
Unnamed: 0_level_1,data_type,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2,YAL002W,-0.277892,0.100993,-0.119706,0.043742,-0.120684,-0.179924
1863,YAL004W,0.100443,0.019982,-0.18873,0.085478,-0.016071,0.17722
4,YAL005C,1.098624,0.021619,-0.047312,-1.07939,0.016431,0.054458
5,YAL007C,0.098086,-0.066312,0.053256,0.065898,-0.004938,0.013076
6,YAL008W,0.030349,0.060066,0.114024,-0.01577,-0.025707,-0.057815


# Normalize

In [41]:
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook; its output is relabelled as 'valuez' below. Confirm what has_tested=True means in the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [42]:
# Assign proper column names: one (dataset_id, 'valuez') pair per column, following the order of `datasets`
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [43]:
# Blank out normalized scores wherever the raw value is missing.
# A boolean DataFrame used as a setitem key is aligned on labels first, and cells whose labels
# do not match are left untouched. `data` is labelled (dataset_id, 'value') while `data_norm` is
# labelled (dataset_id, 'valuez'), so the raw mask would never match any column. Give the mask
# data_norm's column labels (columns correspond by position) so the assignment takes effect;
# rows are still matched by index label.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,50,49,51,16184,16183,16185,50,49,51,16184,16183,16185
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2,YAL002W,-0.277892,0.100993,-0.119706,0.043742,-0.120684,-0.179924,-1.013636,0.760886,-1.173085,-0.053566,-1.104821,-1.49946
1863,YAL004W,0.100443,0.019982,-0.18873,0.085478,-0.016071,0.17722,0.571872,-0.001653,-1.634917,0.118618,-0.225317,1.73289
4,YAL005C,1.098624,0.021619,-0.047312,-1.07939,0.016431,0.054458,4.754989,0.013751,-0.688697,-4.687099,0.047933,0.621824
5,YAL007C,0.098086,-0.066312,0.053256,0.065898,-0.004938,0.013076,0.561993,-0.813926,-0.015796,0.037839,-0.131721,0.247296
6,YAL008W,0.030349,0.060066,0.114024,-0.01577,-0.025707,-0.057815,0.278126,0.375649,0.390804,-0.299088,-0.306333,-0.394306


# Print out

In [44]:
# Write one tab-separated file per data type, with ORFs as rows and dataset names as columns
for data_type in ('value', 'valuez'):
    table = (
        data_all
        .xs(data_type, level='data_type', axis=1)
        .droplevel('gene_id', axis=0)
    )
    table.columns = datasets['name'].values
    table.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [45]:
from IO.save_data_to_db3 import *

In [46]:
# Per the log printed by this call, all existing datasets for this PMID are deleted before the new data are inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/6 [00:00<?, ?it/s]

Deleting all datasets for PMID 14676322...
Inserting the new data...


100%|██████████| 6/6 [00:40<00:00,  6.82s/it]

Updating the data_modified_on field...



