In [1]:
%run ../../Utils/yp_utils.py

import itertools

# Initial setup

In [2]:
# PubMed ID of the paper; used below to build the datasets-list file name and passed to save_data_to_db
paper_pmid = 26456335
# Short label of the paper; used as the prefix of the exported value/valuez text files
paper_name = 'mccormick_kennedy_2015' 

In [3]:
# Datasets extracted from this paper: tab-separated file without a header row
datasets = pd.read_csv(
    f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the datasets table by dataset ID (rebinding instead of mutating in place)
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Load the raw RLS summary spreadsheet; only the 'rls' sheet is read
original_data = pd.read_excel(
    io='raw_data/rls-summary-for-Anastasia-Baryshnikova-all-BY-haploid-deletion-YPD-30C-mm042018.xlsx',
    sheet_name='rls',
)

In [6]:
# Report the size (rows x columns) of the raw table before any filtering
print('Original data dimensions: %d x %d' % tuple(original_data.shape))

Original data dimensions: 15094 x 32


In [7]:
# Fix a mistyped background label ('BY4,742' -> 'BY4742'); all other labels are left as they are
original_data['set_background'] = original_data['set_background'].replace('BY4,742', 'BY4742')
# Lower-case the mating type labels
original_data['set_mating_type'] = original_data['set_mating_type'].str.lower()

In [8]:
# Only keep BY4742 (systematic screen).
# .copy() detaches the filtered rows from the full table, so the column assignments made in
# the following cells modify an independent DataFrame instead of a slice
# (which would raise SettingWithCopyWarning).
original_data = original_data.loc[original_data['set_background'] == 'BY4742', :].copy()

In [9]:
# Only keep single mutants: after trimming surrounding whitespace, any genotype that still
# contains a space is dropped.
original_data['set_genotype'] = original_data['set_genotype'].str.strip()
# .copy() makes the filtered table independent, so the new columns added in later cells
# are not written into a slice (avoids SettingWithCopyWarning).
original_data = original_data.loc[~original_data['set_genotype'].str.contains(' '), :].copy()

In [10]:
# Preview the first rows that remain after the background and single-mutant filters
original_data.head()

Unnamed: 0,id,experiments,set_name,set_strain,set_background,set_mating_type,set_locus_tag,set_genotype,set_media,set_temperature,...,ref_temperature,ref_lifespan_start_count,ref_lifespan_count,ref_lifespan_mean,ref_lifespan_stdev,ref_lifespans,percent_change,ranksum_u,ranksum_p,pooled_by
1,13219,973,vma21,BW921,BY4742,matalpha,,vma21,YPD,30,...,30,40,20,11.95,10.30827,114120225223144121377771122020,-56.2994,60.5,0.168289,file
2,13220,973,vma2,JS352,BY4742,matalpha,,vma2,YPD,30,...,30,40,20,11.95,10.30827,114120225223144121377771122020,-71.03315,70.5,0.026303,file
6,23734,66,fob1::URA,JO204,BY4742,matalpha,,fob1,YPD,30,...,30,42,42,28.38095,10.14517,"1,18,23,24,31,29,38,36,38,28,41,38,12,25,31,28...",22.39933,1168.5,0.009875,file
16,3200,498,hap4::KanMX,GS163,BY4742,matalpha,,hap4,YPD,30,...,30,20,17,16.94118,8.721171,1242321191023272231819262610124,-22.33187,125.5,0.259509,file
20,3213,498,rip1::KanMX,GS1391,BY4742,matalpha,,rip1,YPD,30,...,30,20,17,16.94118,8.721171,1242321191023272231819262610124,-39.79167,72.0,0.035095,file


In [11]:
# Seed the gene-name column with the genotype, cast to string
original_data = original_data.assign(genes=original_data['set_genotype'].astype(str))

In [12]:
# Remove all white space from the gene names and capitalize them
# (clean_genename is defined outside this notebook, presumably in yp_utils)
original_data = original_data.assign(genes=clean_genename(original_data['genes']))

In [13]:
# Translate the cleaned gene names to ORF names
# (translate_sc is defined outside this notebook, presumably in yp_utils)
original_data = original_data.assign(orf=translate_sc(original_data['genes'], to='orf'))

In [14]:
# Check the translation: t flags the rows whose 'orf' passes looks_like_orf;
# the rows that fail are listed for inspection
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t][['set_name','genes','orf']])

                        set_name    genes     orf
index_input                                      
399                      YPL220W    RPL1A   RPL1A
1183                     YBR072W    HSP26   HSP26
1223                     YMR251W     GTO3    GTO3
1703                     YPR045C     MNI2    MNI2
3023                      rpl20b   RPL20B  RPL20B
...                          ...      ...     ...
14539                    YCL075W  YCL075W     NaN
14553                    YCL074W  YCL074W     NaN
14570                    YNR065C     YSN1     NaN
14613                    YCL006C  YCL006C     NaN
14945        BY4742 ybr255::HIS3   YBR255     NaN

[67 rows x 3 columns]


In [15]:
# Where the translation failed, fall back to the set name
# (several sets are already named by their ORF, e.g. YPL220W in the listing above)
original_data['orf'] = original_data['orf'].where(t, original_data['set_name'])

In [16]:
# Hand-curated mapping from set names that are not ORFs to the intended ORF
manual_fixes = {
    'rpl20b': 'YOR312C',
    'sus1': 'YBR111W-A',
    'afg3::KanMX': 'YER017C',
    'tor1': 'YJR066W',
    'pph22': 'YDL188C',
    'rpn4': 'YDL020C',
    'scp1': 'YOR367W',
    'por1': 'YNL055C',
    'pmt3': 'YOR321W',
    'sir2': 'YDL042C',
    'dbp3': 'YGL078C',
    'ymr226c': 'YMR226C',
}

# Exact matches are replaced; every other value passes through unchanged
original_data['orf'] = original_data['orf'].map(lambda name: manual_fixes.get(name, name))

In [17]:
# Re-run the ORF check after the fallbacks and manual fixes;
# whatever is still listed here could not be resolved to an ORF
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t][['set_name','genes','orf']])

                        set_name   genes                  orf
index_input                                                  
4302                        RAI1    RAI1                 RAI1
5142                        psgf    PSGF                 psgf
5143                        psgf    PSGF                 psgf
6376                      ST2885   ZWF13               ST2885
6377                      ST2886   ZWF14               ST2886
10136                      EMP47   EMP47                EMP47
14945        BY4742 ybr255::HIS3  YBR255  BY4742 ybr255::HIS3


In [18]:
# Keep only the rows whose ORF passed the check above.
# .copy() makes the result an independent DataFrame, so the 'rls' / 'ref_rls' columns added
# in the next cells are not written into a slice (avoids SettingWithCopyWarning).
original_data = original_data.loc[t, :].copy()

In [19]:
# Parse the comma-separated lifespans of each mutant set into a list of ints
original_data['rls'] = original_data['set_lifespans'].apply(lambda cell: list(map(int, str(cell).split(','))))

In [20]:
# Parse the comma-separated lifespans of the matching reference set into a list of ints
original_data['ref_rls'] = original_data['ref_lifespans'].apply(lambda cell: list(map(int, str(cell).split(','))))

In [21]:
# original_data.head()

In [22]:
# Merge all raw measurements for all replicates:
# one row per unique ORF, with (still empty) columns for the pooled mutant and reference lifespans
all_orfs = np.unique(original_data['orf'].to_numpy())
original_data2 = pd.DataFrame(columns=['rls','ref_rls'], index=all_orfs)

In [23]:
# Number of unique ORFs that will be aggregated
all_orfs.shape

(4683,)

In [24]:
# Pool the lifespans of all replicate experiments of each ORF.
# One groupby pass replaces a full-table boolean scan per ORF, and whole-column assignment
# avoids writing list objects into individual cells through .loc.
grouped = original_data.groupby('orf')
for col in ['rls', 'ref_rls']:
    # Concatenate the per-experiment lists of one ORF, preserving row order
    pooled = grouped[col].apply(lambda lists: list(itertools.chain.from_iterable(lists)))
    # Align on the ORF index of original_data2 before assigning
    original_data2[col] = pooled.reindex(original_data2.index)

In [25]:
# Number of pooled lifespan measurements per ORF (mutant and reference)
original_data2['rls_num'] = original_data2['rls'].apply(len)
original_data2['ref_rls_num'] = original_data2['ref_rls'].apply(len)

In [26]:
# Mean of the pooled lifespans per ORF, ignoring NaNs (mutant and reference)
original_data2['rls_mean'] = original_data2['rls'].apply(lambda lifespans: np.nanmean(np.asarray(lifespans)))
original_data2['ref_rls_mean'] = original_data2['ref_rls'].apply(lambda lifespans: np.nanmean(np.asarray(lifespans)))

In [27]:
# Phenotype: mean mutant lifespan relative to the mean lifespan of its reference
original_data2['rls_ratio'] = original_data2['rls_mean'].div(original_data2['ref_rls_mean'])

In [28]:
# Measurements based on 5 or fewer cells are considered unreliable.
# The authors subsequently retested these strains, but the raw version of that data is
# (unfortunately) not recoverable; only the published results are.
# Since all but one of the published (most reliable) strains have n > 5, the low-n strains
# are set to a ratio of 1 (instead of NaN) to indicate that they are likely neither
# short-lived nor long-lived (rather than "not tested").
# NOTE(review): the ratio is reset only when BOTH the mutant and the reference counts
# are <= 5; a strain with few mutant cells but a well-sampled reference keeps its ratio.
# Confirm that this is the intended rule.

original_data2.loc[original_data2['rls_num'].le(5) & original_data2['ref_rls_num'].le(5), 'rls_ratio'] = 1

In [29]:
# The lifespan ratio is the value carried forward as this dataset's data
original_data2 = original_data2.assign(data=original_data2['rls_ratio'])

In [30]:
# Drop every helper column; keep a one-column DataFrame holding the data
original_data2 = original_data2.loc[:, ['data']].copy()

In [31]:
# Name the row index 'orf'
original_data2 = original_data2.rename_axis(index='orf')

In [32]:
# Sanity check: one row per unique ORF, one data column
original_data2.shape

(4683, 1)

# Prepare the final dataset

In [34]:
# Work on an independent (deep) copy so the processed table stays untouched
data = original_data2.copy(deep=True)

In [35]:
# Dataset IDs, one per column of `data` and in the same order
dataset_ids = [696]
# Restrict and order the datasets table to exactly these IDs
datasets = datasets.reindex(dataset_ids, axis='index')

In [36]:
# Label each data column with the pair (dataset_id, 'value')
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [37]:
# Preview the data with its (dataset_id, data_type) column labels
data.head()

dataset_id,696
data_type,value
orf,Unnamed: 1_level_2
YAL010C,0.768248
YAL012W,1.161593
YAL016W,0.451522
YAL017W,1.048125
YAL023C,0.983755


## Subset to the genes currently in SGD

In [38]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene ID of every ORF in the data; ORFs absent from the table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [39]:
# Attach the gene IDs and drop the ORFs that had no match in the gene table.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the dtype conversion on the next line
# is not an assignment into a slice (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,696
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
8,YAL010C,0.768248
10,YAL012W,1.161593
14,YAL016W,0.451522
15,YAL017W,1.048125
21,YAL023C,0.983755


# Normalize

In [40]:
# Normalize the raw values. normalize_phenotypic_scores is defined outside this notebook
# (presumably in yp_utils) -- TODO confirm what has_tested=True implies there.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [41]:
# Assign proper column names: each normalized column is labelled (dataset_id, 'valuez')
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [42]:
# Blank out the normalized score wherever the raw value is missing.
# The mask is applied positionally (.to_numpy()): `data` columns are labelled 'value' and
# `data_norm` columns 'valuez', so a DataFrame mask would be aligned on column labels,
# match nothing, and silently leave data_norm unchanged.
# Assumes data_norm has the same shape and row order as data (a shape mismatch raises).
data_norm[data.isnull().to_numpy()] = np.nan
# Put raw and normalized values side by side, joined on the (gene_id, orf) index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,696,696
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
8,YAL010C,0.768248,-1.543955
10,YAL012W,1.161593,1.054655
14,YAL016W,0.451522,-3.636391
15,YAL017W,1.048125,0.305037
21,YAL023C,0.983755,-0.120223


# Print out

In [43]:
# Write one tab-separated file per data type: rows are ORFs, columns are dataset names
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].to_numpy()
    # gene_id is only needed for the database; the text export is keyed by ORF alone
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(f'{paper_name}_{data_type}.txt', sep='\t')

# Save to DB

In [44]:
from IO.save_data_to_db3 import *

In [45]:
# Store the raw and normalized values in the database under this paper's PMID.
# NOTE: according to the log printed by this call, all datasets already stored for the PMID
# are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 26456335...
Inserting the new data...


100%|██████████| 1/1 [00:06<00:00,  6.68s/it]

Updating the data_modified_on field...



