In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed:
#   paper_pmid - PubMed ID, used to locate the datasets list and when saving to the DB
#   paper_name - short name, used as the prefix of the exported .txt files
paper_pmid = 25287021
paper_name = 'pereira_domingues_2014' 

In [3]:
# Tab-separated, header-less list of the datasets curated for this paper (id + name)
datasets = pd.read_csv(
    f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by its id so rows can later be selected with reindex()
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [7]:
# Supplementary workbooks and, in the same order, the sheet to read from each of them
files = [
    '10295_2014_1519_MOESM1_ESM.xlsx',
    '10295_2014_1519_MOESM2_ESM.xlsx',
]
sheets = [
    'Table S1',
    'Table S2',
]

In [12]:
# Build one score column per supplementary table, indexed by systematic ORF name.
# Files and sheets are paired positionally, so they must have the same length.
assert len(files) == len(sheets), 'each file needs a matching sheet name'
original_data_list = []
for f, sheet in zip(files, sheets):
    # The first 9 rows of the sheet are skipped; columns are addressed by position (header=None)
    original_data = pd.read_excel('raw_data/' + f, sheet_name=sheet, skiprows=9, header=None)
    print('Original data dimensions: %d x %d' % (original_data.shape))

    # Column 0 holds the gene identifier; clean it and translate it to an ORF name
    original_data['genes'] = original_data[0].astype(str)
    original_data['genes'] = clean_genename(original_data['genes'])
    original_data['orf'] = translate_sc(original_data['genes'], to='orf')

    # Show the rows that are about to be discarded (e.g. blank rows and category headings)
    t = looks_like_orf(original_data['orf'])
    print(original_data.loc[~t,])

    # Take an explicit copy of the filtered rows: the assignments below must write to an
    # independent frame rather than to a slice of the unfiltered one (SettingWithCopy)
    original_data = original_data.loc[t,:].copy()

    # Score = minus the length of the text in column 10; empty cells score 0
    original_data.loc[original_data[10].isnull(),10] = ''
    original_data['data'] = original_data[10].apply(lambda x: -len(x))

    original_data.set_index('orf', inplace=True)
    original_data = original_data[['data']].copy()

    # Average the scores of ORFs that appear more than once
    original_data = original_data.groupby(original_data.index).mean()

    original_data_list.append(original_data)

Original data dimensions: 291 x 12
                                                          0    1   2   3   4  \
index_input                                                                    
7                                                       NaN  NaN NaN NaN NaN   
8                            Carbohydrate/energy metabolism  NaN NaN NaN NaN   
19                                                      NaN  NaN NaN NaN NaN   
20                                     Aminoacid metabolism  NaN NaN NaN NaN   
33                                                      NaN  NaN NaN NaN NaN   
34                                       Vitamin metabolism  NaN NaN NaN NaN   
42                                                      NaN  NaN NaN NaN NaN   
43                                    Nucleotide metabolism  NaN NaN NaN NaN   
47                                                      NaN  NaN NaN NaN NaN   
48                                            Ion transport  NaN NaN NaN NaN   
59   

In [59]:
# Place the per-table scores side by side (one column per table), aligned on ORF;
# ORFs that are absent from one of the tables get NaN in that column
original_data = pd.concat(original_data_list, axis='columns')

In [60]:
# Name the index 'orf': a later reset_index()/set_index(['gene_id','orf']) relies on this name
original_data.index.name = 'orf'

In [61]:
# ORFs missing from one of the tables (NaN after the concat) are given a score of 0
original_data = original_data.fillna(0)

In [62]:
# Preview the merged scores (one 'data' column per supplementary table)
original_data.head()

Unnamed: 0_level_0,data,data
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
YBR131W,-1.0,0.0
YBR147W,-2.0,-2.0
YBR171W,-1.0,-1.0
YBR173C,-2.0,-1.0
YBR176W,-1.0,0.0


In [63]:
# Sanity check: (number of ORFs listed in either table, number of tables)
original_data.shape

(286, 2)

# Load & process tested strains

In [64]:
# Workbook sheet from which the list of tested strains is taken
tested = pd.read_excel(
    'raw_data/chemogenomics.xlsx',
    sheet_name='hom.z_tdist_pval_nm.smallmol.co',
)

In [65]:
# The source column header is literally 'Orf ' (with a trailing space), as shown in the
# column listing printed further down; keep the space or the lookup raises a KeyError
tested['orf'] = tested['Orf '].astype(str)

In [66]:
# Normalize the raw ORF strings with the shared clean_orf helper
# (defined outside this notebook; its exact rules are not visible here)
tested['orf'] = clean_orf(tested['orf'])

In [67]:
# Translate the identifiers to ORF names (to='orf') with the shared translate_sc helper,
# the same call that is applied to the gene names of the supplementary tables
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [68]:
# `t` is used as a boolean row mask: print every row whose identifier was NOT recognised
# as an ORF so that it can be inspected (an empty frame means nothing was rejected)
t = looks_like_orf(tested['orf'])
print(tested.loc[~t,])

Empty DataFrame
Columns: [Orf , Gene, Essential phenotype in rich media (Giaever 2002), Slow heterozygote phenotype in rich media (Deutschbauer 2005), Slow homozygote phenotype in rich media (Deutschbauer 2005), GO terms (biological process : molecular function : cellular component), Number of unique conditions of sensitivity (p < .01) out of 145 homozygous small molecule conditions 
(yellow = MDR), Names of conditions of sensitivity 
(p < .01), Note: some of the compound names were updated (corrected or clarified) after performing this MDR analysis, causing very slight differences between this document and one generated directly from the supplemental fitness defect scores (which contain the updated names)., Unnamed: 9, Unnamed: 10, Unnamed: 11, Unnamed: 12, Unnamed: 13, Unnamed: 14, Unnamed: 15, Unnamed: 16, Unnamed: 17, Unnamed: 18, Unnamed: 19, Unnamed: 20, Unnamed: 21, Unnamed: 22, Unnamed: 23, Unnamed: 24, Unnamed: 25, Unnamed: 26, Unnamed: 27, Unnamed: 28, Unnamed: 29, Unnamed: 3

In [69]:
# Collapse to the sorted array of distinct tested ORFs (np.unique sorts its result)
tested_orfs = np.unique(tested['orf'].values)

In [70]:
# ORFs that have a reported score but do not appear in the tested-strains list.
# Membership is checked against a set: `in` on a NumPy array rescans the whole array
# for every ORF, whereas a set lookup is constant time and gives the same answer.
tested_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_set]
missing

['YCR095W-A',
 'YGL024W',
 'YGL173C',
 'YHR141C',
 'YKR004C',
 'YMR231W',
 'YNL153C',
 'YNL170C',
 'YPL249C-A',
 'YPR141C',
 'YGL007C-A',
 'YGL188C-A',
 'YHL015W-A',
 'YNL037C',
 'YOL145C']

In [71]:
# Add the missing strains
# Only append ORFs that are not already present, so that re-running this cell does not
# add them a second time (duplicate labels would duplicate rows in the later reindex)
already_tested = set(tested_orfs)
tested_orfs = list(tested_orfs) + [orf for orf in missing if orf not in already_tested]

In [72]:
# Expand the matrix to every tested ORF; strains without a reported score are filled with 0
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [73]:
# Work on a copy so that relabelling the columns below leaves original_data untouched
data = original_data.copy()

In [74]:
# Dataset ids for the two score columns of `data`, in column order.
# reindex() keeps only these rows of the datasets list, in this order; note that an id
# absent from the list would silently produce a NaN row rather than raise.
dataset_ids = [751, 752]
datasets = datasets.reindex(index=dataset_ids)

In [75]:
# Label each score column as (dataset_id, 'value'); labels are assigned positionally,
# following the row order of `datasets`
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [76]:
# Preview the matrix with its (dataset_id, data_type) column labels
data.head()

dataset_id,751,752
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL002W,0.0,0.0
YAL004W,0.0,0.0
YAL005C,0.0,0.0
YAL007C,0.0,0.0
YAL008W,0.0,0.0


## Subset to the genes currently in SGD

In [77]:
# Gene table keyed by systematic name, keeping the original 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in `data`; ORFs absent from the gene table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [78]:
# Attach the gene ids (NaN where the ORF is not in the gene table) and drop rows without one
data['gene_id'] = gene_ids
# Explicit copy: the dtype conversion below must write to an independent frame,
# not to a filtered slice of the previous one (SettingWithCopy)
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaN rows gone the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,751,752
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0.0,0.0
1863,YAL004W,0.0,0.0
4,YAL005C,0.0,0.0
5,YAL007C,0.0,0.0
6,YAL008W,0.0,0.0


# Normalize

In [79]:
# Normalize the scores with the shared helper (defined outside this notebook).
# NOTE(review): has_tested=True presumably signals that `data` contains all tested
# strains and not only the hits - confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [80]:
# Assign proper column names
# Each normalized column is labelled (dataset_id, 'valuez'), positionally following `datasets`
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [81]:
# Propagate missing raw values to the normalized scores.
# A boolean-DataFrame mask is aligned on row AND column labels before it is applied.
# `data` is labelled (id, 'value') and `data_norm` (id, 'valuez'), so the raw mask shares
# no column label with `data_norm` and would silently mask nothing. Relabel the mask with
# the normalized column labels first (both frames list the datasets in the same positional
# order; a differing column count raises here instead of passing unnoticed).
null_mask = data.isnull()
null_mask.columns = data_norm.columns
data_norm[null_mask] = np.nan

# Raw ('value') and normalized ('valuez') columns side by side, joined on (gene_id, orf)
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,751,752,751,752
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,0.0,0.0,0.0,0.0
1863,YAL004W,0.0,0.0,0.0,0.0
4,YAL005C,0.0,0.0,0.0,0.0
5,YAL007C,0.0,0.0,0.0,0.0
6,YAL008W,0.0,0.0,0.0,0.0


# Print out

In [82]:
# Write one tab-separated file per data type, with dataset names as column headers
# and the ORF as the only index column
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(f'{paper_name}_{data_type}.txt', sep='\t')

# Save to DB

In [83]:
from IO.save_data_to_db3 import *

In [84]:
# Destructive: per the log printed below, this first deletes all datasets stored for this
# PMID, then inserts the new data and updates the data_modified_on field
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 25287021...
Inserting the new data...


100%|██████████| 2/2 [00:15<00:00,  7.55s/it]

Updating the data_modified_on field...



