In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PMID of the source paper; used below to build the datasets-list path and passed to save_data_to_db.
paper_pmid = 22094260
# Prefix of the tab-separated files written in the "Print out" section.
paper_name = 'skrtic_schimmer_2011'

In [3]:
# Tab-separated, header-less two-column listing of the datasets for this paper.
# NOTE(review): the first column is labelled 'pmid', yet it is later reindexed with
# dataset IDs -- the label looks like a misnomer; confirm against the file.
datasets_file = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(datasets_file, sep='\t', header=None, names=['pmid', 'name'])

In [4]:
# Key the datasets table by its first column so rows can be looked up by ID later on.
datasets = datasets.set_index('pmid')

# Load & process the data

In [27]:
# Read the tab-separated raw summary file. As the preview below shows, its first data row
# is a second header line ('Composite Element REF', 'strain::batch:tag', ...), so the
# columns are not numeric yet at this point.
original_data = pd.read_csv('raw_data/het_damp.rawsummary', sep='\t')

In [28]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 13891 x 11


In [29]:
# Preview the first rows of the raw table.
original_data.head()

Unnamed: 0,Hybridization REF,10_11_24_YPGE_DMSO,10_11_24_YPGE_DMSO_2,10_11_24_YPGE_chloramph_0.79,10_11_24_YPGE_chloramph_0.99,10_11_24_YPGE_doxorub.12.5,10_11_24_YPGE_linezol_47.1,10_12_10_tigecyc51.5uM,10_12_10_tigecyc64.4uM,10_12_10_tigecyc80.5uM,10_12_10_tigecycDMSOctrl
0,Composite Element REF,strain::batch:tag,strain::batch:tag,strain::batch:tag,strain::batch:tag,strain::batch:tag,strain::batch:tag,strain::batch:tag,strain::batch:tag,strain::batch:tag,strain::batch:tag
1,YDL227C::ctrl_1:uptag,80.4,75.2,77.6,71,72,77.8,86,91.6,92.2,89.8
2,YDL171C::chr4_2:dntag,2044.6,1932.6,1823.8,1382.4,1464.6,1791.8,2499,3091.2,3224.2,3075.8
3,YDL172C::chr4_2:dntag,5179.4,4082.8,3080.4,2842.2,2433.8,3880.4,5492,4988,5252.2,5403.8
4,YGR044C::chr7_4:dntag,2220.2,2562.2,1553,1351,2383.4,1813.4,3595.6,3123.2,3048,2250.8


In [30]:
# Drop every row whose 'Hybridization REF' label contains 'DAMP', then show the new size.
is_damp = original_data['Hybridization REF'].str.contains('DAMP')
original_data = original_data.loc[~is_damp]
original_data.shape

(11883, 11)

In [31]:
# Now, extract the ORF: the part of 'Hybridization REF' before the first ':'
# (e.g. 'YDL227C::ctrl_1:uptag' -> 'YDL227C').
# Splitting is used instead of slicing up to str.find(':'), because find() returns -1
# when there is no colon and the slice x[0:-1] would then silently drop the label's
# last character (this happens for the 'Composite Element REF' row).
original_data['orf'] = original_data['Hybridization REF'].apply(lambda ref: ref.split(':', 1)[0])

In [32]:
# Cast the extracted identifiers to str before handing them to the helper functions below.
original_data['orf'] = original_data['orf'].astype(str)

In [33]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably loaded by the %run of
# yp_utils.py) -- the description above is taken from the original comment; verify against the helper.
original_data['orf'] = clean_orf(original_data['orf'])

In [34]:
# Translate to ORFs
# NOTE(review): translate_sc is defined outside this notebook; the printed frame in the next
# cell shows an index named 'index_input' from this point on -- presumably set by one of the
# helpers; confirm.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [35]:
# Make sure everything translated ok
# `t` is the per-row result of looks_like_orf; it is used as a boolean row mask here and
# again in the next cell, so the name must stay as is.
t = looks_like_orf(original_data['orf'])
# Print the rows that did NOT pass the check so they can be inspected before being dropped.
print(original_data.loc[~t,])

                   Hybridization REF 10_11_24_YPGE_DMSO 10_11_24_YPGE_DMSO_2  \
index_input                                                                    
0              Composite Element REF  strain::batch:tag    strain::batch:tag   
3224         YBR160W_AS::shawn:dntag             5171.6               5256.6   
3234         YBR160W_AS::shawn:uptag             2233.2               2511.6   

            10_11_24_YPGE_chloramph_0.79 10_11_24_YPGE_chloramph_0.99  \
index_input                                                             
0                      strain::batch:tag            strain::batch:tag   
3224                              5717.4                       5190.8   
3234                                2792                       2867.4   

            10_11_24_YPGE_doxorub.12.5 10_11_24_YPGE_linezol_47.1  \
index_input                                                         
0                    strain::batch:tag          strain::batch:tag   
3224                      

In [36]:
# Keep only the rows that passed the looks_like_orf check (drops the rows printed above).
original_data = original_data.loc[t,]

In [37]:
# Use the ORF as the row index; 'orf' stops being a regular column from here on.
original_data = original_data.set_index('orf')

In [38]:
# Convert the measurement columns to numbers. The first column ('Hybridization REF')
# holds text labels and is skipped.
measurement_columns = original_data.columns[1:]
for column_name in measurement_columns:
    original_data[column_name] = pd.to_numeric(original_data[column_name])

In [39]:
# Row-wise mean of the two YPGE DMSO control columns.
dmso_control_columns = ['10_11_24_YPGE_DMSO', '10_11_24_YPGE_DMSO_2']
original_data['YPGE_DMSO_avg'] = original_data[dmso_control_columns].mean(axis=1)

In [40]:
# Express every treatment as a ratio to its control.
# Each pair is (name of the new ratio column, name of the treatment column).

# The 10_11_24_YPGE_* treatments are divided by the averaged YPGE DMSO control.
ypge_control = original_data['YPGE_DMSO_avg']
ypge_pairs = [
    ('10_11_24_YPGE_chloramph_0.79_norm', '10_11_24_YPGE_chloramph_0.79'),
    ('10_11_24_YPGE_chloramph_0.99_norm', '10_11_24_YPGE_chloramph_0.99'),
    ('10_11_24_YPGE_doxorub.12.5_norm', '10_11_24_YPGE_doxorub.12.5'),
    ('10_11_24_YPGE_linezol_47.1_norm', '10_11_24_YPGE_linezol_47.1'),
]
for ratio_column, treatment_column in ypge_pairs:
    original_data[ratio_column] = original_data[treatment_column] / ypge_control

# The 10_12_10_tigecyc* treatments are divided by their own DMSO control column.
tigecyc_control = original_data['10_12_10_tigecycDMSOctrl']
tigecyc_pairs = [
    ('10_12_10_tigecyc51.5uM_norm', '10_12_10_tigecyc51.5uM'),
    ('10_12_10_tigecyc64.4uM_norm', '10_12_10_tigecyc64.4uM'),
    ('10_12_10_tigecyc80.5uM_norm', '10_12_10_tigecyc80.5uM'),
]
for ratio_column, treatment_column in tigecyc_pairs:
    original_data[ratio_column] = original_data[treatment_column] / tigecyc_control


In [41]:
# Ratio columns carried forward, in the order the rest of the notebook relies on.
cols_to_keep = [
    '10_11_24_YPGE_chloramph_0.79_norm',
    '10_11_24_YPGE_chloramph_0.99_norm',
    '10_11_24_YPGE_doxorub.12.5_norm',
    '10_11_24_YPGE_linezol_47.1_norm',
    '10_12_10_tigecyc51.5uM_norm',
    '10_12_10_tigecyc64.4uM_norm',
    '10_12_10_tigecyc80.5uM_norm',
]

In [42]:
# Keep only the ratio columns; raw intensities, controls and the text label column are dropped.
original_data = original_data[cols_to_keep]

In [43]:
# Several rows can share one ORF (e.g. separate uptag and dntag rows); collapse them
# into a single row per ORF by averaging.
original_data = original_data.groupby(level=0).mean()

In [44]:
# Size of the table after averaging: one row per ORF, one column per treatment.
original_data.shape

(5895, 7)

In [45]:
# Preview the processed ratios.
original_data.head()

Unnamed: 0_level_0,10_11_24_YPGE_chloramph_0.79_norm,10_11_24_YPGE_chloramph_0.99_norm,10_11_24_YPGE_doxorub.12.5_norm,10_11_24_YPGE_linezol_47.1_norm,10_12_10_tigecyc51.5uM_norm,10_12_10_tigecyc64.4uM_norm,10_12_10_tigecyc80.5uM_norm
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
YAL001C,1.176373,1.045644,0.900349,0.862081,0.847753,0.976785,0.936862
YAL002W,0.943034,0.964386,1.204338,1.134889,1.089684,0.995914,0.95898
YAL003W,1.013223,0.973051,0.863083,0.98875,1.160143,1.051605,1.115577
YAL004W,0.925641,1.069155,0.843608,1.158093,1.063357,1.164821,1.037537
YAL005C,0.831773,0.869758,0.67364,0.75551,0.915132,0.983192,0.729415


# Prepare the final dataset

In [46]:
# Work on a copy so the relabelling below does not touch original_data.
data = original_data.copy()

In [47]:
# Dataset IDs, listed in the same order as the columns of `data`: the IDs are attached
# to the columns by position when the column index is rebuilt in the next cell.
dataset_ids = [16572,16591,16570,16573,16571,16592,16593]

# reindex() silently creates all-NaN rows for labels it cannot find, which would later
# surface as NaN column names in the exported files. Fail loudly instead.
unknown_ids = [dataset_id for dataset_id in dataset_ids if dataset_id not in datasets.index]
if unknown_ids:
    raise ValueError('Dataset IDs missing from the datasets list: %s' % unknown_ids)

# Restrict and reorder the datasets table to exactly these IDs.
datasets = datasets.reindex(index=dataset_ids)

In [48]:
# Relabel the columns with a two-level index (dataset_id, data_type); every column of
# the raw table gets data_type 'value'. Labels are attached by position.
value_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_labels, names=['dataset_id','data_type'])

In [49]:
# Preview the table with its new (dataset_id, data_type) column labels.
data.head()

dataset_id,16572,16591,16570,16573,16571,16592,16593
data_type,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
YAL001C,1.176373,1.045644,0.900349,0.862081,0.847753,0.976785,0.936862
YAL002W,0.943034,0.964386,1.204338,1.134889,1.089684,0.995914,0.95898
YAL003W,1.013223,0.973051,0.863083,0.98875,1.160143,1.051605,1.115577
YAL004W,0.925641,1.069155,0.843608,1.158093,1.063357,1.164821,1.037537
YAL005C,0.831773,0.869758,0.67364,0.75551,0.915132,0.983192,0.729415


## Subset to the genes currently in SGD

In [50]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene ID of every ORF in `data`, in row order; ORFs absent from the gene
# table come back as NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [51]:
# Attach the gene IDs looked up in the previous cell (they are in the same row order as `data`).
data['gene_id'] = gene_ids
# Drop the ORFs without a gene ID. The explicit copy makes the filtered frame independent
# of the original, so the dtype conversion below is a plain column assignment rather than
# a write to a slice (the pattern behind pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone, the IDs can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16572,16591,16570,16573,16571,16592,16593
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,YAL001C,1.176373,1.045644,0.900349,0.862081,0.847753,0.976785,0.936862
2,YAL002W,0.943034,0.964386,1.204338,1.134889,1.089684,0.995914,0.95898
3,YAL003W,1.013223,0.973051,0.863083,0.98875,1.160143,1.051605,1.115577
1863,YAL004W,0.925641,1.069155,0.843608,1.158093,1.063357,1.164821,1.037537
4,YAL005C,0.831773,0.869758,0.67364,0.75551,0.915132,0.983192,0.729415


# Normalize

In [52]:
# Compute the normalized scores from the raw values with the project helper.
# NOTE(review): normalize_phenotypic_scores and the meaning of has_tested=True are defined
# outside this notebook -- check the helper before changing this call.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [53]:
# Label the normalized columns (dataset_id, 'valuez'), mirroring the (dataset_id, 'value')
# labels of the raw table. Labels are attached by position.
valuez_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_labels, names=['dataset_id','data_type'])

In [54]:
# Blank out the normalized score wherever the raw value is missing.
# Assigning through a boolean DataFrame aligns the mask on row AND column labels. The raw
# columns are labelled (dataset_id, 'value') while data_norm's are (dataset_id, 'valuez'),
# so an unmodified data.isnull() mask shares no column with data_norm and the assignment
# changes nothing. Relabel the mask's columns by position (the same way data_norm's own
# columns were assigned) so the two frames line up; rows are still matched by label.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan
# Put raw values and normalized scores side by side in one table.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16572,16591,16570,16573,16571,16592,16593,16572,16591,16570,16573,16571,16592,16593
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1,YAL001C,1.176373,1.045644,0.900349,0.862081,0.847753,0.976785,0.936862,0.878919,0.802283,-0.165168,-0.4254,-0.608465,-0.119822,-0.216708
2,YAL002W,0.943034,0.964386,1.204338,1.134889,1.089684,0.995914,0.95898,-0.027454,0.499078,0.979614,0.42236,0.126422,-0.075134,-0.167173
3,YAL003W,1.013223,0.973051,0.863083,0.98875,1.160143,1.051605,1.115577,0.245185,0.531413,-0.305506,-0.031772,0.340446,0.054968,0.183544
1863,YAL004W,0.925641,1.069155,0.843608,1.158093,1.063357,1.164821,1.037537,-0.095016,0.890012,-0.378845,0.494469,0.046451,0.319456,0.008764
4,YAL005C,0.831773,0.869758,0.67364,0.75551,0.915132,0.983192,0.729415,-0.459632,0.145989,-1.018923,-0.756571,-0.403794,-0.104856,-0.681312


# Print out

In [55]:
# Write one tab-separated file per data type: <paper_name>_value.txt and <paper_name>_valuez.txt.
for data_type in ['value','valuez']:
    # Select the columns of this data type; the column index is reduced to dataset_id.
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Replace the dataset IDs with the dataset names (matched by position).
    table.columns = datasets['name'].values
    # Keep only the ORF as the row label in the exported file.
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [56]:
from IO.save_data_to_db3 import *

In [57]:
# Store the combined value / valuez table in the database under this paper's PMID.
# NOTE(review): according to the recorded output below, the call first deletes all existing
# datasets for the PMID and then inserts the new data -- confirm that is intended before re-running.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 22094260...


  0%|          | 0/7 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 7/7 [00:57<00:00,  8.22s/it]

Updating the data_modified_on field...



