In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID and short name of the paper; both are reused further down to build
# file names and to tag the database upload.
paper_pmid, paper_name = 20007368, 'mclaughlin_tumer_2009'

In [3]:
# Load the tab-separated, header-less list of datasets (id + name) for this paper.
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    names=['dataset_id', 'name'],
    header=None,
    sep='\t',
)

In [4]:
# Key the dataset list by its id so it can be reindexed by dataset_id later.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [41]:
# Read the raw Excel sheet; header=None keeps the first sheet row as data
# and gives the columns integer labels.
original_data = pd.read_excel(
    'raw_data/0909777106_0909777106S.xlsx',
    sheet_name='0909777106_0909777106S',
    header=None,
)

In [42]:
# Report the size of the raw table (rows x columns).
print('Original data dimensions: %d x %d' % original_data.shape)

Original data dimensions: 16 x 11


In [43]:
# Preview the first rows of the raw table.
original_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,BY4743,,0.08,(0.15),0.05,(0.09),0.06,(0.32),0.03,(0.31),0.22
1,YHR014W,SPO13,0.74,(0.37),0.8,(0.01),0.84,(0.11),0.64,(0.15),3.02
2,YLR193C,UPS1,0.9,(0.09),0.9,(0.07),0.6,(0.05),0.5,(0.78),2.9
3,YLR262C-A,TMA7,1.02,(0.07),0.94,(0.05),0.98,(0.01),0.46,(0.47),3.4
4,YKR010C,TOF2,0.98,(0.14),0.98,(0.04),0.71,(0.09),0.27,(0.76),2.94


In [44]:
# Column 0 carries the strain / ORF identifiers; keep a string version under 'orf'.
original_data['orf'] = original_data.loc[:, 0].astype(str)

In [45]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably provided by the
# yp_utils.py script run in the first cell); its exact behavior should be confirmed there.
original_data['orf'] = clean_orf(original_data['orf'])

In [46]:
# Translate to ORFs
# NOTE(review): translate_sc is an external helper; presumably it maps gene names to
# systematic ORF names when to='orf' -- confirm against its definition.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [47]:
# Sanity check: list every row whose identifier does not look like an ORF.
# In the recorded run only the BY4743 row shows up.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t])

                  0    1     2       3     4       5     6       7     8  \
index_input                                                                
0            BY4743  NaN  0.08  (0.15)  0.05  (0.09)  0.06  (0.32)  0.03   

                  9    10     orf  
index_input                        
0            (0.31)  0.22  BY4743  


In [48]:
# Use the ORF identifier as the row index.
original_data = original_data.set_index('orf')

In [49]:
# Keep columns 2, 4, 6 and 8 (the plain numeric columns; the columns in between
# hold parenthesized values) and convert them to floats.
original_data = original_data.loc[:, [2, 4, 6, 8]].astype(float)

In [50]:
# Normalize by WT: divide every row by the BY4743 row, column by column, then
# subtract 1 so that the wild type itself becomes 0.
original_data = original_data.div(original_data.loc['BY4743'], axis='columns').sub(1)

In [51]:
# Preview the WT-normalized values.
original_data.head(n=5)

Unnamed: 0_level_0,2,4,6,8
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BY4743,0.0,0.0,0.0,0.0
YHR014W,8.25,15.0,13.0,20.333333
YLR193C,10.25,17.0,9.0,15.666667
YLR262C-A,11.75,17.8,15.333333,14.333333
YKR010C,11.25,18.6,10.833333,8.0


In [52]:
# The BY4743 reference row has served its purpose; remove it from the data.
original_data = original_data.drop(index='BY4743')

In [53]:
# Collapse rows that share the same ORF by averaging them.
original_data = original_data.groupby(level=0).mean()

In [54]:
# Dimensions after dropping the BY4743 row and averaging rows that share an ORF.
original_data.shape

(15, 4)

# Prepare the final dataset

In [55]:
# Work on an independent copy so that original_data stays untouched.
data = original_data.copy(deep=True)

In [56]:
# Dataset ids for this paper; the column labels built below follow this order.
dataset_ids = [603, 5359, 5360, 5361]
datasets = datasets.reindex(dataset_ids, axis='index')

In [57]:
# Label each data column with a (dataset_id, 'value') pair.
lst = [datasets.index.values, ['value'] * len(datasets)]
tuples = [pair for pair in zip(*lst)]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id', 'data_type'])
data.columns = idx

In [58]:
# Preview the relabeled data.
data.head(n=5)

dataset_id,603,5359,5360,5361
data_type,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
YDL054C,12.625,18.8,12.833333,1.0
YDL173W,10.5,17.8,8.833333,-0.666667
YGL110C,10.5,16.0,10.833333,-0.333333
YGL139W,11.25,15.8,9.333333,-0.666667
YGR241C,11.0,18.6,14.0,2.0


## Subset to the genes currently in SGD

In [59]:
# Load the gene table and key it by systematic name so ORFs can be looked up directly.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# ORFs that are absent from the gene table come back as NaN after the reindex.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [60]:
# Attach the gene id looked up for each ORF (NaN where the ORF was not found).
data['gene_id'] = gene_ids
# Keep only the ORFs that were found. The explicit .copy() makes the filtered frame
# an independent object, so the column assignment on the next line is not a
# chained assignment on a slice (which triggers pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,603,5359,5360,5361
Unnamed: 0_level_1,data_type,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1932,YDL054C,12.625,18.8,12.833333,1.0
2052,YDL173W,10.5,17.8,8.833333,-0.666667
2791,YGL110C,10.5,16.0,10.833333,-0.333333
2820,YGL139W,11.25,15.8,9.333333,-0.666667
3186,YGR241C,11.0,18.6,14.0,2.0


# Normalize

In [61]:
# NOTE(review): normalize_phenotypic_scores is an external helper not defined in this
# notebook; the meaning of has_tested=False should be confirmed against its definition.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [62]:
# Assign proper column names: one (dataset_id, 'valuez') pair per normalized column.
lst = [datasets.index.values, ['valuez'] * len(datasets)]
tuples = [pair for pair in zip(*lst)]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id', 'data_type'])
data_norm.columns = idx

In [63]:
# Propagate missing values from the raw data into the normalized data.
# A boolean-DataFrame key is aligned on labels, and `data` is labelled
# (dataset_id, 'value') while `data_norm` is labelled (dataset_id, 'valuez'); used
# as-is the mask would match no column and nothing would be set to NaN. Relabel the
# mask with data_norm's columns (same positional order as the column assignment above)
# so that it actually applies.
data_norm[data.isnull().set_axis(data_norm.columns, axis=1)] = np.nan
# Put raw ('value') and normalized ('valuez') columns side by side.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,603,5359,5360,5361,603,5359,5360,5361
Unnamed: 0_level_1,data_type,value,value,value,value,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1932,YDL054C,12.625,18.8,12.833333,1.0,20.487298,18.991847,19.979264,2.221008
2052,YDL173W,10.5,17.8,8.833333,-0.666667,17.038941,17.981642,13.751961,-1.371799
2791,YGL110C,10.5,16.0,10.833333,-0.333333,17.038941,16.163274,16.865613,-0.653238
2820,YGL139W,11.25,15.8,9.333333,-0.666667,18.256008,15.961233,14.530374,-1.371799
3186,YGR241C,11.0,18.6,14.0,2.0,17.850319,18.789806,21.795561,4.376693


# Print out

In [64]:
# Write one tab-separated file per data type, indexed by ORF only and with the
# dataset names as column headers.
for f in ('value', 'valuez'):
    df = (
        data_all
        .xs(f, level='data_type', axis=1)
        .copy()
        .droplevel('gene_id', axis=0)
    )
    df.columns = datasets['name'].values
    df.to_csv(paper_name + '_' + f + '.txt', sep='\t')

# Save to DB

In [65]:
from IO.save_data_to_db3 import *

In [66]:
# Upload the combined table to the database. Per the messages logged by the call, it
# first deletes all existing datasets for this PMID, inserts the new data and then
# updates the data_modified_on field -- so re-running this cell replaces prior uploads.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/4 [00:00<?, ?it/s]

Deleting all datasets for PMID 20007368...
Inserting the new data...


100%|██████████| 4/4 [00:00<00:00, 12.78it/s]


Updating the data_modified_on field...
