In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper processed by this notebook:
# the PubMed ID (used to locate the datasets list and to key the DB save)
# and the short name used as the prefix of the exported files.
paper_pmid = 21978764
paper_name = 'svensson_samson_2011'

In [3]:
# List of datasets for this paper: a tab-separated file without a header row,
# whose name is keyed by the PMID; the two columns are labelled here.
datasets_path = f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset_id so rows can be selected by id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [28]:
# Raw data workbook; only the sheet with the GI50 / R2 values is read
original_data = pd.read_excel(
    'raw_data/1752-0509-5-157-s1.xlsx',
    sheet_name='2. Gi50 and R2 all strains',
)

In [29]:
# Report the size of the raw table (rows x columns)
print('Original data dimensions: %d x %d' % original_data.shape)

Original data dimensions: 6528 x 15


In [30]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,position,orf,name,GI50,GI50rep1,r2rep1,GI50rep2,r2rep2,GI50rep3,r2rep3,GI50rep4,r2rep4,score2004,lag time (h),MMS-dep. lag diff (h/%MMS)
0,1A1,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,1A2,YAL068C,,0.013136,0.011484,0.933407,0.016368,0.911723,0.012467,0.778033,0.0,0.0,0.0,10.4,800.0
2,1A3,YAL067C,SEO1,0.014263,0.01337,0.914322,0.015285,0.857899,0.015679,0.508828,0.0,0.0,0.0,12.0,1000.0
3,1A4,YAL066W,,0.013344,0.012112,0.865901,0.016344,0.870376,0.012335,0.771413,0.0,0.0,0.0,12.0,1000.0
4,1A5,YAL065C,,0.011671,0.010683,0.928768,0.013273,0.931253,0.01135,0.868911,0.0,0.0,0.0,11.2,1200.0


In [31]:
# Remove the DAMP strains. Personal communication from Peter Svensson: deletions are on plates 1-57, DAMPs are on plates 301-311

In [32]:
import re

In [33]:
def _plate_of(position):
    """Return the plate number: the first run of digits in a position label such as '57H12'."""
    return int(re.findall(r'\d+', position)[0])


original_data['plate'] = original_data['position'].apply(_plate_of)

In [35]:
# Keep the deletion strains only (plates 1-57); the DAMP strains on plates 301-311 are dropped.
# .copy() makes the result an independent frame: the 'orf' column is reassigned several
# times in the following cells, and doing that on a bare .loc slice raises
# SettingWithCopyWarning.
original_data = original_data.loc[original_data['plate'] <= 57, :].copy()

In [36]:
# Cast the ORF column to str (presumably so the string-based cleaning below can run on
# every row). Missing ORFs become the literal string 'nan', which is why the rows that
# fail the ORF check later on are printed with orf == 'NAN'.
original_data['orf'] = original_data['orf'].astype(str)

In [37]:
# Normalize the ORF names: remove all whitespace and upper-case them.
# clean_orf is not defined in this notebook (presumably loaded by the %run of yp_utils.py).
original_data['orf'] = clean_orf(original_data['orf'])

In [38]:
# Corrections for malformed ORF names found in the raw file
# (e.g. the letter 'O' in place of a zero, or a misplaced suffix letter).
typo_fixes = {
    'YAR002AW': 'YAR002W',
    'YOLO57W': 'YOL057W',
    'YKLO72W': 'YKL072W',
    'YJL206-A': 'YJL206C',
    'YLR287-A': 'YLR287C-A',
    'YFL033AC': 'YFL033C',
    'YOLO62C': 'YOL062C',
}
# Swap in the corrected name where one exists; every other value passes through unchanged
original_data['orf'] = original_data['orf'].apply(lambda orf: typo_fixes.get(orf, orf))

In [39]:
# Translate the identifiers to ORF names (to='orf').
# translate_sc is an external helper, presumably from yp_utils.py; the result is
# validated in the next cell.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [40]:
# Sanity check on the translation: flag the rows accepted by looks_like_orf and
# print the rejected ones for inspection.
# `t` keeps its name because the next cell uses it to drop the rejected rows.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t, :])

            position  orf name      GI50  GI50rep1    r2rep1  GI50rep2  \
index_input                                                              
0                1A1  NAN  NaN  0.000000  0.000000  0.000000  0.000000   
22              1B11  NAN  NaN  0.010322  0.009724  0.987931  0.010997   
23              1B12  NAN  NaN  0.014909  0.010897  0.982643  0.000000   
38               1D3  NAN  NaN  0.009881  0.010232  0.876750  0.037833   
39               1D4  NAN  NaN  0.012224  0.011509  0.925862  0.013034   
...              ...  ...  ...       ...       ...       ...       ...   
5467            57H8  NAN  NaN  0.000000  0.000000  0.000000  0.000000   
5468            57H9  NAN  NaN  0.000000  0.000000  0.000000  0.000000   
5469           57H10  NAN  NaN  0.000000  0.000000  0.000000  0.000000   
5470           57H11  NAN  NaN  0.000000  0.000000  0.000000  0.000000   
5471           57H12  NAN  NaN  0.000000  0.000000  0.000000  0.000000   

               r2rep2  GI50rep3    r2

In [41]:
# Keep only the rows whose name passed the ORF check (mask `t`).
# .copy() detaches the result from the frame it was sliced from, so adding the
# 'data' column in the next cell does not raise SettingWithCopyWarning.
original_data = original_data.loc[t, :].copy()

In [42]:
# Use the 'GI50' column (not the per-replicate GI50rep1-4 columns) as the phenotype value
original_data['data'] = original_data['GI50']

In [43]:
# Index the table by ORF name
original_data = original_data.set_index('orf')

In [44]:
# Keep only the phenotype column, as an independent single-column frame
original_data = original_data.loc[:, ['data']].copy()

In [45]:
# Collapse ORFs that occur more than once into a single row holding the mean of their values
original_data = original_data.groupby(level=0).mean()

In [46]:
# Size of the table after averaging: one row per unique ORF, one value column
original_data.shape

(4756, 1)

# Prepare the final dataset

In [47]:
# Work on a copy so that original_data is left untouched by the steps below
data = original_data.copy()

In [48]:
# IDs of the datasets produced from this table, in column order.
# Restrict the datasets table to exactly these IDs, in this order.
dataset_ids = [28]
datasets = datasets.reindex(index=dataset_ids)

In [49]:
# Label every column with a (dataset_id, data_type) pair; the raw scores get data_type 'value'
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [50]:
# Preview the table with its new (dataset_id, data_type) column labels
data.head()

dataset_id,28
data_type,value
orf,Unnamed: 1_level_2
YAL002W,0.011609
YAL004W,0.01186
YAL005C,0.011824
YAL007C,0.011482
YAL008W,0.010174


## Subset to the genes currently in SGD

In [51]:
# Gene table re-keyed by systematic name, so the ORFs in `data` can be mapped to gene ids
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# ORFs that are absent from the gene table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 26


In [52]:
# Attach the gene ids, drop the ORFs that have none, and index the table by (gene_id, orf)
data['gene_id'] = gene_ids
# .copy() gives an independent frame, so the astype assignment below does not write
# into a slice of the previous `data` (which raises SettingWithCopyWarning)
data = data.loc[data['gene_id'].notnull()].copy()
# With the missing ids removed, the column can be cast to int
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,28
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,0.011609
1863,YAL004W,0.01186
4,YAL005C,0.011824
5,YAL007C,0.011482
6,YAL008W,0.010174


# Normalize

In [53]:
# Normalize the scores with the shared helper (defined outside this notebook).
# NOTE(review): has_tested=True presumably tells the helper that the set of tested
# strains is known, so missing values stay missing — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [54]:
# Label the normalized columns with (dataset_id, data_type) pairs, using data_type 'valuez'
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [55]:
# Blank out the normalized score wherever the raw score is missing.
# A boolean DataFrame mask is aligned on labels before it is applied. `data` and
# `data_norm` carry different column labels ('value' vs 'valuez'), so data.isnull()
# used directly matches no column of data_norm and nothing gets masked.
# Relabel the mask with data_norm's columns (same datasets, same order) so it lines up;
# rows are still aligned by index.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Raw and normalized scores side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,28,28
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0.011609,-0.057906
1863,YAL004W,0.01186,0.009507
4,YAL005C,0.011824,4.7e-05
5,YAL007C,0.011482,-0.092228
6,YAL008W,0.010174,-0.444116


# Print out

In [56]:
# Write one tab-separated file per data type (raw 'value' and normalized 'valuez'):
# columns are labelled with the dataset names and rows are keyed by ORF only.
for data_type in ['value', 'valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(f'{paper_name}_{data_type}.txt', sep='\t')

# Save to DB

In [57]:
from IO.save_data_to_db3 import *

In [58]:
# Store the combined raw + normalized table in the database under this PMID.
# Per the log output below, this first deletes all existing datasets for the PMID
# and then inserts the new data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 21978764...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.19s/it]

Updating the data_modified_on field...



