In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed: the PMID (used to locate the
# datasets list and passed to the DB save step) and the short name used as
# the prefix of the output files.
paper_pmid, paper_name = 15239834, 'hartman_tippery_2004'

In [3]:
# Load the list of datasets for this paper: a headerless, tab-separated file
# whose two columns are named here as dataset_id and name.
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset_id so it can be reindexed by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [52]:
# Read the raw measurements from the 'data' sheet of the Excel workbook.
original_data = pd.read_excel(
    io='raw_data/gb-2004-5-7-r49-s7.xlsx',
    sheet_name='data',
)

In [53]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5292 x 12


In [54]:
# Preview the first five rows of the raw table.
original_data.head(n=5)

Unnamed: 0,Plate,Loc,ORF,SGD Gene,C AUGC,50 mM HU AUGC,150 mM HU AUGC,50 mM HU / C,150 mM HU / C,No HU- Growth Index,50mM HU Growth Index,150 mM HU Growth Index
0,1,A01,,,0.0,0.0,0.0,AUGC [0] <600,AUGC [0] <600,,AUGC [0] <600,AUGC [0] <600
1,1,A02,YAL068C,0,4003.73,3900.105,2790.225,0.974118,0.696906,-0.867974,2.0089,1.29009
2,1,A03,YAL067C,SEO1,3795.605,3254.83,2415.39,0.857526,0.636365,-1.713229,0.113024,0.344657
3,1,A04,YAL066W,,3713.92,3518.945,2707.4,0.947502,0.728987,-2.044975,1.5761,1.79108
4,1,A05,YAL065C,0,4006.465,3661.55,2758.44,0.91391,0.688497,-0.856866,1.02988,1.15877


In [55]:
# Add a working 'orf' column holding the ORF labels as strings; the original
# 'ORF' column is left untouched (missing values become the string 'nan').
original_data = original_data.assign(orf=original_data['ORF'].astype(str))

In [56]:
# Eliminate all white spaces & capitalize.
# NOTE(review): clean_orf is not defined in this notebook; presumably it comes
# from the yp_utils.py script executed by %run in the first cell -- confirm there.
original_data['orf'] = clean_orf(original_data['orf'])

In [57]:
# Remove trailing "B" from ORFs.
# Names containing a hyphen are left untouched by this step.
# rstrip (not strip) is used so that only the END of the name is trimmed,
# which is what this step intends; strip('B') would also remove a leading "B".
original_data['orf'] = original_data['orf'].apply(lambda x: x.rstrip('B') if '-' not in x else x)

In [58]:
# Manual corrections for specific ORF names; any name not listed here is
# passed through unchanged.
typo_fixes = {
    'YJR055WC': 'YJR055W',
    'YNL089CC': 'YNL089C',
    'YNL096CC': 'YNL096C',
    'YOR298C-AB': 'YOR298C-A',
}
original_data['orf'] = original_data['orf'].apply(lambda name: typo_fixes.get(name, name))

In [59]:
# Translate to ORFs.
# NOTE(review): translate_sc is not defined in this notebook; presumably it comes
# from the yp_utils.py script executed by %run in the first cell. The exact
# mapping it applies for to='orf' is not visible here -- confirm there.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [60]:
# Sanity check: `t` flags the names accepted by looks_like_orf; print the
# rows that were NOT accepted so they can be inspected before being dropped.
t = looks_like_orf(original_data['orf'])
rejected_rows = original_data.loc[~t, :]
print(rejected_rows)

            Plate  Loc             ORF SGD Gene      C AUGC  50 mM HU AUGC  \
index_input                                                                  
0               1  A01             NaN      NaN    0.000000       0.000000   
85              1  H02             NaN      NaN    0.000000       0.000000   
97              2  A02             NaN      NaN  102.540000       0.000000   
181             2  H02             NaN      NaN    0.275000       0.000000   
194             3  A03             NaN      NaN    0.000000       0.000000   
...           ...  ...             ...      ...         ...            ...   
5287          NaN  NaN    both WT s.d.      NaN  246.227441     300.780129   
5288          NaN  NaN             NaN      NaN         NaN            NaN   
5289          NaN  NaN      WT1 % s.d.      NaN    4.560414       7.489630   
5290          NaN  NaN      WT2 % s.d.      NaN    6.266160       8.423282   
5291          NaN  NaN  both WT % s.d.      NaN    5.838303     

In [61]:
# Keep only the rows whose name passed the ORF check above.
original_data = original_data[t]

In [62]:
# Use the cleaned ORF name as the row index.
original_data = original_data.set_index('orf')

In [63]:
# Keep only the three growth-index columns and coerce them to numbers.
# Cells that cannot be parsed (e.g. the 'AUGC [0] <600' flags visible in the
# raw table) become NaN because of errors='coerce'.
# pd.to_numeric is applied column-wise (the DataFrame.apply default) instead of
# row-wise: the result is the same, but it takes 3 calls instead of one call
# per row.
growth_index_columns = ['No HU- Growth Index','50mM HU Growth Index','150 mM HU Growth Index']
original_data = original_data[growth_index_columns].apply(pd.to_numeric, errors='coerce')

In [64]:
# Short labels, in column order: the 'No HU' (untreated), 50 mM HU and
# 150 mM HU growth indices.
short_labels = ['unt', '50', '150']
original_data.columns = short_labels

In [65]:
# Express each HU condition relative to the untreated control by subtracting
# the 'unt' column from it.
for hu_column in ['50', '150']:
    original_data[hu_column] = original_data[hu_column] - original_data['unt']

In [66]:
# Collapse rows that share the same ORF (the row index) into their mean.
original_data = original_data.groupby(level=0).mean()

In [67]:
# Dimensions (rows, columns) of the table after averaging rows per ORF.
original_data.shape

(4757, 3)

# Prepare the final dataset

In [68]:
# Work on an independent (deep) copy so original_data is left unchanged.
data = original_data.copy(deep=True)

In [69]:
# Dataset ids in the same order as the data columns ('unt', '50', '150'):
# they are assigned to those columns by position in the next step.
dataset_ids = [16186, 52, 53]
datasets = datasets.reindex(dataset_ids)

In [70]:
# Label every data column with a (dataset_id, 'value') pair, taking the
# dataset ids from `datasets` in order.
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id','data_type'])

In [71]:
# Preview the first five rows with the new column labels.
data.head(n=5)

dataset_id,16186,52,53
data_type,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
YAL002W,0.821115,-3.522394,-3.52631
YAL004W,-1.221062,2.983007,2.396385
YAL005C,-2.463795,3.998175,2.699299
YAL007C,0.986876,0.703589,1.226565
YAL008W,-0.955474,1.60133,1.745804


## Subset to the genes currently in SGD

In [72]:
# Load the gene table and key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the table get NaN.
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [73]:
# Attach the looked-up gene id to every row (NaN where the ORF was not found).
data['gene_id'] = gene_ids
# Drop the ORFs that have no gene id.
data = data.loc[data['gene_id'].notnull()]
# With the NaN rows gone, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by the (gene_id, orf) pair; 'orf' is moved out of the old index first.
data = data.reset_index().set_index(['gene_id','orf'])

# Preview the result.
data.head()

Unnamed: 0_level_0,dataset_id,16186,52,53
Unnamed: 0_level_1,data_type,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,YAL002W,0.821115,-3.522394,-3.52631
1863,YAL004W,-1.221062,2.983007,2.396385
4,YAL005C,-2.463795,3.998175,2.699299
5,YAL007C,0.986876,0.703589,1.226565
6,YAL008W,-0.955474,1.60133,1.745804


# Normalize

In [74]:
# NOTE(review): normalize_phenotypic_scores is not defined in this notebook;
# presumably it comes from the yp_utils.py script executed by %run in the first
# cell. What has_tested=True changes is not visible here -- confirm there.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [75]:
# Assign proper column names: every normalized column becomes a
# (dataset_id, 'valuez') pair, taking the dataset ids from `datasets` in order.
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id','data_type'])

In [76]:
# Propagate missing values: wherever the raw value is NaN, the normalized
# value must be NaN as well.
# `data` columns are labelled (dataset_id, 'value') while `data_norm` columns
# are labelled (dataset_id, 'valuez'). pandas aligns a boolean DataFrame mask
# on its labels before applying it, so passing data.isnull() directly matches
# no column of data_norm and silently masks nothing. Relabel the mask with
# data_norm's columns first (both frames list the datasets in the same order);
# rows are still matched on the index.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw ('value') and normalized ('valuez') columns side by side.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16186,52,53,16186,52,53
Unnamed: 0_level_1,data_type,value,value,value,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2,YAL002W,0.821115,-3.522394,-3.52631,0.364591,-1.057966,-1.295498
1863,YAL004W,-1.221062,2.983007,2.396385,-0.265152,0.758304,0.530081
4,YAL005C,-2.463795,3.998175,2.699299,-0.648372,1.041733,0.623449
5,YAL007C,0.986876,0.703589,1.226565,0.415707,0.121904,0.169502
6,YAL008W,-0.955474,1.60133,1.745804,-0.183253,0.372548,0.329549


# Print out

In [77]:
# Write one tab-separated file per data type ('value' and 'valuez'), with
# columns named after the datasets and rows labelled by ORF only.
for f in ('value', 'valuez'):
    out_path = paper_name + '_' + f + '.txt'
    df = (
        data_all
        .xs(f, level='data_type', axis=1)
        .copy()
        .droplevel('gene_id', axis=0)
    )
    df.columns = datasets['name'].values
    df.to_csv(out_path, sep='\t')

# Save to DB

In [78]:
from IO.save_data_to_db3 import *

In [79]:
# Store the combined table in the database under this paper's PMID.
# Per the logged output of this cell, the call first deletes all existing
# datasets for the PMID, then inserts the new data and updates the
# data_modified_on field -- i.e. it replaces previous content for this paper.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/3 [00:00<?, ?it/s]

Deleting all datasets for PMID 15239834...
Inserting the new data...


100%|██████████| 3/3 [00:21<00:00,  7.04s/it]

Updating the data_modified_on field...



