In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers for the paper processed in this notebook.
# paper_pmid is used to locate the dataset list file and to key the database upload.
paper_pmid = 19325107
# paper_name is the prefix of the exported value/valuez text files.
paper_name = 'jonikas_schuldiner_2009'

In [3]:
# Tab-separated, header-less list of the datasets registered for this paper:
# first column is the dataset id, second column is the dataset name.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by dataset id so that datasets can be selected by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read supplementary Table S1 from the raw spreadsheet.
original_data = pd.read_excel(
    'raw_data/NIHMS201195-supplement-st1.xlsx',
    sheet_name='Table_S1',
    skiprows=1,  # drop the first spreadsheet row; the next row supplies the column names
)

In [9]:
# Report the size of the raw table (rows x columns) as a quick sanity check on the load.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4563 x 19


In [10]:
# Preview the first rows of the raw table to check that the columns were parsed as expected.
original_data.head()

Unnamed: 0,ORF,Name,Localization,Description,caution_flanking_ORF_is_a_hit,caution_high_variation_in_multiple_measurements,caution_library_strain_suspicious,notes (concatenation from the notes of all the library wells containing this strain which were combined into this data),hit,number_of_measurements,average_log2_fluorescence,stdev_log2_fluorescence,p-val,average_log2_fluorescence.1,stdev_log2_fluorescence.1,p-val.1,average_log2_fluorescence.2,stdev_log2_fluorescence.2,p-val.2
0,YMR214W,SCJ1,ambiguous,One of several homologs of bacterial chaperone...,0,0,0,,1,6,4.225318,0.150919,1.723876e-41,0.025008,0.080239,0.5994497,4.238204,0.087689,2.744758e-40
1,YLR242C,ARV1,,Protein required for normal intracellular ster...,0,0,0,,1,4,4.99196,0.477557,2.886062e-35,0.905251,0.639754,1.377534e-10,4.042453,0.266554,6.187046e-26
2,YEL031W,SPF1,ER,"P-type ATPase, ion transporter of the ER membr...",0,0,0,,1,4,4.183607,0.177025,7.372473e-28,0.464356,0.09959,1.278965e-05,3.784655,0.119467,4.855611e-24
3,YJR117W,STE24,ER,Highly conserved zinc metalloprotease that fun...,1,0,0,,1,6,3.67634,0.067443,5.750513e-35,-0.030228,0.109646,0.4278406,3.729102,0.120586,1.3057189999999999e-34
4,YFL025C,BST1,ER,GPI inositol deacylase of the ER that negative...,0,0,1,"grows well on -met, no growth on -lys, no grow...",1,2,3.873617,0.00103,2.092727e-13,0.507844,0.034808,0.000562233,3.398601,0.031882,2.028541e-11


In [11]:
# Create a working 'orf' column holding the 'ORF' values as strings; the original 'ORF' column is left as loaded.
original_data['orf'] = original_data['ORF'].astype(str)

In [12]:
# Eliminate all white spaces & capitalize
# (clean_orf is a shared helper defined outside this notebook; the description above is the
# original author's note — confirm against the helper if exact behavior matters)
original_data['orf'] = clean_orf(original_data['orf'])

In [16]:
# Manual corrections for malformed identifiers in the source table:
# the letter 'O' typed in place of the digit '0', and a missing strand letter.
typo_fixes = {
    'YOLO62C': 'YOL062C',
    'YKLO72W': 'YKL072W',
    'YOLO57W': 'YOL057W',
    'YLR287-A': 'YLR287C-A',
}
# Values without an entry in typo_fixes pass through unchanged.
original_data['orf'] = original_data['orf'].apply(lambda orf: typo_fixes.get(orf, orf))

In [17]:
# Translate to ORFs
# (translate_sc is a shared helper defined outside this notebook; presumably it maps gene names
# to systematic ORF names — the check in the next cell catches anything that did not translate)
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [18]:
# Make sure everything translated ok
# t flags the rows whose 'orf' value passes looks_like_orf (helper defined outside this notebook);
# it is reused by the following cell to drop the remaining rows.
t = looks_like_orf(original_data['orf'])
# Show the rows that were NOT recognized so they can be reviewed before removal
# (in the saved output, the only such row is the wild-type control strain).
print(original_data.loc[~t,])

                                                ORF Name Localization  \
index_input                                                             
2477         control WT with KANr in the HIS3 locus  NaN          NaN   

            Description  caution_flanking_ORF_is_a_hit  \
index_input                                              
2477                NaN                              0   

             caution_high_variation_in_multiple_measurements  \
index_input                                                    
2477                                                       0   

             caution_library_strain_suspicious  \
index_input                                      
2477                                         1   

            notes (concatenation from the notes of all the library wells containing this strain which were combined into this data)  \
index_input                                                                                                                   

In [19]:
# Keep only the rows whose identifier was recognized as an ORF (mask t from the previous cell).
# .copy() makes the result an independent frame: a new column is assigned to it in the next step,
# and assigning into a boolean-filtered slice is the pattern that triggers pandas'
# SettingWithCopyWarning.
original_data = original_data.loc[t,:].copy()

In [21]:
# Use the third 'average_log2_fluorescence' column as the phenotype value: pandas appends
# '.1', '.2', ... to repeated header names, so the '.2' suffix marks the third set of measurement columns.
# NOTE(review): the spreadsheet row skipped at load time presumably labels the three column groups —
# confirm against the paper that the third group is the intended measurement.
original_data['data'] = original_data['average_log2_fluorescence.2'].astype(float)

In [22]:
# Index the table by the cleaned ORF identifier.
original_data = original_data.set_index('orf')

In [23]:
# Drop every annotation column; only the phenotype value is carried forward (as a one-column frame).
original_data = original_data.loc[:, ['data']].copy()

In [24]:
# Collapse ORFs that appear more than once into a single row by averaging their values.
original_data = original_data.groupby(level=0).mean()

In [25]:
# Dimensions after averaging: one row per unique ORF, a single 'data' column.
original_data.shape

(4527, 1)

# Prepare the final dataset

In [26]:
# Work on a copy so that original_data is not modified by the steps below.
data = original_data.copy()

In [27]:
# Dataset id(s) that the column(s) of `data` correspond to, in column order.
dataset_ids = [699]
# Restrict (and order) the dataset list to exactly these ids.
datasets = datasets.reindex(dataset_ids)

In [28]:
# Label each data column with a (dataset_id, 'value') pair, one per selected dataset.
value_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data.columns = value_columns

In [29]:
# Preview the table after relabeling the columns as (dataset_id, data_type).
data.head()

dataset_id,699
data_type,value
orf,Unnamed: 1_level_2
YAL002W,0.37851
YAL004W,-0.255485
YAL005C,-0.304026
YAL007C,0.391689
YAL008W,0.013989


## Subset to the genes currently in SGD

In [30]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs absent from the gene table come back as NaN.
gene_ids = genes['id'].reindex(index=data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 25


In [31]:
# Attach the gene ids and drop the ORFs that have none (i.e. are not in the gene table).
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the column assignment below does not
# write into a slice of the unfiltered frame (the SettingWithCopyWarning pattern).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs removed, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,699
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,0.37851
1863,YAL004W,-0.255485
4,YAL005C,-0.304026
5,YAL007C,0.391689
6,YAL008W,0.013989


# Normalize

In [32]:
# Normalize the raw values with the shared normalize_phenotypic_scores helper (defined outside this notebook).
# NOTE(review): has_tested=True presumably tells the helper that the rows present are the tested strains —
# confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [33]:
# Assign proper column names: each normalized column is labeled (dataset_id, 'valuez').
valuez_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = valuez_columns

In [34]:
# Blank out the normalized score wherever the raw value is missing.
# Boolean-frame assignment aligns the mask with the target on row AND column labels. The raw
# columns are labeled (dataset_id, 'value') while the normalized ones are (dataset_id, 'valuez'),
# so using data.isnull() directly matches no column and the assignment silently does nothing.
# Relabel the mask with the normalized column labels first (both sets of columns are in the
# same dataset order); rows are still matched by their (gene_id, orf) labels.
null_mask = data.isnull()
null_mask.columns = data_norm.columns
data_norm[null_mask] = np.nan

# Put raw values and normalized values side by side, matched on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,699,699
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0.37851,0.965771
1863,YAL004W,-0.255485,-0.61061
4,YAL005C,-0.304026,-0.731304
5,YAL007C,0.391689,0.998538
6,YAL008W,0.013989,0.059416


# Print out

In [35]:
# Write one tab-separated file per data type (raw 'value' and normalized 'valuez').
for data_type in ['value', 'valuez']:
    # Select the columns of this data type; the 'data_type' column level is dropped by xs.
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Label the columns with the dataset names instead of the numeric dataset ids.
    table.columns = datasets['name'].values
    # Keep only the ORF as the row label in the exported file.
    table = table.droplevel('gene_id', axis=0)
    output_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(output_path, sep='\t')

# Save to DB

In [36]:
# Import only the function that is used below, rather than every public name of the module,
# so that existing notebook variables cannot be silently overwritten by the import.
from IO.save_data_to_db3 import save_data_to_db

In [37]:
# Upload the combined value/valuez table to the database under this paper's PMID.
# Per the log output below, this deletes all datasets already stored for the PMID before
# inserting the new data, so re-running the cell replaces the earlier upload.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 19325107...
Inserting the new data...


100%|██████████| 1/1 [00:09<00:00,  9.24s/it]

Updating the data_modified_on field...



