In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used to build the datasets-list path and passed to save_data_to_db.
paper_pmid = 18245339
# Short label for the paper; used as the filename prefix of the exported value/valuez tables.
paper_name = 'abe_minegishi_2008' 

In [3]:
# Read the tab-separated, header-less list of datasets (id + name) for this paper.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by dataset_id so that rows can later be selected by ID.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [134]:
# Load the raw table from the spreadsheet, skipping the first 5 rows of the sheet.
original_data = pd.read_excel(
    'raw_data/Table1_Abe_Genetics.xlsx',
    sheet_name='Table 1 (2)',
    skiprows=5,
)

In [135]:
# Report the size of the raw table as rows x columns.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 95 x 35


In [136]:
# Preview the first rows of the raw table to check how the sheet was parsed.
original_data.head()

Unnamed: 0,Amino acid biosynthesis,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34
0,,TRP1,YDR007W,,4.687151,±,0.560721,,0.253382,±,...,11.919475,±,2.550579,,●,22.582288,±,1.45379,,Phosphoribosylanthranilate isomerase
1,,TRP4,YDR354W,,4.913743,±,0.198626,,0.262297,±,...,11.938503,±,2.967158,,●,20.879572,±,7.523058,,Anthranilate phosphoribosyl transferase
2,,THR4,YCR053W,,4.227805,±,0.644339,,0.235872,±,...,12.624784,±,2.836274,,●,18.598417,±,7.17275,,Threonine synthase
3,,ARO2,YGL148W,,4.974547,±,0.139738,,0.291314,±,...,13.047383,±,1.039195,,●,25.384984,±,6.011236,,Bifunctional chorismate synthase and flavin re...
4,,ARO1,YDR127W,,4.702884,±,0.189422,,0.296969,±,...,14.082465,±,2.021527,,○,34.487548,±,3.807489,,"Arom protein, catalyzes steps 2 through 6 in t..."


In [137]:
# The systematic names sit in the spreadsheet column parsed as 'Unnamed: 2'; keep them as strings.
orf_column = original_data['Unnamed: 2']
original_data['orf'] = orf_column.astype(str)

In [138]:
# Normalize the ORF strings: remove all white space and capitalize
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [139]:
# Translate the identifiers to ORF names
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [140]:
# Flag the rows whose 'orf' looks like a valid ORF name and show the ones that do not
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t, :])

                      Amino acid biosynthesis  \
index_input                                     
10                             Microautophagy   
15                     Mitochondrial function   
25           Actin organization/bud formation   
32                       Membrane trafficking   
44              Inositol phosphate metabolism   
49              Transcriptio/mRNA degradation   
64                                   Ribosome   
68                      Chromatin maintenance   
74                            Stress response   
77                              Unknown genes   
90                                        NaN   
91                                        NaN   
92                                        NaN   
93                                        NaN   
94                                        NaN   

                                                    Unnamed: 1 Unnamed: 2  \
index_input                                                                 
10          

In [141]:
# Keep only the rows whose 'orf' passed the looks_like_orf check (mask `t`).
original_data = original_data.loc[t]

In [142]:
# Use the ORF name as the row index.
original_data = original_data.set_index('orf')

In [143]:
# Data originally reported as percent relative to WT (100%).
# Convert to a fraction and subtract 1, so that a WT-like strain scores 0 and
# lower percentages correspond to the most negative values.
# Only the columns at positions 25 and 30 are kept.
# Coerce to numeric BEFORE the arithmetic: the sheet mixes numbers with text
# (e.g. the "±" and "●" cells), and dividing a column that holds a stray string
# would raise a TypeError instead of yielding NaN.
original_data = original_data.iloc[:, [25, 30]].apply(pd.to_numeric, errors='coerce')
original_data = original_data / 100 - 1

In [144]:
# Force every entry to a number, row by row; anything unparseable becomes NaN.
original_data = original_data.apply(
    lambda row: pd.to_numeric(row, errors='coerce'),
    axis=1,
)

In [145]:
# Collapse ORFs that appear more than once by averaging their values.
original_data = original_data.groupby(level=0).mean()

In [146]:
# Show (rows, columns) of the processed table.
original_data.shape

(80, 2)

# Load & process tested strains

In [147]:
# Load the list of tested strains, skipping the first row of the sheet.
tested = pd.read_excel(
    'raw_data/mat_alpha_041902.xlsx',
    sheet_name='mat_alpha_041902.txt',
    skiprows=1,
)

In [148]:
# Preview the first rows of the tested-strain table.
tested.head()

Unnamed: 0.1,Unnamed: 0,record no.,ORF name,strain,batch,re-array,re-array.1,re-array.2,RG notes,QC Notes
0,,,,,,,,,,
1,1.0,10338.0,YAL068C,BY4739,01_1,101.0,A,2.0,,
2,2.0,10339.0,YAL067C,BY4739,01_1,101.0,A,3.0,,
3,3.0,10340.0,YAL066W,BY4739,01_1,101.0,A,4.0,,
4,4.0,10341.0,YAL065C,BY4739,01_1,101.0,A,5.0,,


In [149]:
# Take the ORF names from the 'ORF name' column, as strings.
orf_name_column = tested['ORF name']
tested['orf'] = orf_name_column.astype(str)

In [150]:
# Apply the same ORF clean-up helper that was used on the data table.
cleaned_tested_orfs = clean_orf(tested['orf'])
tested['orf'] = cleaned_tested_orfs

In [151]:
# Translate the identifiers to ORF names.
translated_tested_orfs = translate_sc(tested['orf'], to='orf')
tested['orf'] = translated_tested_orfs

In [152]:
# Flag the rows whose 'orf' looks like a valid ORF name and show the ones that do not
t = looks_like_orf(tested['orf'])
print(tested.loc[~t, :])

             Unnamed: 0  record no. ORF name strain batch  re-array  \
index_input                                                           
0                   NaN         NaN      NaN    NaN   NaN       NaN   
4668             4668.0         NaN      NaN    NaN   NaN       NaN   
4829             4829.0         NaN      NaN    NaN   NaN       NaN   

            re-array.1 re-array.2 RG notes QC Notes  orf  
index_input                                               
0                  NaN        NaN      NaN      NaN  NAN  
4668               NaN    end 150      NaN      NaN  NAN  
4829               NaN    end 171      NaN      NaN  NAN  


In [153]:
# Keep only the tested rows whose 'orf' passed the looks_like_orf check (mask `t`).
tested = tested.loc[t]

In [154]:
# Distinct ORFs among the tested strains, in order of first appearance.
tested_orf_series = tested['orf']
tested_orfs = tested_orf_series.unique()

In [155]:
# ORFs that have data but are absent from the tested-strain list
data_orfs = original_data.index
missing = data_orfs[~data_orfs.isin(tested_orfs)].tolist()
missing

[]

In [156]:
# Expand to the full tested set. Tested ORFs with no row in the data get 0,
# which after the "fraction - 1" shift means no difference from WT.
original_data = original_data.reindex(tested_orfs, fill_value=0)

# Prepare the final dataset

In [157]:
# Work on an independent copy so original_data stays untouched from here on.
data = original_data.copy(deep=True)

In [158]:
# Dataset IDs for the data columns, listed in column order.
dataset_ids = [537, 538]
# Restrict (and order) the dataset list to exactly these IDs.
datasets = datasets.reindex(dataset_ids)

In [159]:
# Label each data column with the pair (dataset_id, 'value').
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id','data_type'])

In [160]:
# Preview the table with its (dataset_id, data_type) column labels.
data.head()

dataset_id,537,538
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL068C,0.0,0.0
YAL067C,0.0,0.0
YAL066W,0.0,0.0
YAL065C,0.0,0.0
YAL062W,0.0,0.0


## Subset to the genes currently in SGD

In [161]:
# Load the gene table and re-key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs not in the gene table get NaN.
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 26


In [162]:
# Attach the gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
has_gene_id = data['gene_id'].notnull()
data = data.loc[has_gene_id]
# The ids became floats because of the NaNs; convert them back to integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index()
data = data.set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,537,538
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1869,YAL068C,0.0,0.0
61,YAL067C,0.0,0.0
60,YAL066W,0.0,0.0
1727,YAL065C,0.0,0.0
57,YAL062W,0.0,0.0


# Normalize

In [163]:
# Compute the normalized counterpart of `data`.
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook; has_tested=True
# presumably signals that `data` covers the full set of tested strains — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [164]:
# Assign proper column names: each normalized column becomes (dataset_id, 'valuez')
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id','data_type'])

In [165]:
# Wherever the raw value is missing, blank out the normalized value as well.
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'), and a
# boolean-DataFrame mask is aligned on labels before it is applied. Masking with
# `data.isnull()` directly therefore matches no column and leaves data_norm untouched.
# Relabel the mask with data_norm's columns so that it actually applies.
# Assumes data_norm keeps the column order of `data`, as the positional relabeling
# of data_norm's columns already does.
value_is_missing = data.isnull()
value_is_missing.columns = data_norm.columns
data_norm[value_is_missing] = np.nan

# Put raw and normalized values side by side, matched on the (gene_id, orf) index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,537,538,537,538
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1869,YAL068C,0.0,0.0,0.009038,0.0
61,YAL067C,0.0,0.0,0.009038,0.0
60,YAL066W,0.0,0.0,0.009038,0.0
1727,YAL065C,0.0,0.0,0.009038,0.0
57,YAL062W,0.0,0.0,0.009038,0.0


# Print out

In [166]:
# Write one tab-separated file per data type, with dataset names as column headers
# and rows keyed by ORF only.
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [167]:
from IO.save_data_to_db3 import *

In [168]:
# Store the combined value/valuez table in the database under this paper's PMID.
# Per the logged output, the existing datasets for the PMID are deleted before the new data are inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 18245339...
Inserting the new data...


100%|██████████| 2/2 [00:17<00:00,  8.86s/it]

Updating the data_modified_on field...



