In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source publication; used below to build the datasets-list filename and when saving to the DB.
paper_pmid = 15525520
# Short paper label; used below as the prefix of the exported .txt files.
paper_name = 'pan_boeke_2004' 

In [91]:
# Tab-separated, header-less list of (dataset_id, name) pairs for this paper.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [92]:
# Index the dataset list by its ID so it can be reordered by dataset_id later.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [97]:
# Read the supplementary table; the first spreadsheet row is skipped so the
# following row supplies the column headers.
original_data = pd.read_excel(
    'raw_data/TableS2.xlsx',
    sheet_name='Table 1',
    skiprows=1,
)

In [98]:
# Report the raw table size (rows x columns) before any filtering.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 455 x 10


In [99]:
# Preview the first rows. Note that row 0 holds the benomyl concentrations
# (1 to 40), not measurements for a gene.
original_data.head()

Unnamed: 0,Gene Name,Benomyl concentration (µg/ml),Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,1.0,5.0,10.0,15.0,20.0,25.0,30.0,35.0,40.0
1,CIN1,4.28,83.68,100.61,148.13,98.2,371.48,176.94,212.63,169.98
2,YML094C-A,2.66,18.12,47.09,30.68,26.18,77.7,35.66,28.9,30.0
3,PAC10,2.63,9.28,51.57,75.01,19.63,75.57,19.73,7.28,32.89
4,PFD1,2.59,14.63,60.19,71.96,33.03,101.15,110.11,57.04,30.98


In [100]:
# Work on a string copy of the gene names, leaving the 'Gene Name' column untouched.
original_data['gene'] = original_data['Gene Name'].astype(str)

In [101]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_genename is provided by yp_utils; the description above is
# taken from the original comment and is not verifiable from this notebook.
original_data['gene'] = clean_genename(original_data['gene'])

In [102]:
# Translate gene names to systematic ORF names.
# NOTE(review): translate_sc is provided by yp_utils; how it treats names it
# cannot translate is not visible here (the next cell checks the result).
original_data['orf'] = translate_sc(original_data['gene'], to='orf')

In [103]:
# Make sure everything translated ok
# `t` is used as a boolean row mask: rows where it is False are printed here
# for manual inspection, and the next cell keeps only the rows where it is True.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t,])

                                                     Gene Name  \
index_input                                                      
0                                                          NaN   
453          Notes:  Benomyl-sensitivity of each freshly ge...   
454          measured by microarray analysis of the represe...   

             Benomyl concentration (µg/ml)  Unnamed: 2  Unnamed: 3  \
index_input                                                          
0                                      1.0         5.0        10.0   
453                                    NaN         NaN         NaN   
454                                    NaN         NaN         NaN   

             Unnamed: 4  Unnamed: 5  Unnamed: 6  Unnamed: 7  Unnamed: 8  \
index_input                                                               
0                  15.0        20.0        25.0        30.0        35.0   
453                 NaN         NaN         NaN         NaN         NaN   
454               

In [104]:
# Keep only the rows whose 'orf' passed the check above (this drops the
# concentration row and the trailing "Notes" rows printed in the previous cell).
# .copy() detaches the result from the unfiltered frame so that the in-place
# set_index/drop calls that follow operate on an independent object rather
# than on a slice (avoids chained-assignment warnings).
original_data = original_data.loc[t,:].copy()

In [105]:
# Use the ORF as the row label from here on.
original_data = original_data.set_index('orf')

In [106]:
# The name columns are no longer needed once rows are labelled by ORF;
# only the measurement columns remain.
original_data = original_data.drop(columns=['Gene Name','gene'])

In [107]:
# Reversing the sign to indicate that lower numbers = slower growth
# Each row is converted with pd.to_numeric first; errors='coerce' turns any
# non-numeric entry into NaN instead of raising.
original_data = -original_data.apply(pd.to_numeric, axis=1, errors='coerce')

In [108]:
# Collapse rows that share the same ORF label into a single row holding the
# column-wise mean.
original_data = original_data.groupby(level=0).mean()

In [109]:
# Size after filtering and averaging duplicates: (unique ORFs, measurement columns).
original_data.shape

(447, 9)

# Load & process tested strains

In [110]:
# Microarray platform
# skiprows=273 skips the leading lines of the file so that the probe table's
# header row is read as column names (presumably a GEO metadata preamble — TODO confirm).
tested = pd.read_csv('raw_data/GPL1444.txt', sep='\t', skiprows=273)

In [111]:
# Preview the platform table: one row per probe, with ID, ORF and GENE columns.
tested.head()

Unnamed: 0,ID,ROW,COLUMN,TAGTYPE,PROBE,ORF,GENE,SEQUENCE,SGDID,SPOT_ID
0,1,213.0,104.0,Up,ArrA,YAL001C,TFC3,ACTATATGTGAAGGCATGGC,S000000001,
1,2,211.0,104.0,Up,ArrA,YAL002W,VPS8,ATACTGACAGCACGCATGGC,S000000002,
2,3,209.0,104.0,Up,ArrA,YAL003W,EFB1,GACATATCAGCATACATGGC,S000000003,
3,4,207.0,104.0,Up,ArrA,YAL004W,YAL004W,TATGGCACGGCAGACATTCC,S000002136,
4,5,205.0,104.0,Up,ArrA,YAL005C,SSA1,AGGCATACTACACAGATTCC,S000000004,


In [112]:
# Untreated control
# skiprows=52 skips the leading lines of the file so that the measurement
# table's header row is read as column names (presumably a GEO sample preamble — TODO confirm).
tested2 = pd.read_csv('raw_data/GSM30549.txt', sep='\t', skiprows=52)

In [113]:
# Preview the control sample: one row per probe (ID_REF) with intensities and a Flags column.
tested2.head()

Unnamed: 0,ID_REF,F635_Median,B635_Median,F532_Median,B532_Median,F635-B635_Median,F532-B532_Median,VALUE,Flags,PRE_VALUE
0,1,418.0,52.0,456.0,69.0,366.0,387.0,0.765535,-50.0,1.7
1,2,815.0,52.0,1518.0,72.0,763.0,1446.0,-0.074001,0.0,0.95
2,3,58.0,51.0,195.0,75.0,7.0,120.0,-3.184425,-50.0,0.11
3,4,8634.0,56.0,14724.0,74.0,8578.0,14650.0,0.070389,0.0,1.05
4,5,67.0,56.0,108.0,77.0,11.0,31.0,-0.643856,-50.0,0.64


In [114]:
# Attach the untreated-control Flags value to every platform probe
# (left join: platform 'ID' == sample 'ID_REF'). The following cell uses Flags
# to keep only the probes that passed the basic filter.
# NOTE(review): the original comment described the filter as "flag == 0",
# while the next cell actually keeps Flags > -50 — confirm these are equivalent.
tested = tested.merge(tested2[['ID_REF','Flags']], left_on='ID', right_on='ID_REF', how='left')

In [115]:
# Keep probes whose control Flags value is greater than -50. Probes with no
# Flags value (NaN after the left merge) are dropped too, because comparisons
# with NaN are False.
# .copy() makes the result an independent frame so that the new 'orf' column
# assigned in the following cells does not trigger SettingWithCopyWarning.
tested = tested.loc[tested['Flags'] > -50,:].copy()

In [116]:
# Work on a string copy of the platform's ORF column, leaving 'ORF' untouched.
tested['orf'] = tested['ORF'].astype(str)

In [117]:
# Normalize the ORF strings.
# NOTE(review): clean_orf is provided by yp_utils; its exact rules are not visible here.
tested['orf'] = clean_orf(tested['orf'])

In [118]:
# Map the cleaned names to systematic ORF names (translate_sc comes from yp_utils).
# The raw values are passed (.values), not the Series itself.
tested['orf'] = translate_sc(tested['orf'].values, to='orf')

In [119]:
# Make sure everything translated ok
# `t` is used as a boolean row mask: rows where it is False are printed here
# for inspection, and the next cell keeps only the rows where it is True.
t = looks_like_orf(tested['orf'])
print(tested.loc[~t,])

          ID    ROW  COLUMN TAGTYPE PROBE  ORF GENE              SEQUENCE  \
10498  10499  202.0   102.0      Up  Rpts  NaN  NaN  TTTGTCAGTCCGCGCCCTAA   
10499  10500  124.0    64.0      Up  Rpts  NaN  NaN  TTTGTCAGTCCGCGCCCTAA   
10500  10501  186.0    50.0      Up  Rpts  NaN  NaN  TTTGTCAGTCCGCGCCCTAA   
10501  10502  137.0    45.0      Up  Rpts  NaN  NaN  TTTGTCAGTCCGCGCCCTAA   
10502  10503  144.0    10.0      Up  Rpts  NaN  NaN  TTTGTCAGTCCGCGCCCTAA   
...      ...    ...     ...     ...   ...  ...  ...                   ...   
22570  22571   18.0     1.0       ?  Edge  NaN  NaN                   NaN   
22571  22572   14.0     1.0       ?  Edge  NaN  NaN                   NaN   
22572  22573   10.0     1.0       ?  Edge  NaN  NaN                   NaN   
22573  22574    6.0     1.0       ?  Edge  NaN  NaN                   NaN   
22574  22575    2.0     1.0       ?  Edge  NaN  NaN                   NaN   

            SGDID                              SPOT_ID ID_REF  Flags  orf  

In [120]:
# Keep only the probes whose 'orf' passed the check above.
tested = tested.loc[t]

In [121]:
# Distinct ORFs among the retained probes (an ORF can appear on several probes).
tested_orfs = tested['orf'].unique()

In [122]:
# Excluded strains
# The directory name is spelled 'raw_data' to match every other read in this
# notebook; the previous 'raw_Data' spelling only resolves on case-insensitive
# filesystems.
# skiprows=1 skips the first spreadsheet row so the following row supplies the headers.
tested3 = pd.read_excel('raw_data/TableS3.xlsx', sheet_name='Table 1', skiprows=1)
tested3.head()

Unnamed: 0,ORF name,plate,His+a,Mates as\nMATab,Papillates as\nMATab,Mates as\nMATab.1,Papillates as\nMATab.1
0,YAL051W,201.0,1.0,0.0,0.0,0.0,0.0
1,YAL035W,201.0,1.0,0.0,0.0,0.0,0.0
2,YAL017W,201.0,1.0,0.0,0.0,0.0,0.0
3,YAL005C,201.0,0.0,1.0,0.0,0.0,0.0
4,YAR002W,201.0,0.0,0.0,0.0,1.0,0.0


In [123]:
# Same ORF preparation as for the platform table: string copy of 'ORF name',
# then clean_orf and translate_sc (both from yp_utils).
tested3['orf'] = tested3['ORF name'].astype(str)
tested3['orf'] = clean_orf(tested3['orf'])
tested3['orf'] = translate_sc(tested3['orf'].values, to='orf')

In [124]:
# Make sure everything translated ok
# `t` is used as a boolean row mask: rows where it is False are printed here
# for inspection, and the next cell keeps only the rows where it is True.
t = looks_like_orf(tested3['orf'])
print(tested3.loc[~t,])

                                              ORF name  plate  His+a  \
100  Notes:  “1” means “yes”, whereas “0” means “no...    NaN    NaN   
101  His+ was defined by the growth of the strain i...    NaN    NaN   

     Mates as\nMATab  Papillates as\nMATab  Mates as\nMATab.1  \
100              NaN                   NaN                NaN   
101              NaN                   NaN                NaN   

     Papillates as\nMATab.1                                                orf  
100                     NaN  NOTES1MEANSYESWHEREAS0MEANSNOASRESULTSOFSTRAIN...  
101                     NaN  HISWASDEFINEDBYTHEGROWTHOFTHESTRAININSYNTHETIC...  


In [125]:
# Keep only the rows whose 'orf' passed the check above (drops the "Notes" rows).
tested3 = tested3.loc[t]

In [126]:
# ORFs listed in the excluded-strains table, as a plain array.
excluded = tested3['orf'].values

In [127]:
# Remove the excluded strains from the tested list, preserving the original order.
# Membership is checked against a set instead of scanning the `excluded`
# array once per ORF; `excluded` itself is left unchanged.
excluded_set = set(excluded)
tested_orfs = [orf for orf in tested_orfs if orf not in excluded_set]

In [128]:
# Number of entries in the excluded-strains list.
len(excluded)

100

In [129]:
# ORFs that have data in the paper's table but are absent from the tested
# list built above, in the order they appear in original_data.
# Membership is checked against a set instead of scanning the tested_orfs
# list once per ORF.
tested_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_set]
missing

['YBR004C',
 'YBR181C',
 'YCR071C',
 'YDL202W',
 'YDR007W',
 'YDR195W',
 'YGR076C',
 'YHL014C',
 'YHR011W',
 'YJL005W',
 'YKL139W',
 'YLR226W',
 'YNL133C',
 'YOR196C']

In [130]:
# Every ORF with reported data is treated as tested: append the ones that
# were absent from the tested list.
tested_orfs = tested_orfs + missing

In [131]:
# Expand the data to one row per tested ORF. ORFs without a row in the
# paper's table are added with 0 in every column (fill_value=0).
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [132]:
# Work on a copy so original_data keeps its current column labels.
data = original_data.copy()

In [133]:
# Dataset IDs 5237..5245 (9 values), assigned to the data columns positionally
# in the next cell; the datasets table is reordered to match.
# NOTE(review): assumes the ID order matches the column order of the source table — confirm.
dataset_ids = np.arange(5237, 5246)
datasets = datasets.reindex(index=dataset_ids)

In [135]:
# Label each data column with a (dataset_id, 'value') pair, in dataset order.
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * n_datasets],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [136]:
# Preview the relabelled table (columns are now dataset_id / data_type pairs).
data.head()

dataset_id,5237,5238,5239,5240,5241,5242,5243,5244,5245
data_type,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
YAL002W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YAL004W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YAL007C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YAL008W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YAL009W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Subset to the genes currently in SGD

In [137]:
# Load the gene table and key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the gene table get NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 21


In [138]:
# Attach the gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the column assignment on
# the next line does not write to a slice (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Rows are keyed by (gene_id, orf) from here on.
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,5237,5238,5239,5240,5241,5242,5243,5244,5245
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2,YAL002W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1863,YAL004W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,YAL007C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,YAL008W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,YAL009W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Normalize

In [139]:
# Compute normalized scores from the raw values.
# NOTE(review): normalize_phenotypic_scores is provided by yp_utils; the meaning
# of has_tested=True is not visible here (presumably that the table already
# contains the full set of tested strains — confirm).
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [140]:
# Label each normalized column with a (dataset_id, 'valuez') pair, in dataset order.
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * n_datasets],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [141]:
# Propagate missing raw values to the normalized table.
# data and data_norm carry different column labels ('value' vs 'valuez'), so a
# DataFrame mask would be aligned by label and match nothing. Using the
# underlying boolean array applies the mask positionally instead.
# Assumes data_norm has the same shape and row/column order as data
# (pandas raises ValueError if the shapes differ).
data_norm[data.isnull().values] = np.nan
# Side-by-side table: raw 'value' columns followed by normalized 'valuez' columns.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,5237,5238,5239,5240,5241,5242,5243,5244,5245,5237,5238,5239,5240,5241,5242,5243,5244,5245
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
2,YAL002W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1863,YAL004W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,YAL007C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,YAL008W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,YAL009W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Print out

In [142]:
# Write one tab-separated file per data type, with dataset names as column
# headers and ORFs as row labels (the gene_id index level is dropped).
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [143]:
from IO.save_data_to_db3 import *

In [144]:
# Write the combined table to the database for this paper.
# Per the run log below, this deletes all existing datasets for the PMID
# before inserting the new data, so re-running replaces earlier uploads.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/9 [00:00<?, ?it/s]

Deleting all datasets for PMID 15525520...
Inserting the new data...


100%|██████████| 9/9 [00:57<00:00,  6.41s/it]

Updating the data_modified_on field...



