In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to build the datasets-list path and passed to save_data_to_db
paper_pmid = 33958448
# Short paper label; used as the prefix of the exported .txt files
paper_name = 'costanzo_andrews_2021' 

In [3]:
# Dataset ids and names for this paper, read from a headerless tab-delimited list
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by dataset_id so it can be reindexed by id later
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Read the differential mutant-fitness sheet from the paper's Data File 1
original_data = pd.read_excel(
    'raw_data/Costanzo_et_al_Data_File_1_Conditions_Strains_Fitness.xlsx',
    sheet_name='Diff. Mutant fitness_Conditions',
)

In [6]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4429 x 19


In [7]:
# Preview the raw table: five annotation columns followed by one score column per condition
original_data.head()

Unnamed: 0,Systematic Name,Gene Name,Allele (Essential genes only),Strain ID,on Diagnostic Array,Actinomycin D,Benomyl,Boretzeomib,Caspofungin,Concanmycin A,Cycloheximide,Fluconazole,Galactose,Geldanamycin,MMS,Monensin,Rapamycin,Sorbitol,Tunicamycin
0,YAL001C,TFC3,tfc3-g349e,tsa508,,0.005,0.138,-0.022,-0.018,0.028,-0.008,0.051,0.04,-0.025,0.029,-0.037,-0.002,-0.037,-0.024
1,YAL002W,VPS8,,dma23,,-0.489,-0.198,-0.115,-0.036,-0.132,-0.045,-0.027,-0.094,-0.067,-0.052,-0.033,-0.2965,-0.1115,-0.085
2,YAL004W,,,dma22,,-0.0245,-0.0525,-0.0145,-0.0045,-0.0535,-0.0275,-0.0145,-0.0225,0.0195,0.0025,-0.0085,0.173,0.087,-0.0005
3,YAL005C,SSA1,,dma21,,-0.031,0.01,-0.015,0.026,-0.046,-0.002,0.075,-0.025,-0.004,-0.013,-0.023,0.2015,0.1265,0.043
4,YAL007C,ERP2,,dma20,,-0.0315,0.0295,0.0215,-0.0505,0.0585,0.0205,0.0215,0.0145,0.0335,-0.0435,-0.0085,0.0835,-0.0265,-0.0105


In [8]:
# Keep only the strains whose Strain ID starts with 'dma'
# (this notebook treats those as the non-essential gene strains)
is_dma_strain = original_data['Strain ID'].str.startswith('dma')
original_data = original_data.loc[is_dma_strain, :].copy()

In [9]:
# Start the 'orf' column from the systematic name, forced to string
original_data = original_data.assign(orf=original_data['Systematic Name'].astype(str))

In [10]:
# Normalize the identifiers with clean_orf (helper presumably defined in yp_utils.py, loaded via %run).
# Per the original note, it eliminates all white spaces and capitalizes; the helper body is not visible here.
original_data['orf'] = clean_orf(original_data['orf'])

In [11]:
# Translate the identifiers to ORFs with translate_sc(..., to='orf')
# (helper presumably defined in yp_utils.py; its mapping rules are not visible here)
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [12]:
# Sanity check: flag the entries that look like ORFs after translation and show the rest.
# `t` is reused by the next cell to drop the rows printed here.
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t, :]
print(not_orf_rows)

            Systematic Name Gene Name Allele (Essential genes only) Strain ID  \
index_input                                                                     
3847                YOL153C       NaN                           NaN   dma5334   
3882                YOR031W      CRS5                           NaN   dma4461   

            on Diagnostic Array  Actinomycin D  Benomyl  Boretzeomib  \
index_input                                                            
3847                        NaN        -0.0180  -0.0200       0.0070   
3882                        NaN         0.0035   0.0445      -0.0165   

             Caspofungin   Concanmycin A  Cycloheximide  Fluconazole  \
index_input                                                            
3847               0.0350        -0.0120        -0.0340      -0.0130   
3882               0.0225        -0.0095         0.0215       0.0235   

             Galactose  Geldanamycin    MMS   Monensin  Rapamycin  Sorbitol  \
index_input       

In [13]:
# Keep only the rows flagged as valid ORFs by the check above
original_data = original_data.loc[t]

In [14]:
# Use the ORF as the row index from here on
original_data = original_data.set_index('orf')

In [15]:
# The first five columns are strain annotations (Systematic Name, Gene Name, Allele,
# Strain ID, on Diagnostic Array); every column after them holds a condition's scores.
data_cols = original_data.columns.values[5:]
original_data = original_data.loc[:, data_cols].copy()

In [16]:
# Collapse rows that share the same ORF into their per-condition mean
original_data = original_data.groupby(level=0).mean()

In [17]:
# Size after filtering and averaging: (unique ORFs, condition columns)
original_data.shape

(3627, 14)

In [18]:
# List the condition labels in column order. In the recorded output, some labels
# carry a trailing space ('Caspofungin ', 'MMS '), exactly as read from the source file.
data_cols

array(['Actinomycin D', 'Benomyl', 'Boretzeomib', 'Caspofungin ',
       'Concanmycin A', 'Cycloheximide', 'Fluconazole', 'Galactose',
       'Geldanamycin', 'MMS ', 'Monensin', 'Rapamycin', 'Sorbitol',
       'Tunicamycin'], dtype=object)

# Prepare the final dataset

In [19]:
# Work on a copy so that relabelling the columns below leaves original_data untouched
data = original_data.copy()

In [20]:
# Consecutive dataset ids 21889..21902 (np.arange excludes the stop value): 14 ids in total.
# They are assigned to the condition columns by position below.
# NOTE(review): confirm that this id order matches the order of data_cols.
dataset_ids = np.arange(21889,21903)

In [21]:
# Restrict the dataset list to these ids, in dataset_ids order.
# reindex does not fail on unknown ids: an id absent from the list file yields a NaN row.
datasets = datasets.reindex(index=dataset_ids)

In [22]:
# Relabel the columns, by position, as (dataset_id, 'value') pairs
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [23]:
# Preview the relabelled table: rows are ORFs, columns are (dataset_id, data_type)
data.head()

dataset_id,21889,21890,21891,21892,21893,21894,21895,21896,21897,21898,21899,21900,21901,21902
data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
YAL002W,-0.489,-0.198,-0.115,-0.036,-0.132,-0.045,-0.027,-0.094,-0.067,-0.052,-0.033,-0.2965,-0.1115,-0.085
YAL004W,-0.0245,-0.0525,-0.0145,-0.0045,-0.0535,-0.0275,-0.0145,-0.0225,0.0195,0.0025,-0.0085,0.173,0.087,-0.0005
YAL005C,-0.031,0.01,-0.015,0.026,-0.046,-0.002,0.075,-0.025,-0.004,-0.013,-0.023,0.2015,0.1265,0.043
YAL007C,-0.0315,0.0295,0.0215,-0.0505,0.0585,0.0205,0.0215,0.0145,0.0335,-0.0435,-0.0085,0.0835,-0.0265,-0.0105
YAL008W,0.0195,-0.0135,-0.0275,0.0315,0.0315,0.0335,-0.0515,0.0255,0.0555,0.0005,0.0225,0.041,-0.028,-0.0225


## Subset to the genes currently in SGD

In [24]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs without a match come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 13


In [25]:
# Attach the gene ids looked up above and drop the ORFs that had no match
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent of the original, so the assignment
# on the next line does not write into a slice (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaN rows gone, the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,21889,21890,21891,21892,21893,21894,21895,21896,21897,21898,21899,21900,21901,21902
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
2,YAL002W,-0.489,-0.198,-0.115,-0.036,-0.132,-0.045,-0.027,-0.094,-0.067,-0.052,-0.033,-0.2965,-0.1115,-0.085
1863,YAL004W,-0.0245,-0.0525,-0.0145,-0.0045,-0.0535,-0.0275,-0.0145,-0.0225,0.0195,0.0025,-0.0085,0.173,0.087,-0.0005
4,YAL005C,-0.031,0.01,-0.015,0.026,-0.046,-0.002,0.075,-0.025,-0.004,-0.013,-0.023,0.2015,0.1265,0.043
5,YAL007C,-0.0315,0.0295,0.0215,-0.0505,0.0585,0.0205,0.0215,0.0145,0.0335,-0.0435,-0.0085,0.0835,-0.0265,-0.0105
6,YAL008W,0.0195,-0.0135,-0.0275,0.0315,0.0315,0.0335,-0.0515,0.0255,0.0555,0.0005,0.0225,0.041,-0.028,-0.0225


# Normalize

In [26]:
# Normalize the scores with normalize_phenotypic_scores (helper presumably defined in yp_utils.py).
# NOTE(review): the effect of has_tested=True is defined by that helper and is not visible here.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [27]:
# Relabel the normalized columns, by position, as (dataset_id, 'valuez') pairs
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [28]:
# Propagate missing raw values to the normalized scores.
# A DataFrame mask is aligned to its target by label. `data` columns are labelled
# (dataset_id, 'value') while `data_norm` columns are (dataset_id, 'valuez'), so using
# data.isnull() directly matches no column and silently masks nothing.
# Relabel the mask columns (by position, the same way both frames were labelled)
# so that it lines up with data_norm; rows are still aligned by index.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan
# Put raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,21889,21890,21891,21892,21893,21894,21895,21896,21897,21898,...,21893,21894,21895,21896,21897,21898,21899,21900,21901,21902
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,-0.489,-0.198,-0.115,-0.036,-0.132,-0.045,-0.027,-0.094,-0.067,-0.052,...,-1.480624,-1.062316,-0.423861,-1.100675,-1.253285,-0.899562,-0.877244,-3.534203,-1.905278,-1.342088
1863,YAL004W,-0.0245,-0.0525,-0.0145,-0.0045,-0.0535,-0.0275,-0.0145,-0.0225,0.0195,0.0025,...,-0.379781,-0.600236,-0.160874,-0.20437,0.432885,0.263113,-0.307792,1.795568,1.361305,0.045876
4,YAL005C,-0.031,0.01,-0.015,0.026,-0.046,-0.002,0.075,-0.025,-0.004,-0.013,...,-0.274605,0.073079,1.722116,-0.23571,-0.025207,-0.067556,-0.644815,2.1191,2.01133,0.760389
5,YAL007C,-0.0315,0.0295,0.0215,-0.0505,0.0585,0.0205,0.0215,0.0145,0.0335,-0.0435,...,1.190849,0.667181,0.59653,0.259452,0.705791,-0.718227,-0.307792,0.779562,-0.506489,-0.118381
6,YAL008W,0.0195,-0.0135,-0.0275,0.0315,0.0315,0.0335,-0.0515,0.0255,0.0555,0.0005,...,0.812215,1.01044,-0.939317,0.397345,1.134643,0.220446,0.412738,0.297102,-0.531174,-0.315488


# Print out

In [29]:
# Write one tab-delimited file per data type, with dataset names as column
# headers and rows keyed by ORF only
for data_type in ['value','valuez']:
    out_path = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [30]:
from IO.save_data_to_db3 import *

In [31]:
# Store the combined value/valuez table in the database under this paper's PMID.
# Per the log recorded below, the helper first deletes all existing datasets for the PMID,
# then inserts the new data and updates the data_modified_on field, so re-running replaces earlier uploads.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/14 [00:00<?, ?it/s]

Deleting all datasets for PMID 33958448...
Inserting the new data...


100%|██████████| 14/14 [01:10<00:00,  5.04s/it]

Updating the data_modified_on field...



