In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [5]:
# Identifiers of the publication processed by this notebook:
# - paper_pmid is used to build the datasets-list file name and is passed to save_data_to_db;
# - paper_name is the prefix of the exported text files.
paper_pmid = 32670247
paper_name = 'johnston_strobel_2020' 

In [6]:
# Read the tab-separated, header-less list of datasets (id + name) for this paper.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [7]:
# Index the datasets table by dataset id so it can be reindexed by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Load the published hit table from sheet 'Sheet1' of the Excel file.
# The sheet mixes gene rows with section-title and blank rows; those are filtered out
# further down by the ORF check.
original_data = pd.read_excel('raw_data/Table_S1.xlsx', sheet_name='Sheet1')

In [9]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 375 x 7


In [10]:
# Preview the first five rows of the raw table.
original_data.head(n=5)

Unnamed: 0,Name,ORF name,FCCP,DNP,H2SO4,HCl,NaF
0,,,,,,,
1,1. Mitochondria,,,,,,
2,AEP1,YMR064W,,x,,,
3,AFG3,YER017C,X,X,X,X,x
4,ATP7,YKL016C,X,,x,,


In [11]:
# Work on a string copy of the ORF column; empty cells (NaN) become the string 'nan',
# which is what lets the ORF check below flag them.
original_data['orf'] = original_data['ORF name'].astype(str)

In [12]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is loaded from yp_utils.py and is not visible here; the
# description above is taken from the existing comment - confirm against the helper.
original_data['orf'] = clean_orf(original_data['orf'])

In [13]:
# Translate to ORFs
# NOTE(review): translate_sc is loaded from yp_utils.py; presumably it maps gene names to
# systematic ORF names and leaves unrecognized values unchanged - verify against the helper.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [14]:
# Make sure everything translated ok: show the rows whose 'orf' value does not look like
# an ORF (only section titles and blank rows are expected here).
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t]
print(not_orf_rows)

                                            Name ORF name FCCP  DNP H2SO4  \
index_input                                                                 
0                                            NaN      NaN  NaN  NaN   NaN   
1                                1. Mitochondria      NaN  NaN  NaN   NaN   
91                                           NaN      NaN  NaN  NaN   NaN   
92                 2. Vesicle-Mediated Transport      NaN  NaN  NaN   NaN   
125                                          NaN      NaN  NaN  NaN   NaN   
126             3. Metabolism/Nutrient Transport      NaN  NaN  NaN   NaN   
149                                          NaN      NaN  NaN  NaN   NaN   
150                           4. Ion homeostasis      NaN  NaN  NaN   NaN   
171                                          NaN      NaN  NaN  NaN   NaN   
172                                   5. Vacuole      NaN  NaN  NaN   NaN   
193                                          NaN      NaN  NaN  NaN   NaN   

In [15]:
# Keep only the rows that passed the ORF check.
original_data = original_data.loc[t]

In [16]:
# Use the ORF as the row label from here on.
original_data = original_data.set_index('orf')

In [17]:
# Columns of the hit table that carry the per-treatment marks ('X', 'x' or empty).
data_cols = ['FCCP','DNP','H2SO4','HCl','NaF']

In [18]:
# Drop the descriptive columns; keep an independent copy of the treatment columns only.
original_data = original_data.loc[:, data_cols].copy()

In [22]:
# Convert the published marks into numeric scores: 'X' -> -2, 'x' -> -1, empty cell -> 0.
data_dict = {'X': -2, 'x': -1, np.nan: 0}

# Empty cells are handled with fillna() instead of a dictionary lookup: a dict finds a NaN
# key only by object identity (nan != nan), so data_dict[x] raises KeyError for any NaN
# that is not the np.nan singleton itself (e.g. values coming from a float64 column).
mark_scores = {mark: score for mark, score in data_dict.items() if isinstance(mark, str)}
empty_score = data_dict[np.nan]

for c in data_cols:
    marks = original_data[c]
    # Keep failing loudly on marks that have no score, as the plain dict lookup did.
    unexpected = set(marks.dropna().unique()) - set(mark_scores)
    if unexpected:
        raise KeyError('Unexpected value(s) in column %s: %s' % (c, sorted(map(str, unexpected))))
    original_data[c] = marks.map(mark_scores).fillna(empty_score).astype(int)

In [23]:
# Preview the numeric scores.
original_data.head(n=5)

Unnamed: 0_level_0,FCCP,DNP,H2SO4,HCl,NaF
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
YMR064W,0,-1,0,0,0
YER017C,-2,-2,-2,-2,-1
YKL016C,-2,0,-1,0,0
YDR377W,-1,0,0,0,0
YDR350C,-2,0,0,0,0


In [24]:
# Collapse ORFs that appear more than once into a single row by averaging their scores.
original_data = original_data.groupby(level=0).mean()

In [25]:
# Number of unique ORFs x number of treatments after collapsing duplicates.
original_data.shape

(340, 5)

# Load & process tested strains

In [26]:
# Load the list of strains that were tested from sheet 'DATA' of the Excel file.
tested = pd.read_excel('raw_data/Matav50.xlsx', sheet_name='DATA')

In [27]:
# Preview the first five rows of the tested-strain list.
tested.head(n=5)

Unnamed: 0,record no.,ORF name,Strain,Batch,Plate,Row,Col,Comment
0,338.0,YAL068C,BY4741,01_1,1.0,A,2.0,
1,339.0,YAL067C,BY4741,01_1,1.0,A,3.0,
2,340.0,YAL066W,BY4741,01_1,1.0,A,4.0,
3,341.0,YAL065C,BY4741,01_1,1.0,A,5.0,
4,345.0,YAL062W,BY4741,01_1,1.0,A,6.0,


In [33]:
# Work on a string copy of the ORF column; empty cells (NaN) become the string 'nan'
# and are flagged by the ORF check below.
tested['orf'] = tested['ORF name'].astype(str)

In [34]:
# Normalize the ORF strings with the shared helper.
# NOTE(review): clean_orf is loaded from yp_utils.py; presumably it strips white space and
# upper-cases the values - confirm against the helper.
tested['orf'] = clean_orf(tested['orf'])

In [35]:
# Correct a malformed ORF name in the source file (missing strand letter).
is_malformed = tested['orf'] == 'YLR287-A'
tested.loc[is_malformed, 'orf'] = 'YLR287C-A'

In [36]:
# Translate to ORFs.
# NOTE(review): translate_sc is loaded from yp_utils.py; presumably it maps gene names to
# systematic ORF names - verify against the helper.
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [37]:
# Make sure everything translated ok: show the rows whose 'orf' value does not look like
# an ORF.
t = looks_like_orf(tested['orf'])
not_orf_rows = tested.loc[~t]
print(not_orf_rows)

             record no. ORF name Strain Batch  Plate  Row  Col Comment  orf
index_input                                                                
4705                NaN      NaN    NaN   NaN    NaN  NaN  NaN     NaN  NAN


In [38]:
# Keep only the rows that passed the ORF check.
tested = tested.loc[t]

In [39]:
# Distinct ORFs among the tested strains; this becomes the row index of the final data.
tested_orfs = tested['orf'].unique()

In [40]:
# ORFs reported as hits that are absent from the list of tested strains.
# An empty list is expected; any ORF listed here would be dropped by the reindex below.
# Membership is checked against a set so each lookup is O(1) instead of a scan of the
# whole tested_orfs array for every hit.
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_orf_set]
missing

[]

In [41]:
# Expand the hit table to all tested ORFs: tested strains that are not listed as hits
# get a score of 0, and rows whose ORF is not in tested_orfs are dropped.
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

In [43]:
# Preview the table after expanding it to all tested ORFs.
original_data.head(n=5)

Unnamed: 0_level_0,FCCP,DNP,H2SO4,HCl,NaF
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
YAL068C,0,0,0,0,0
YAL067C,0,0,0,0,0
YAL066W,0,0,0,0,0
YAL065C,0,0,0,0,0
YAL062W,0,0,0,0,0


In [44]:
# Split data into 2 screens per treatment: LOAEL (lower dose) and IC25 (higher dose)
# Start by duplicating every treatment column under the same name with a '2' suffix.
for treatment in ['FCCP','DNP','H2SO4','HCl','NaF']:
    original_data[treatment + '2'] = original_data[treatment]

In [51]:
low_dose_cols = ['FCCP','DNP','H2SO4','HCl','NaF']
high_dose_cols = ['FCCP2','DNP2','H2SO42','HCl2','NaF2']

# First set of columns: every negative score is set to -1, everything else is kept.
low_scores = original_data[low_dose_cols]
t1 = low_scores.where(~(low_scores < 0), other=-1)

# Second set of columns: scores above -2 are reset to 0, then the remaining negative
# scores are set to -1, so only the strongest mark is kept as a hit.
high_scores = original_data[high_dose_cols]
strongest_only = high_scores.where(~(high_scores > -2), other=0)
t2 = strongest_only.where(~(strongest_only < 0), other=-1)

In [52]:
# Put the two sets of columns back side by side (t1 columns first, then t2 columns).
original_data = t1.join(t2, how='outer')

# Prepare the final dataset

In [58]:
# Work on a copy so that original_data keeps its treatment-named columns.
data = original_data.copy()

In [59]:
# Dataset ids, one per data column. The labels are assigned to the columns by position
# below, so this list must follow the column order of `data`
# (FCCP, DNP, H2SO4, HCl, NaF, then FCCP2, DNP2, H2SO42, HCl2, NaF2).
# NOTE(review): the id-to-treatment mapping cannot be checked from this notebook - confirm
# against the datasets list.
dataset_ids = [21853, 21855, 21861, 21859, 21857, 21852, 21854, 21860, 21858, 21856]
datasets = datasets.reindex(index=dataset_ids)

In [60]:
# Label the columns, by position, with a two-level index: (dataset_id, 'value').
value_labels = ['value'] * datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, value_labels],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [61]:
# Preview the relabelled table.
data.head(n=5)

dataset_id,21853,21855,21861,21859,21857,21852,21854,21860,21858,21856
data_type,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
YAL068C,0,0,0,0,0,0,0,0,0,0
YAL067C,0,0,0,0,0,0,0,0,0,0
YAL066W,0,0,0,0,0,0,0,0,0,0
YAL065C,0,0,0,0,0,0,0,0,0,0
YAL062W,0,0,0,0,0,0,0,0,0,0


## Subset to the genes currently in SGD

In [62]:
# Look up the gene id of every ORF in the data; ORFs that are not in the genes table
# come back as NaN and are counted as missing.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [63]:
# Attach the gene id of every ORF and keep only the ORFs that have one.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the assignment on the next line is not
# a write onto a slice of the unfiltered frame (which triggers SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Rows are identified by the (gene_id, orf) pair from here on.
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,21853,21855,21861,21859,21857,21852,21854,21860,21858,21856
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
1869,YAL068C,0,0,0,0,0,0,0,0,0,0
61,YAL067C,0,0,0,0,0,0,0,0,0,0
60,YAL066W,0,0,0,0,0,0,0,0,0,0
1727,YAL065C,0,0,0,0,0,0,0,0,0,0
57,YAL062W,0,0,0,0,0,0,0,0,0,0


# Normalize

In [64]:
# Compute the normalized scores for every dataset.
# NOTE(review): normalize_phenotypic_scores is loaded from yp_utils.py and is not visible
# here; the result is assumed to have the same rows and column order as `data`, since the
# columns are relabelled by position afterwards - confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [65]:
# Assign proper column names: label the normalized columns, by position, as (dataset_id, 'valuez').
valuez_labels = ['valuez'] * datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, valuez_labels],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [66]:
# Propagate missing raw values into the normalized table.
# The mask is relabelled to data_norm's columns first: `data` is labelled
# (dataset_id, 'value') and `data_norm` (dataset_id, 'valuez'), and assigning through a
# boolean DataFrame aligns the mask on labels. Without the relabelling no column matches
# and the assignment silently leaves data_norm untouched.
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns  # both frames list the same datasets in the same order
data_norm[nan_mask] = np.nan

# Raw and normalized values side by side, matched on the (gene_id, orf) row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,21853,21855,21861,21859,21857,21852,21854,21860,21858,21856,21853,21855,21861,21859,21857,21852,21854,21860,21858,21856
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1869,YAL068C,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,YAL067C,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60,YAL066W,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1727,YAL065C,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57,YAL062W,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Print out

In [67]:
# Write one tab-delimited file per data type ('value' and 'valuez'), with the dataset
# names as column headers and the ORF as the only row label.
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    output_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(output_path, sep='\t')

# Save to DB

In [68]:
from IO.save_data_to_db3 import *

In [69]:
# Store the combined table in the database under this paper's PMID.
# NOTE(review): judging by the log printed below, this deletes all existing datasets for
# the PMID before inserting the new data - confirm before re-running.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/10 [00:00<?, ?it/s]

Deleting all datasets for PMID 32670247...
Inserting the new data...


100%|██████████| 10/10 [01:23<00:00,  8.34s/it]

Updating the data_modified_on field...



