In [1]:
# Execute the shared helper script inside this notebook's namespace.
# NOTE(review): this notebook has no explicit imports, so the names used below
# (pd, np, sys, clean_orf, translate_sc, looks_like_orf, is_essential,
# path_to_genes, normalize_phenotypic_scores) presumably all come from this
# script -- confirm.
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used to locate the datasets list file and passed to save_data_to_db.
paper_pmid = 34843885
# Prefix of the tab-delimited text files written in the "Print out" section.
paper_name = 'guan_zhang_2022' 

In [3]:
# Load the headerless, tab-delimited list of (dataset_id, name) pairs for this paper.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset_id so it can be reordered by ID later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# One sheet per compound. Passing the list of sheet names makes read_excel open
# and parse the workbook once and return a dict {sheet name: DataFrame}, instead
# of re-reading the whole file for every sheet.
sheets = ['4-NQO','FA','DCA']
sheet_frames = pd.read_excel('raw_data/1-s2.0-S0887233321002034-mmc1.xlsx', sheet_name=sheets, skiprows=1)
# Keep the frames in the same order as `sheets`; later cells rely on that order.
original_data_list = [sheet_frames[s] for s in sheets]

In [6]:
# Report the size of the first sheet as loaded (rows x columns).
n_rows, n_cols = original_data_list[0].shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 6060 x 19


In [7]:
# Preview the first sheet (4-NQO) exactly as loaded from the workbook.
original_data_list[0].head()

Unnamed: 0,Gene ORF,DMSO_1,DMSO_2,DMSO_3,NQO_IC50_1,NQO_IC50_2,NQO_IC50_3,NQO_IC20_1,NQO_IC20_2,NQO_IC20_3,NQO_IC10_1,NQO_IC10_2,NQO_IC10_3,NQO_50%IC10_1,NQO_50%IC10_2,NQO_50%IC10_3,NQO_25%IC10_1,NQO_25%IC10_2,NQO_25%IC10_3
0,YAL001C,142,109,139,26,34,20,31,42,16,17,27,20,67,66,98,134,127,154
1,YAL003W,265,150,223,26,6,20,15,16,24,17,17,24,117,79,125,240,162,229
2,YAL025C,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,YAL032C,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,YAL033W,81,70,119,30,28,39,31,21,36,16,52,25,64,39,58,64,75,118


In [8]:
# Clean every loaded sheet in place: standardize the ORF names, index the rows by
# ORF and coerce all count columns to numeric.
# The loop runs over the list itself rather than a hard-coded np.arange(3), so it
# stays in sync with however many sheets were loaded.
for sheet_df in original_data_list:
    sheet_df['orf'] = sheet_df['Gene ORF'].astype(str)
    sheet_df['orf'] = clean_orf(sheet_df['orf'])
    sheet_df['orf'] = translate_sc(sheet_df['orf'], to='orf')

    # Print the rows whose identifier does not pass looks_like_orf, for manual review
    t = looks_like_orf(sheet_df['orf'])
    print(sheet_df.loc[~t,])

    sheet_df.set_index('orf', inplace=True)
    sheet_df.drop(columns=['Gene ORF'], inplace=True)

    # Entries that cannot be parsed as numbers become NaN instead of raising
    for c in sheet_df.columns:
        sheet_df[c] = pd.to_numeric(sheet_df[c], errors='coerce')

Empty DataFrame
Columns: [Gene ORF, DMSO_1, DMSO_2, DMSO_3, NQO_IC50_1, NQO_IC50_2, NQO_IC50_3, NQO_IC20_1, NQO_IC20_2, NQO_IC20_3, NQO_IC10_1, NQO_IC10_2, NQO_IC10_3, NQO_50%IC10_1, NQO_50%IC10_2, NQO_50%IC10_3, NQO_25%IC10_1, NQO_25%IC10_2, NQO_25%IC10_3, orf]
Index: []
Empty DataFrame
Columns: [Gene ORF, DMSO_1, DMSO_2, DMSO_3, FA_IC50_1, FA_IC50_2, FA_IC50_3, FA_IC20_1, FA_IC20_2, FA_IC20_3, FA_IC10_1, FA_IC10_2, FA_IC10_3, orf]
Index: []
Empty DataFrame
Columns: [Gene ORF, H2O_1, H2O_2, H2O_3, DCA_IC50_1, DCA_IC50_2, DCA_IC50_3, DCA_IC20_1, DCA_IC20_2, DCA_IC20_3, DCA_IC10_1, DCA_IC10_2, DCA_IC10_3, orf]
Index: []


In [9]:
# Preview the first sheet after cleaning: rows are now indexed by ORF.
original_data_list[0].head()

Unnamed: 0_level_0,DMSO_1,DMSO_2,DMSO_3,NQO_IC50_1,NQO_IC50_2,NQO_IC50_3,NQO_IC20_1,NQO_IC20_2,NQO_IC20_3,NQO_IC10_1,NQO_IC10_2,NQO_IC10_3,NQO_50%IC10_1,NQO_50%IC10_2,NQO_50%IC10_3,NQO_25%IC10_1,NQO_25%IC10_2,NQO_25%IC10_3
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
YAL001C,142,109,139,26,34,20,31,42,16,17,27,20,67,66,98,134,127,154
YAL003W,265,150,223,26,6,20,15,16,24,17,17,24,117,79,125,240,162,229
YAL025C,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
YAL032C,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
YAL033W,81,70,119,30,28,39,31,21,36,16,52,25,64,39,58,64,75,118


In [10]:
# Name of the control condition in each sheet, in the same order as original_data_list.
controls = ['DMSO','DMSO','H2O']
assert len(controls) == len(original_data_list), 'Need exactly one control per sheet'

original_data_list2 = []
for sheet_df, control in zip(original_data_list, controls):
    # Condition label = column name without its trailing replicate number
    # (e.g. 'NQO_IC50_1' -> 'NQO_IC50').
    exps = np.asarray(['_'.join(c.split('_')[:-1]) for c in sheet_df.columns])

    # Average the replicates of each condition. Grouping the transposed frame
    # replaces groupby(..., axis=1), which is deprecated in recent pandas.
    t = sheet_df.T.groupby(exps).mean().T

    # Keep only mutants whose mean control count is greater than 10
    # (i.e. mutants with a control mean of 10 or less are removed).
    t = t.loc[t[control] > 10, :]

    # Normalize by control, then discard the control column itself
    t = t.div(t[control], axis=0)
    t = t.drop(columns=control)

    # Average rows that share the same ORF
    t = t.groupby(t.index).mean()

    # Drop mutants without a single finite value, then blank any remaining infinities
    n = np.sum(~np.isnan(t.values) & ~np.isinf(t.values), axis=1)
    t = t.loc[n>0,:]
    t[np.isinf(t)] = np.nan

    original_data_list2.append(t)

In [11]:
# Place the per-compound tables side by side, aligned on ORF.
original_data = pd.concat(original_data_list2, axis='columns')

In [12]:
# Size of the combined table (ORFs x treatment conditions).
original_data.shape

(3815, 11)

In [13]:
# Preview the combined control-normalized ratios.
original_data.head()

Unnamed: 0,NQO_25%IC10,NQO_50%IC10,NQO_IC10,NQO_IC20,NQO_IC50,FA_IC10,FA_IC20,FA_IC50,DCA_IC10,DCA_IC20,DCA_IC50
YAL001C,1.064103,0.592308,0.164103,0.228205,0.205128,1.683616,0.988701,0.932203,1.06383,0.834515,0.87234
YAL003W,0.989028,0.503135,0.090909,0.086207,0.081505,1.383971,0.962919,0.869617,1.088398,0.911602,0.588398
YAL024C,0.720812,0.324873,0.096447,0.111675,0.060914,1.570815,1.103004,1.540773,0.6,0.426415,0.316981
YAL033W,0.951852,0.596296,0.344444,0.325926,0.359259,1.365079,0.857143,0.828042,0.989011,1.040293,0.798535
YAL037C-A,0.916914,0.724036,0.436202,0.602374,0.341246,1.721823,1.057554,0.839329,1.063253,0.855422,0.777108


In [14]:
# Label the row index 'orf'; later cells refer to it by that name.
original_data = original_data.rename_axis(index='orf')

In [20]:
# Remove essential genes
# Step 1 of 3: flag every ORF with the is_essential helper. 'ess' is a temporary
# column that is used as a row filter and then dropped in the next two cells.
original_data['ess'] = is_essential(original_data.index.values).values

In [22]:
# Step 2 of 3: keep only the rows that are not flagged as essential.
is_non_essential = ~original_data['ess']
original_data = original_data.loc[is_non_essential]

In [25]:
# Step 3 of 3: discard the temporary 'ess' flag column.
original_data = original_data.drop(columns=['ess'])

In [23]:
# Size of the table after removing essential genes.
# NOTE(review): the recorded output shows 12 columns, and the execution counts
# show this cell was run before the cell that drops 'ess'. After a fresh
# top-to-bottom run this should presumably report 11 columns -- confirm.
original_data.shape

(3151, 12)

# Prepare the final dataset

In [26]:
# Work on a copy: the columns of `data` are relabelled below, while original_data
# keeps its descriptive condition names.
data = original_data.copy()

In [27]:
# Dataset ID of each column of `data`, listed in column order (the IDs are
# assigned to the columns by position in the next cell).
dataset_ids = [22101, 22100, 22099, 22098, 22097, 22105, 22104, 22103, 22109, 22108, 22107]

# reindex() silently creates all-NaN rows for unknown labels, which would end up
# as NaN dataset names in the output files; fail loudly instead.
missing_ids = [i for i in dataset_ids if i not in datasets.index]
assert not missing_ids, 'Dataset IDs not found in the datasets list: %s' % missing_ids

datasets = datasets.reindex(index=dataset_ids)

In [28]:
# Relabel the columns of `data` as (dataset_id, 'value') pairs, in dataset order.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [29]:
# Preview the table with its (dataset_id, data_type) column labels.
data.head()

dataset_id,22101,22100,22099,22098,22097,22105,22104,22103,22109,22108,22107
data_type,value,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
YAL024C,0.720812,0.324873,0.096447,0.111675,0.060914,1.570815,1.103004,1.540773,0.6,0.426415,0.316981
YAL037C-A,0.916914,0.724036,0.436202,0.602374,0.341246,1.721823,1.057554,0.839329,1.063253,0.855422,0.777108
YAL051W,1.028169,1.140845,3.633803,5.28169,6.253521,1.5,1.484375,2.359375,1.352941,0.882353,0.529412
YAL064C-A,1.194986,0.888579,1.445682,1.802228,1.626741,1.479695,0.992386,1.28934,1.014205,1.164773,0.852273
YAR035C-A,1.382688,0.984055,1.382688,1.947608,1.494305,1.266026,1.049679,1.203526,0.929791,0.825427,0.791271


## Subset to the genes currently in SGD

In [30]:
# Load the gene table and key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the gene id of every ORF in `data`; ORFs absent from the table yield NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 16


In [31]:
# Attach the gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: assigning a column into the result
# of a boolean .loc selection otherwise raises SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,22101,22100,22099,22098,22097,22105,22104,22103,22109,22108,22107
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
22,YAL024C,0.720812,0.324873,0.096447,0.111675,0.060914,1.570815,1.103004,1.540773,0.6,0.426415,0.316981
6451,YAL037C-A,0.916914,0.724036,0.436202,0.602374,0.341246,1.721823,1.057554,0.839329,1.063253,0.855422,0.777108
48,YAL051W,1.028169,1.140845,3.633803,5.28169,6.253521,1.5,1.484375,2.359375,1.352941,0.882353,0.529412
1867,YAL064C-A,1.194986,0.888579,1.445682,1.802228,1.626741,1.479695,0.992386,1.28934,1.014205,1.164773,0.852273
6314,YAR035C-A,1.382688,0.984055,1.382688,1.947608,1.494305,1.266026,1.049679,1.203526,0.929791,0.825427,0.791271


# Normalize

In [32]:
# Compute normalized scores with the shared helper.
# NOTE(review): the helper is defined outside this notebook. The following cells
# relabel its columns by position and join it back on the row index, so it
# presumably returns the same rows and column order as `data` -- confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [33]:
# Assign proper column names: (dataset_id, 'valuez') pairs, in dataset order.
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [34]:
# Re-impose the missing values of `data` on the normalized scores.
# A DataFrame mask is aligned to the target by label before it is applied. `data`
# is labelled (dataset_id, 'value') and `data_norm` (dataset_id, 'valuez'), so a
# mask taken straight from `data` shares no column with `data_norm` and the
# assignment would silently change nothing. Relabel the mask with the columns of
# `data_norm` (same dataset order) so that it lines up.
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns
data_norm[nan_mask] = np.nan

# Put raw values and normalized values side by side, matched on (gene_id, orf).
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,22101,22100,22099,22098,22097,22105,22104,22103,22109,22108,...,22100,22099,22098,22097,22105,22104,22103,22109,22108,22107
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
22,YAL024C,0.720812,0.324873,0.096447,0.111675,0.060914,1.570815,1.103004,1.540773,0.6,0.426415,...,-0.78915,-0.231624,-0.191234,-0.152706,0.63638,0.227921,0.896317,-1.713594,-2.281957,-1.970309
6451,YAL037C-A,0.916914,0.724036,0.436202,0.602374,0.341246,1.721823,1.057554,0.839329,1.063253,0.855422,...,0.039437,0.06435,0.093828,0.005002,1.085214,0.066864,-0.260395,-0.114697,-0.65203,-0.344705
48,YAL051W,1.028169,1.140845,3.633803,5.28169,6.253521,1.5,1.484375,2.359375,1.352941,0.882353,...,0.904655,2.849908,2.812186,3.331081,0.425897,1.579344,2.246229,0.885149,-0.54971,-1.219804
1867,YAL064C-A,1.194986,0.888579,1.445682,1.802228,1.626741,1.479695,0.992386,1.28934,1.014205,1.164773,...,0.380999,0.943749,0.79086,0.728185,0.365546,-0.164066,0.481694,-0.283985,0.52329,-0.079154
6314,YAR035C-A,1.382688,0.984055,1.382688,1.947608,1.494305,1.266026,1.049679,1.203526,0.929791,0.825427,...,0.579188,0.888872,0.875316,0.65368,-0.269538,0.03896,0.340182,-0.575334,-0.765989,-0.294668


In [35]:
# Size of the final table: two columns (value, valuez) per dataset.
data_all.shape

(3135, 22)

# Print out

In [36]:
# Write one tab-delimited file per data type, with dataset names as column
# headers and rows keyed by ORF only.
for data_type in ('value', 'valuez'):
    out_path = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(out_path, sep='\t')

# Save to DB

In [39]:
# Make the database helper importable.
# NOTE(review): this is a machine-specific absolute path, so the cell only works
# where that directory exists.
# NOTE(review): `sys` is not imported in this notebook; it is presumably provided
# by yp_utils.py -- confirm.
sys.path.append('/Users/abaryshnikova/Lab/Utils/Python/')
# NOTE(review): the star import is presumably what brings save_data_to_db (called
# in the next cell) into scope -- confirm.
from IO.save_data_to_db3 import *

In [40]:
# Store raw and normalized values in the database under this paper's PMID.
# According to the messages the helper prints, it first deletes all existing
# datasets for the PMID and then inserts the new data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/11 [00:00<?, ?it/s]

Deleting all datasets for PMID 34843885...
Inserting the new data...


100%|██████████| 11/11 [00:41<00:00,  3.82s/it]

Updating the data_modified_on field...



