In [1]:
# Run the YeastPhenome utilities script inside this notebook's namespace.
# The notebook never imports or defines pd, np, clean_orf, translate_sc,
# looks_like_orf, path_to_genes or normalize_phenotypic_scores itself, so they
# are presumably provided by this script -- TODO confirm against yp_utils.py.
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PMID of the paper; used to locate the datasets list and passed to the DB save
paper_pmid: int = 34663920
# Prefix of the exported value/valuez text files
paper_name: str = 'vieitez_beltrao_2021'

In [3]:
# Header-less, tab-separated list of (dataset_id, name) pairs for this paper
datasets_file = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_file,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset_id so it can later be reindexed by ID
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw Excel workbook; note that the sheet name contains an en dash, not a hyphen
raw_workbook = 'raw_data/41587_2021_1051_MOESM5_ESM.xlsx'
original_data = pd.read_excel(
    raw_workbook,
    sheet_name='Table S3 – S_Scores of chemical',
)

In [6]:
# Report rows x columns of the sheet as loaded
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 540817 x 8


In [7]:
# Preview the first rows of the loaded sheet
original_data.head()

Unnamed: 0,Strain,Condition,Score,Plate,PBY ID,Mutant_type,Systematic_name,qvalue
0,PBY365__Y8205__STE20-T575A__040102,48H-20,-0.98405,P4,PBY365,phosphomutant,YHL007C,0.814834
1,PBY701__Y8205__BIM1-S172A__040103,48H-20,-0.908803,P4,PBY701,phosphomutant,YER016W,0.833546
2,PBY445__Y8205__PHO23-S264A__040104,48H-20,-0.976347,P4,PBY445,phosphomutant,YNL097C,0.814834
3,PBY879__Y8205__RPC53-S234A__040106,48H-20,-0.656309,P4,PBY879,phosphomutant,YDL150W,0.946614
4,PBY578__Y8205__VMA2-T83A__040108,48H-20,-0.47649,P4,PBY578,phosphomutant,YBR127C,0.984735


In [8]:
# Keep only the rows whose Mutant_type is exactly 'KO'
is_ko = original_data['Mutant_type'] == 'KO'
original_data = original_data.loc[is_ko]

In [9]:
# Dimensions after the KO filter (the label text is intentionally unchanged)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 492501 x 8


In [10]:
# Reshape long -> wide: one row per Systematic_name, one column per Condition.
# pivot_table's default aggfunc ('mean') averages repeated name/condition pairs.
t = original_data.pivot_table(
    values='Score',
    index='Systematic_name',
    columns='Condition',
)

In [14]:
# Work on a copy of the pivoted table and turn Systematic_name back into a column
original_data = t.copy()
original_data = original_data.reset_index()

In [15]:
# Seed the 'orf' column with the systematic names, coerced to str
systematic_names = original_data['Systematic_name']
original_data['orf'] = systematic_names.astype(str)

In [16]:
# Strip white space from the ORF names and upper-case them
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [17]:
# Translate every name to its ORF identifier
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [18]:
# Sanity check: list any rows whose translated name still does not look like an ORF
is_orf = looks_like_orf(original_data['orf'])
print(original_data.loc[~is_orf,])

Condition   Systematic_name    48H-20   48H-24D    48H-37    48H-39    48H-42  \
index_input                                                                     
3193               YLR287-A -2.270455  0.309799 -0.138605 -0.850785 -0.859868   

Condition     48H-5FU   48H-6AU  48H-6AU39  48H-AA  ...  72H-ETOH39  72H-GLYC  \
index_input                                         ...                         
3193         1.451824  2.819405  -0.937676     NaN  ...   -1.039763  1.220097   

Condition     72H-MAL  72H-NACL439  72H-NACL6  72H-NACL639  72H-PARA  \
index_input                                                            
3193         1.051405    -0.347562   0.395891      0.05002  1.942319   

Condition    72H-RAPA39  72H-XLGLU       orf  
index_input                                   
3193           0.238927    1.64344  YLR287-A  

[1 rows x 104 columns]


In [19]:
# 'YLR287-A' is the one name flagged by the check; rewrite it as 'YLR287C-A'
needs_fix = original_data['orf'] == 'YLR287-A'
original_data.loc[needs_fix, 'orf'] = 'YLR287C-A'

In [20]:
# Index the table by ORF
original_data = original_data.set_index('orf')

In [21]:
# Average rows that share an ORF. The frame still carries the string column
# 'Systematic_name' at this point; numeric_only=True excludes it explicitly
# instead of relying on pandas silently dropping non-numeric columns, which
# pandas 2.x no longer does (mean() raises TypeError on object columns).
original_data = original_data.groupby(original_data.index).mean(numeric_only=True)

In [22]:
# Rows x columns after averaging rows that share an ORF
original_data.shape

(4859, 102)

In [23]:
# Preview the ORF-indexed, per-condition score matrix
original_data.head()

Condition,48H-20,48H-24D,48H-37,48H-39,48H-42,48H-5FU,48H-6AU,48H-6AU39,48H-AA,48H-AMPHO,...,72H-CU,72H-ETOH39,72H-GLYC,72H-MAL,72H-NACL439,72H-NACL6,72H-NACL639,72H-PARA,72H-RAPA39,72H-XLGLU
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YAL002W,0.896462,-7.404396,-1.306682,-2.72889,-0.550666,-3.135267,1.623408,-4.119181,-3.720051,0.280291,...,-11.64635,-0.423683,-0.550796,1.081764,-0.805571,1.626975,-0.763178,2.139201,-2.406891,0.517362
YAL004W,-0.416342,0.035906,1.474327,0.237645,-2.37039,-1.353389,0.682456,0.518715,1.832033,0.822571,...,-1.942069,-2.335991,-1.258885,0.933605,0.390778,-1.151692,0.006402,-1.946547,-0.580464,1.591234
YAL005C,-0.685679,-1.134804,-0.151774,-1.397882,-0.621069,1.404865,1.067774,-2.056695,-0.318258,0.085912,...,0.049775,-1.989421,-0.907996,1.803635,-0.697065,-0.966572,-0.096049,-0.865883,-0.504526,0.342426
YAL007C,-0.105546,0.075773,1.284279,0.780764,-0.742151,-0.567071,-1.481874,-0.206407,0.237673,0.208001,...,-2.292795,-0.341254,-2.122047,1.120219,0.508619,0.966392,-0.012325,0.137126,0.428717,0.124757
YAL008W,-0.271289,1.448353,0.097858,0.98425,0.564519,-0.352081,-0.139841,0.364442,0.07497,1.089034,...,0.931726,0.883637,-2.466913,-1.427905,0.863191,-0.328052,-0.769127,0.23491,1.224057,-0.873417


# Prepare the final dataset

In [52]:
# Start the final dataset from an independent copy of the processed scores
data = original_data.copy(deep=True)

In [58]:
# Tab-separated lookup table that carries a dataset_id for each condition label
cond2dt = pd.read_csv(
    'raw_data/condition_2_dataset.txt',
    sep='\t',
)
print(cond2dt.shape)

(102, 3)


In [59]:
# Upper-case the condition labels before they are matched against the columns of `data`
condition_labels = cond2dt['condition_id_fix']
cond2dt['condition_id_fix'] = condition_labels.str.upper()

In [60]:
# Key the lookup table by the (upper-cased) condition label
cond2dt = cond2dt.set_index('condition_id_fix')

In [61]:
# Put the lookup rows in the same order as the columns of `data`
condition_order = data.columns
cond2dt = cond2dt.reindex(index=condition_order)

In [63]:
# Show any condition that ended up without a dataset_id after the reindex
unmatched = cond2dt['dataset_id'].isnull()
cond2dt.loc[unmatched]

Unnamed: 0_level_0,Unnamed: 0,dataset_id
Condition,Unnamed: 1_level_1,Unnamed: 2_level_1


In [64]:
# Preview the condition -> dataset_id lookup in column order
cond2dt.head()

Unnamed: 0_level_0,Unnamed: 0,dataset_id
Condition,Unnamed: 1_level_1,Unnamed: 2_level_1
48H-20,1,22111
48H-24D,0,22094
48H-37,2,22112
48H-39,3,22113
48H-42,5,22115


In [65]:
# Reorder the datasets table so its rows follow the condition order of cond2dt
dataset_ids = cond2dt['dataset_id'].values
datasets = datasets.reindex(dataset_ids, axis=0)

In [66]:
# Label every column as (dataset_id, 'value'), following the order of `datasets`
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id','data_type'])

In [67]:
# Preview the scores under their (dataset_id, data_type) column labels
data.head()

dataset_id,22111,22094,22112,22113,22115,22157,22116,22117,22119,22121,...,22143,22150,22163,22173,22201,22203,22179,22186,22195,22159
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL002W,0.896462,-7.404396,-1.306682,-2.72889,-0.550666,-3.135267,1.623408,-4.119181,-3.720051,0.280291,...,-11.64635,-0.423683,-0.550796,1.081764,-0.805571,1.626975,-0.763178,2.139201,-2.406891,0.517362
YAL004W,-0.416342,0.035906,1.474327,0.237645,-2.37039,-1.353389,0.682456,0.518715,1.832033,0.822571,...,-1.942069,-2.335991,-1.258885,0.933605,0.390778,-1.151692,0.006402,-1.946547,-0.580464,1.591234
YAL005C,-0.685679,-1.134804,-0.151774,-1.397882,-0.621069,1.404865,1.067774,-2.056695,-0.318258,0.085912,...,0.049775,-1.989421,-0.907996,1.803635,-0.697065,-0.966572,-0.096049,-0.865883,-0.504526,0.342426
YAL007C,-0.105546,0.075773,1.284279,0.780764,-0.742151,-0.567071,-1.481874,-0.206407,0.237673,0.208001,...,-2.292795,-0.341254,-2.122047,1.120219,0.508619,0.966392,-0.012325,0.137126,0.428717,0.124757
YAL008W,-0.271289,1.448353,0.097858,0.98425,0.564519,-0.352081,-0.139841,0.364442,0.07497,1.089034,...,0.931726,0.883637,-2.466913,-1.427905,0.863191,-0.328052,-0.769127,0.23491,1.224057,-0.873417


## Subset to the genes currently in SGD

In [68]:
# Look up the gene id of every ORF in `data`; ORFs absent from the genes table come back as NaN
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [69]:
# Attach the gene ids, drop the ORFs that have none, and index by (gene_id, orf)
data['gene_id'] = gene_ids
has_gene_id = data['gene_id'].notnull()
data = data.loc[has_gene_id]
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index()
data = data.set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,22111,22094,22112,22113,22115,22157,22116,22117,22119,22121,...,22143,22150,22163,22173,22201,22203,22179,22186,22195,22159
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,0.896462,-7.404396,-1.306682,-2.72889,-0.550666,-3.135267,1.623408,-4.119181,-3.720051,0.280291,...,-11.64635,-0.423683,-0.550796,1.081764,-0.805571,1.626975,-0.763178,2.139201,-2.406891,0.517362
1863,YAL004W,-0.416342,0.035906,1.474327,0.237645,-2.37039,-1.353389,0.682456,0.518715,1.832033,0.822571,...,-1.942069,-2.335991,-1.258885,0.933605,0.390778,-1.151692,0.006402,-1.946547,-0.580464,1.591234
4,YAL005C,-0.685679,-1.134804,-0.151774,-1.397882,-0.621069,1.404865,1.067774,-2.056695,-0.318258,0.085912,...,0.049775,-1.989421,-0.907996,1.803635,-0.697065,-0.966572,-0.096049,-0.865883,-0.504526,0.342426
5,YAL007C,-0.105546,0.075773,1.284279,0.780764,-0.742151,-0.567071,-1.481874,-0.206407,0.237673,0.208001,...,-2.292795,-0.341254,-2.122047,1.120219,0.508619,0.966392,-0.012325,0.137126,0.428717,0.124757
6,YAL008W,-0.271289,1.448353,0.097858,0.98425,0.564519,-0.352081,-0.139841,0.364442,0.07497,1.089034,...,0.931726,0.883637,-2.466913,-1.427905,0.863191,-0.328052,-0.769127,0.23491,1.224057,-0.873417


# Normalize

In [70]:
# Normalize the scores with the externally defined helper; the result is
# relabelled as 'valuez' in the next cell.
# NOTE(review): the exact normalization and the meaning of has_tested are
# defined in the helper, not in this notebook -- confirm there.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [71]:
# Assign proper column names: (dataset_id, 'valuez'), following the order of `datasets`
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id','data_type'])

In [72]:
# Blank out normalized scores wherever the raw score is missing.
# Assigning through a boolean DataFrame aligns the mask on labels, and `data`
# is labelled (dataset_id, 'value') while `data_norm` is (dataset_id, 'valuez'),
# so the raw-data mask matched no column and the assignment did nothing.
# Relabel the mask with data_norm's columns first (both follow the same dataset
# order); rows are still aligned on the index.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Put raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,22111,22094,22112,22113,22115,22157,22116,22117,22119,22121,...,22143,22150,22163,22173,22201,22203,22179,22186,22195,22159
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,0.896462,-7.404396,-1.306682,-2.72889,-0.550666,-3.135267,1.623408,-4.119181,-3.720051,0.280291,...,-3.100076,-0.25836,-0.145236,0.202665,-0.846925,1.395911,-0.775045,1.423963,-1.62172,0.065825
1863,YAL004W,-0.416342,0.035906,1.474327,0.237645,-2.37039,-1.353389,0.682456,0.518715,1.832033,0.822571,...,-0.763498,-1.705237,-0.308759,0.159415,0.082678,-1.068992,-0.111843,-1.18708,-0.27843,0.515625
4,YAL005C,-0.685679,-1.134804,-0.151774,-1.397882,-0.621069,1.404865,1.067774,-2.056695,-0.318258,0.085912,...,-0.283905,-1.443018,-0.227726,0.413387,-0.762612,-0.904775,-0.200132,-0.49647,-0.22258,-0.007448
5,YAL007C,-0.105546,0.075773,1.284279,0.780764,-0.742151,-0.567071,-1.481874,-0.206407,0.237673,0.208001,...,-0.847945,-0.195994,-0.508094,0.21389,0.174245,0.809921,-0.127981,0.144514,0.463797,-0.098621
6,YAL008W,-0.271289,1.448353,0.097858,0.98425,0.564519,-0.352081,-0.139841,0.364442,0.07497,1.089034,...,-0.071551,0.730775,-0.587736,-0.529937,0.449759,-0.338356,-0.780171,0.207004,1.048749,-0.516714


# Print out

In [73]:
# Write one tab-separated file per data type, using dataset names as column headers
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    # Only the ORF is kept as the row label in the exported file
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [74]:
# Import only the function used below rather than star-importing the module,
# so it is explicit where save_data_to_db comes from and no other names leak
# into the notebook namespace.
from IO.save_data_to_db3 import save_data_to_db

In [75]:
# Write the combined value/valuez table to the database under this PMID.
# Per the logged output, this first deletes all existing datasets for the PMID,
# then inserts the new data and updates the data_modified_on field.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/102 [00:00<?, ?it/s]

Deleting all datasets for PMID 34663920...
Inserting the new data...


100%|██████████| 102/102 [11:09<00:00,  6.56s/it]

Updating the data_modified_on field...



