In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers for this study. The PubMed ID is used to build the dataset-list
# path and is passed to the database upload; the short name prefixes the
# exported text files.
paper_pmid = 29507053
paper_name = 'salignon_yvert_2018'

In [3]:
# Header-less, tab-separated list of the datasets attached to this paper;
# the columns are labelled dataset_id and name on load.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by dataset_id. Rebinding (instead of mutating in
# place) and skipping the step when the index is already in place keeps this
# cell safe to re-run: a second in-place set_index would raise KeyError because
# 'dataset_id' is no longer a column.
if datasets.index.name != 'dataset_id':
    datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
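# Data obtained from R by running:
# > load(file="sup_file_Met.rda")
# > write.table(tfit.summary, file="Met_tfit_summary.txt", sep='\t', quote=FALSE, row.names=FALSE)
# NOTE(review): only the Met export is documented here; Salt_tfit_summary.txt (also read below)
# was presumably produced the same way from the corresponding Salt .rda file -- confirm.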

In [6]:
def remove_trailing_one(s):
    """Return ``s`` without its final character when that character is ``'1'``.

    Only a single trailing ``'1'`` is removed. Any other string, including the
    empty string, is returned unchanged.

    Parameters
    ----------
    s : str
        ORF name (or any string) to clean.

    Returns
    -------
    str
        ``s`` with one trailing ``'1'`` stripped, or ``s`` itself.
    """
    # str.endswith is safe on '' whereas indexing s[-1] raises IndexError.
    return s[:-1] if s.endswith('1') else s

In [7]:
# Raw summary tables, in the order in which they are loaded and later joined.
files = [
    'Salt_tfit_summary.txt',
    'Met_tfit_summary.txt',
]
# Not read by any later cell in this notebook; the loading loop collects its
# results in `original_data_list2` instead.
original_data_list = []

In [8]:
# Load every summary table, tidy its ORF names and reduce it to one row of
# mean values per ORF. The processed tables are collected in file order.
value_columns = ['orf','w.N','w.S','w.6h','w.12h','w.18h','w.24h','w.42h']

original_data_list2 = []
for file_name in files:
    table = pd.read_csv('raw_data/' + file_name, sep='\t')
    print('Original data dimensions: %d x %d' % (table.shape))

    # ORF clean-up, one step at a time: cast to str, run clean_orf (per the
    # original note it removes white space and capitalises), drop a single
    # trailing "1", then translate names with translate_sc(to='orf').
    table['orf'] = table['orf'].astype(str)
    table['orf'] = clean_orf(table['orf'])
    table['orf'] = table['orf'].apply(remove_trailing_one)
    table['orf'] = translate_sc(table['orf'], to='orf')

    # Show the rows whose identifier still fails the looks_like_orf check.
    is_orf = looks_like_orf(table['orf'])
    print(table.loc[~is_orf])

    # Keep only the selected columns and average rows that share an ORF.
    table = table[value_columns].set_index('orf')
    table = table.groupby(level='orf').mean()
    print(table.shape)

    original_data_list2.append(table)

Original data dimensions: 3568 x 17
Empty DataFrame
Columns: [orf, gene, w.N, w.S, w.rat, w.exp.1, w.exp.2, w.6h, w.12h, w.18h, w.24h, w.42h, wd.6h, wd.12h, wd.18h, wd.24h, wd.42h]
Index: []
(3539, 7)
Original data dimensions: 3568 x 17
Empty DataFrame
Columns: [orf, gene, w.N, w.S, w.rat, w.exp.1, w.exp.2, w.6h, w.12h, w.18h, w.24h, w.42h, wd.6h, wd.12h, wd.18h, wd.24h, wd.42h]
Index: []
(3539, 7)


In [9]:
# Unpack the two processed tables (same order as `files`); this raises
# ValueError if the list does not hold exactly two items.
original_data1, original_data2 = original_data_list2

In [10]:
# Outer-join the two tables on their ORF index so ORFs present in only one of
# them are kept; the suffixes tell the identically named columns apart.
original_data = original_data1.join(
    original_data2,
    how='outer',
    lsuffix="_1",
    rsuffix="_2",
)

In [11]:
# Preview the first rows of the joined table (suffix _1 = first file, _2 = second file).
original_data.head()

Unnamed: 0_level_0,w.N_1,w.S_1,w.6h_1,w.12h_1,w.18h_1,w.24h_1,w.42h_1,w.N_2,w.S_2,w.6h_2,w.12h_2,w.18h_2,w.24h_2,w.42h_2
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
YAL001C,0.990685,0.98661,0.987395,0.993276,0.986079,0.988952,0.984499,1.03093,1.000451,1.001939,1.008323,1.006736,1.004306,1.002895
YAL002W,0.96795,0.987878,0.976784,0.986531,0.985276,0.965431,0.975929,0.98711,0.991494,0.982337,0.995215,0.97836,0.979594,0.997541
YAL004W,0.986492,0.984058,0.983449,0.980443,0.974075,0.975824,0.98245,0.962643,0.986803,0.983308,0.987761,0.982827,0.997921,0.976104
YAL005C,0.982353,0.987904,0.984966,0.982963,0.991416,0.975351,0.978536,0.978855,1.003107,0.998398,0.981444,0.999223,0.991306,1.005087
YAL007C,0.997619,0.986944,0.980337,0.982954,0.986719,0.984199,1.00896,0.984227,0.997666,0.99784,1.003035,0.99655,0.98532,0.983298


In [12]:
# Dataset IDs in the column order of the joined table: the first row labels the
# seven columns taken from the first file, the second row those from the second.
dataset_ids = [
    16167, 16168, 16169, 16175, 16176, 16177, 16178,
    16170, 16171, 16172, 16179, 16180, 16181, 16182,
]

# Prepare the final dataset

In [13]:
# Work on a copy so the joined table in `original_data` stays untouched by the
# relabelling and filtering done in the following cells.
data = original_data.copy()

In [14]:
# Put the dataset list in the same order as `dataset_ids`. reindex() silently
# creates all-NaN rows for IDs that are absent from the list, and those NaN
# names would later be written out as column headers, so fail loudly instead.
missing_ids = [dataset_id for dataset_id in dataset_ids if dataset_id not in datasets.index]
if missing_ids:
    raise KeyError('Dataset IDs missing from the datasets list: %s' % missing_ids)
datasets = datasets.reindex(index=dataset_ids)

In [15]:
# Label every column with a (dataset_id, data_type) pair. The assignment is
# positional, so the column order of `data` must match the order of `datasets`.
value_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(value_labels, names=['dataset_id','data_type'])
data.columns = idx

In [16]:
# Preview the table after relabelling its columns as (dataset_id, data_type).
data.head()

dataset_id,16167,16168,16169,16175,16176,16177,16178,16170,16171,16172,16179,16180,16181,16182
data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
YAL001C,0.990685,0.98661,0.987395,0.993276,0.986079,0.988952,0.984499,1.03093,1.000451,1.001939,1.008323,1.006736,1.004306,1.002895
YAL002W,0.96795,0.987878,0.976784,0.986531,0.985276,0.965431,0.975929,0.98711,0.991494,0.982337,0.995215,0.97836,0.979594,0.997541
YAL004W,0.986492,0.984058,0.983449,0.980443,0.974075,0.975824,0.98245,0.962643,0.986803,0.983308,0.987761,0.982827,0.997921,0.976104
YAL005C,0.982353,0.987904,0.984966,0.982963,0.991416,0.975351,0.978536,0.978855,1.003107,0.998398,0.981444,0.999223,0.991306,1.005087
YAL007C,0.997619,0.986944,0.980337,0.982954,0.986719,0.984199,1.00896,0.984227,0.997666,0.99784,1.003035,0.99655,0.98532,0.983298


## Subset to the genes currently in SGD

In [17]:
# Look up the gene id of every ORF in `data` from the gene table, keyed by
# systematic name; ORFs that are absent from the table come back as NaN.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 19


In [18]:
# Attach the gene ids, drop ORFs that have none, and index by (gene_id, orf).
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignment below is unambiguous and cannot trigger
# pandas' SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# Casting to int is safe now that rows with a missing gene id are gone.
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

In [19]:
# Preview the table now that rows are indexed by (gene_id, orf).
data.head()

Unnamed: 0_level_0,dataset_id,16167,16168,16169,16175,16176,16177,16178,16170,16171,16172,16179,16180,16181,16182
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1,YAL001C,0.990685,0.98661,0.987395,0.993276,0.986079,0.988952,0.984499,1.03093,1.000451,1.001939,1.008323,1.006736,1.004306,1.002895
2,YAL002W,0.96795,0.987878,0.976784,0.986531,0.985276,0.965431,0.975929,0.98711,0.991494,0.982337,0.995215,0.97836,0.979594,0.997541
1863,YAL004W,0.986492,0.984058,0.983449,0.980443,0.974075,0.975824,0.98245,0.962643,0.986803,0.983308,0.987761,0.982827,0.997921,0.976104
4,YAL005C,0.982353,0.987904,0.984966,0.982963,0.991416,0.975351,0.978536,0.978855,1.003107,0.998398,0.981444,0.999223,0.991306,1.005087
5,YAL007C,0.997619,0.986944,0.980337,0.982954,0.986719,0.984199,1.00896,0.984227,0.997666,0.99784,1.003035,0.99655,0.98532,0.983298


# Normalize

In [20]:
# Compute the normalised scores with normalize_phenotypic_scores, a helper
# defined outside this notebook (presumably in yp_utils.py -- confirm, along
# with the exact meaning of has_tested=True).
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [21]:
# Give the normalised table (dataset_id, 'valuez') column labels. The
# assignment is positional, so its column order must match `datasets`.
valuez_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(valuez_labels, names=['dataset_id','data_type'])
data_norm.columns = idx

In [22]:
# Blank out the normalised score wherever the raw value is missing.
# Assigning through a boolean DataFrame aligns the mask on row AND column
# labels. The raw columns are labelled (dataset_id, 'value') while data_norm
# uses (dataset_id, 'valuez'), so the unmodified mask matches no column and the
# assignment silently does nothing. Relabel the mask columns first; this relies
# on both frames sharing the same positional column order, as the positional
# relabelling of data_norm already assumes.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Combine raw and normalised values side by side on the shared row index.
data_all = data.join(data_norm)

In [23]:
# Preview the combined table holding both 'value' and 'valuez' columns.
data_all.head()


Unnamed: 0_level_0,dataset_id,16167,16168,16169,16175,16176,16177,16178,16170,16171,16172,...,16176,16177,16178,16170,16171,16172,16179,16180,16181,16182
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,0.990685,0.98661,0.987395,0.993276,0.986079,0.988952,0.984499,1.03093,1.000451,1.001939,...,-0.239996,-0.040559,-0.264403,1.582554,-0.049325,0.300111,0.610109,0.406135,0.401508,0.807543
2,YAL002W,0.96795,0.987878,0.976784,0.986531,0.985276,0.965431,0.975929,0.98711,0.991494,0.982337,...,-0.269175,-0.933124,-0.597513,-0.413594,-0.462921,-0.795946,-0.048942,-1.102937,-0.879579,0.59258
1863,YAL004W,0.986492,0.984058,0.983449,0.980443,0.974075,0.975824,0.98245,0.962643,0.986803,0.983308,...,-0.675877,-0.538723,-0.344024,-1.528201,-0.67956,-0.741615,-0.423694,-0.865386,0.070493,-0.267987
4,YAL005C,0.982353,0.987904,0.984966,0.982963,0.991416,0.975351,0.978536,0.978855,1.003107,0.998398,...,-0.046215,-0.556676,-0.496175,-0.789662,0.073291,0.102087,-0.741275,0.006593,-0.272412,0.895519
5,YAL007C,0.997619,0.986944,0.980337,0.982954,0.986719,0.984199,1.00896,0.984227,0.997666,0.99784,...,-0.216767,-0.220914,0.686432,-0.544955,-0.177937,0.070935,0.344256,-0.135589,-0.582743,0.020811


# Print out

In [24]:
# Write one tab-separated file per data type, using the dataset names as column
# headers and keeping the ORF as the only row index.
for data_type in ['value','valuez']:
    table_out = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Positional relabelling: the column order must match the order of `datasets`.
    table_out.columns = datasets['name'].values
    table_out = table_out.droplevel('gene_id', axis=0)
    table_out.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [25]:
from IO.save_data_to_db3 import *

In [26]:
# Upload the combined table for this PMID. Per the recorded output below, this
# first deletes all existing datasets for the PMID and then inserts the new
# data, so re-running the cell replaces what is stored.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 29507053...
Inserting the new data...


100%|██████████| 14/14 [01:08<00:00,  4.86s/it]

Updating the data_modified_on field...



