In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used to locate the datasets list file and passed to save_data_to_db
paper_pmid = 28592509
# Short paper identifier; used as the filename prefix of the exported tables
paper_name = 'acton_giaever_2017' 

In [3]:
# Tab-separated, header-less list of (dataset_id, name) pairs for this paper
datasets_path = 'extras/YeastPhenome_%s_datasets_list.txt' % paper_pmid
datasets = pd.read_csv(datasets_path, sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
# Key the datasets table by dataset ID (rebinding instead of mutating in place)
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Read the raw table from sheet 'Additional File 3' of the Excel workbook in raw_data/
original_data = pd.read_excel('raw_data/rsob160330_si_003.xlsx', sheet_name='Additional File 3')

In [6]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4759 x 39


In [7]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,ORF,gene,description,YKO_ade,YKO_arg,YKO_lys,YKO_met,YKO_trp,YKO_16001701,YKO_cccp,...,PB_leu,PB_lys,PB_met,PB_trp,PB_ura,PB_16001701,PB_cccp,PB_cis,PB_fccp,PB_ypg
0,YMR035W,IMP2,Catalytic subunit of mitochondrial inner membr...,0.169486,0.135539,0.127188,0.17312,0.278047,0.180231,0.803324,...,0.472495,-0.146622,-0.041577,0.043524,-0.139166,0.131236,-0.28723,0.514325,-0.339143,-0.145031
1,YLR058C,SHM2,Cytosolic serine hydroxymethyltransferase; con...,4.015562,-0.095038,-0.047726,-0.038545,0.15672,2.075891,-0.2537,...,,,,,,,,,,
2,YGR061C,ADE6,Formylglycinamidine-ribonucleotide (FGAM)-synt...,3.783624,-0.654745,-0.441383,-0.518264,-0.337583,3.177147,-0.792467,...,,,,,,,,,,
3,YAR015W,ADE1,N-succinyl-5-aminoimidazole-4-carboxamide ribo...,3.66856,-0.716486,-0.412888,-0.956503,-0.242662,2.365885,-1.828343,...,,,,,,,,,,
4,YMR300C,ADE4,Phosphoribosylpyrophosphate amidotransferase (...,3.228467,-0.507558,-0.457965,-0.22772,-0.504614,3.160052,-0.611764,...,,,,,,2.231799,-0.560981,-0.947307,-0.902955,0.364159


In [12]:
# Drop the "_" annotations: keep only the text before the first underscore.
# The vectorized .str accessor leaves missing cells as NaN instead of raising
# AttributeError (as a Python-level x.split would), so such rows survive to
# the looks_like_orf validation cell and are reported there.
original_data['ORF'] = original_data['ORF'].str.split('_').str[0]

In [13]:
# Add a string-typed working copy of the ORF labels as a new 'orf' column
original_data = original_data.assign(orf=original_data['ORF'].astype(str))

In [14]:
# Eliminate all white spaces & capitalize.
# clean_orf is a project helper defined outside this notebook
# (presumably brought in by the %run of yp_utils.py in the first cell — confirm).
original_data['orf'] = clean_orf(original_data['orf'])

In [15]:
# Translate the cleaned labels to ORF names with the project helper translate_sc (to='orf').
# NOTE(review): how unrecognized labels are handled is not visible here — confirm against the helper.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [16]:
# Sanity check: list every row whose translated label does not look like an ORF
# (an empty frame means everything translated cleanly)
t = looks_like_orf(original_data['orf'])
unrecognized_rows = original_data.loc[~t]
print(unrecognized_rows)

Empty DataFrame
Columns: [ORF, gene, description, YKO_ade, YKO_arg, YKO_lys, YKO_met, YKO_trp, YKO_16001701, YKO_cccp, YKO_cis, YKO_fccp, YKO_ypg, BC_ade, BC_arg, BC_his, BC_leu, BC_lys, BC_met, BC_trp, BC_ura, BC_16001701, BC_cccp, BC_cis, BC_fccp, BC_ypg, PB_ade, PB_arg, PB_his, PB_leu, PB_lys, PB_met, PB_trp, PB_ura, PB_16001701, PB_cccp, PB_cis, PB_fccp, PB_ypg, orf]
Index: []

[0 rows x 40 columns]


In [17]:
# Keep only the measurement columns whose names start with 'YKO'
data_cols = [name for name in original_data.columns if name.startswith('YKO')]
data_cols

['YKO_ade',
 'YKO_arg',
 'YKO_lys',
 'YKO_met',
 'YKO_trp',
 'YKO_16001701',
 'YKO_cccp',
 'YKO_cis',
 'YKO_fccp',
 'YKO_ypg']

In [18]:
# Use the translated ORF as the row index (rebinding instead of mutating in place)
original_data = original_data.set_index('orf')

In [19]:
# Drop everything except the selected measurement columns
original_data = original_data.loc[:, data_cols].copy()

In [20]:
# Collapse rows that share the same ORF by averaging their values
original_data = original_data.groupby(level=0).mean()

In [21]:
# Table size after averaging rows with the same ORF
original_data.shape

(4753, 10)

In [35]:
# Switching sign to follow convention (sensitive strains = negative values)
# NOTE(review): this cell negates original_data in place, so it is not idempotent —
# running it a second time without re-running the cells above restores the original sign.
original_data = -original_data

In [36]:
# Preview the table after the sign switch
original_data.head()

Unnamed: 0_level_0,YKO_ade,YKO_arg,YKO_lys,YKO_met,YKO_trp,YKO_16001701,YKO_cccp,YKO_cis,YKO_fccp,YKO_ypg
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
YAL002W,-0.192195,-0.06528,0.298186,0.217791,-0.138197,-0.081127,0.239,-0.143296,0.039372,-0.217544
YAL004W,0.010746,0.074293,0.070414,0.220205,-0.057113,-0.003672,-0.184721,0.146464,-0.159992,-0.154972
YAL005C,0.008696,0.013079,-0.012893,0.02799,0.001865,-0.042005,-0.107713,-0.160484,-0.168482,-0.208016
YAL007C,0.005854,-0.098771,-0.105308,-0.042582,-0.033003,0.099439,-0.126739,-0.039834,-0.045409,-0.075457
YAL008W,-0.01065,0.026314,0.006661,-0.050448,0.018098,-0.022211,-0.230093,-0.09755,-0.144162,-0.177283


# Prepare the final dataset

In [37]:
# Work on a copy so that relabeling the columns below leaves original_data unchanged
data = original_data.copy()

In [38]:
# Dataset IDs for the ten YKO_* columns. They are applied to the data columns by position
# in a later cell, so this list must follow the column order of data_cols.
dataset_ids = [22009, 22010, 22011, 22012, 22013, 22014, 22015, 22016, 22017, 22018]
# Reorder the datasets table to match; an ID absent from the table would yield a row of NaN.
datasets = datasets.reindex(index=dataset_ids)

In [39]:
# Label each column with a (dataset_id, 'value') pair, following the order of the datasets table
column_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(column_labels, names=['dataset_id','data_type'])

In [40]:
# Preview the table with its (dataset_id, data_type) column labels
data.head()

dataset_id,22009,22010,22011,22012,22013,22014,22015,22016,22017,22018
data_type,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
YAL002W,-0.192195,-0.06528,0.298186,0.217791,-0.138197,-0.081127,0.239,-0.143296,0.039372,-0.217544
YAL004W,0.010746,0.074293,0.070414,0.220205,-0.057113,-0.003672,-0.184721,0.146464,-0.159992,-0.154972
YAL005C,0.008696,0.013079,-0.012893,0.02799,0.001865,-0.042005,-0.107713,-0.160484,-0.168482,-0.208016
YAL007C,0.005854,-0.098771,-0.105308,-0.042582,-0.033003,0.099439,-0.126739,-0.039834,-0.045409,-0.075457
YAL008W,-0.01065,0.026314,0.006661,-0.050448,0.018098,-0.022211,-0.230093,-0.09755,-0.144162,-0.177283


## Subset to the genes currently in SGD

In [41]:
# Load the reference gene table and key it by systematic name, keeping 'id' as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs absent from the table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 13


In [42]:
# Keep only the ORFs that were matched to a gene id, working on an explicit copy
has_gene_id = pd.notnull(gene_ids)
data = data.loc[has_gene_id].copy()
# Store the ids as integers and index the rows by (gene_id, orf)
data['gene_id'] = gene_ids[has_gene_id].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,22009,22010,22011,22012,22013,22014,22015,22016,22017,22018
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2,YAL002W,-0.192195,-0.06528,0.298186,0.217791,-0.138197,-0.081127,0.239,-0.143296,0.039372,-0.217544
1863,YAL004W,0.010746,0.074293,0.070414,0.220205,-0.057113,-0.003672,-0.184721,0.146464,-0.159992,-0.154972
4,YAL005C,0.008696,0.013079,-0.012893,0.02799,0.001865,-0.042005,-0.107713,-0.160484,-0.168482,-0.208016
5,YAL007C,0.005854,-0.098771,-0.105308,-0.042582,-0.033003,0.099439,-0.126739,-0.039834,-0.045409,-0.075457
6,YAL008W,-0.01065,0.026314,0.006661,-0.050448,0.018098,-0.022211,-0.230093,-0.09755,-0.144162,-0.177283


# Normalize

In [43]:
# Normalize the scores with the project helper normalize_phenotypic_scores (defined outside this notebook).
# NOTE(review): the meaning of has_tested=True is not visible here — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [44]:
# Assign proper column names: one (dataset_id, 'valuez') pair per normalized column,
# following the order of the datasets table
normalized_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(normalized_labels, names=['dataset_id','data_type'])

In [45]:
# Blank out normalized scores wherever the raw value is missing.
# The raw and normalized frames carry different column labels
# ((dataset_id, 'value') vs. (dataset_id, 'valuez')), and assignment through a
# boolean DataFrame aligns the mask on labels. Passing data.isnull() directly
# therefore matches no columns and silently leaves data_norm untouched.
# Relabel the mask to the normalized columns first; rows still align by index.
missing = data.isnull()
missing.columns = data_norm.columns  # both column sets follow the datasets table order
data_norm[missing] = np.nan

# Combine raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,22009,22010,22011,22012,22013,22014,22015,22016,22017,22018,22009,22010,22011,22012,22013,22014,22015,22016,22017,22018
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,YAL002W,-0.192195,-0.06528,0.298186,0.217791,-0.138197,-0.081127,0.239,-0.143296,0.039372,-0.217544,-0.69685,-0.306688,1.255998,1.168819,-0.417378,-0.34396,0.500325,-0.539588,0.17411,-0.50095
1863,YAL004W,0.010746,0.074293,0.070414,0.220205,-0.057113,-0.003672,-0.184721,0.146464,-0.159992,-0.154972,0.158043,0.236342,0.372671,1.179828,-0.130565,0.08179,-0.34238,0.577803,-0.37386,-0.347379
4,YAL005C,0.008696,0.013079,-0.012893,0.02799,0.001865,-0.042005,-0.107713,-0.160484,-0.168482,-0.208016,0.149407,-0.001823,0.049596,0.303443,0.078054,-0.128913,-0.189226,-0.605869,-0.397194,-0.477565
5,YAL007C,0.005854,-0.098771,-0.105308,-0.042582,-0.033003,0.099439,-0.126739,-0.039834,-0.045409,-0.075457,0.137436,-0.43699,-0.308801,-0.01832,-0.045283,0.648567,-0.227064,-0.140609,-0.058918,-0.152223
6,YAL008W,-0.01065,0.026314,0.006661,-0.050448,0.018098,-0.022211,-0.230093,-0.09755,-0.144162,-0.177283,0.067915,0.049672,0.125427,-0.054187,0.135471,-0.020114,-0.432616,-0.36318,-0.330348,-0.402137


# Print out

In [46]:
# Write one tab-separated table per data type, with dataset names as column headers
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    # Keep only the ORF as the row label in the exported file
    table = table.droplevel('gene_id', axis=0)
    table.to_csv('%s_%s.txt' % (paper_name, data_type), sep='\t')

# Save to DB

In [47]:
from IO.save_data_to_db3 import *

In [48]:
# Write the combined raw and normalized tables to the database under this paper's PMID.
# NOTE: per the log printed below, this first deletes all existing datasets for the PMID
# and then inserts the new data, so re-running replaces what is stored.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/10 [00:00<?, ?it/s]

Deleting all datasets for PMID 28592509...
Inserting the new data...


100%|██████████| 10/10 [00:52<00:00,  5.20s/it]

Updating the data_modified_on field...



