In [1]:
# Execute the shared helper script inside this notebook's namespace.
# NOTE(review): no explicit imports appear in this notebook before pd, np, clean_genename,
# translate_sc, looks_like_orf, path_to_genes and normalize_phenotypic_scores are used,
# so they are presumably all defined by this script — confirm.
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed by this notebook.
# The PMID is used to build the dataset-list filename and when saving to the DB.
paper_pmid = 22615293
# Prefix of the tab-delimited files written in the "Print out" section.
paper_name = 'hoepfner_parker_2012'

In [3]:
# Headerless, tab-delimited list of the datasets belonging to this paper:
# first column is the dataset id, second column is the dataset name.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by dataset id so it can be reindexed by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Compound labels; each one selects a raw file named
# 'raw_data/HIP-HOP Scores <label>.txt' and prefixes the resulting column names.
compounds = [
    'Cpd1',
    'Cpd2',
    'Cpd3',
    'Cpd4',
    'Vori',
]

In [15]:
# Build one ORF x concentration score matrix per compound and experiment type.
# original_data_list[<experiment type>] collects the matrices in the order of `compounds`.
original_data_list = {'HOP': [], 'HIP': []}
for compound in compounds:
    # Long-format table; the columns used below are EXPERIMENT_TYPE, GENE_NAME,
    # COMPOUND_CONCENTRATION and SCORE.
    scores_long = pd.read_csv('raw_data/HIP-HOP Scores ' + compound + '.txt', sep='\t')

    for experiment_type in ['HOP', 'HIP']:
        scores_subset = scores_long.loc[scores_long['EXPERIMENT_TYPE'] == experiment_type, :].copy()

        # One row per gene, one column per concentration
        # (pivot_table averages repeated gene/concentration pairs by default).
        scores_wide = pd.pivot_table(scores_subset, index='GENE_NAME',
                                     columns='COMPOUND_CONCENTRATION', values='SCORE')

        scores_wide = scores_wide.reset_index()
        scores_wide['GENE_NAME'] = clean_genename(scores_wide['GENE_NAME'])
        scores_wide['orf'] = translate_sc(scores_wide['GENE_NAME'], to='orf')

        # Make sure everything translated ok: keep only rows whose translated name
        # passes looks_like_orf (inspect scores_wide.loc[~is_orf, :] to see the rejects).
        is_orf = looks_like_orf(scores_wide['orf'])
        scores_wide = scores_wide.loc[is_orf, :]

        # Drop the text column explicitly before averaging. Older pandas silently
        # discarded it inside groupby().mean(); pandas >= 2.0 raises a TypeError instead.
        # Chaining (rather than inplace=True) also avoids modifying a .loc slice in place.
        scores_wide = scores_wide.drop(columns='GENE_NAME').set_index('orf')

        # Average the rows that ended up with the same ORF.
        scores_wide = scores_wide.groupby(scores_wide.index).mean()

        # Rename columns to '<compound>_<dose>': the concentration labels are split
        # on '_' and their second piece is kept as the dose.
        cols = ['%s_%s' % (compound, label.split('_')[1]) for label in scores_wide.columns.values]
        scores_wide.columns = cols

        print(scores_wide.shape)
        print(cols)

        original_data_list[experiment_type].append(scores_wide)

(4496, 1)
['Cpd1_12']
(5747, 2)
['Cpd1_12', 'Cpd1_22.02']
(4483, 6)
['Cpd2_0.1', 'Cpd2_0.25', 'Cpd2_0.5', 'Cpd2_0.75', 'Cpd2_1', 'Cpd2_2']
(5711, 6)
['Cpd2_0.1', 'Cpd2_0.25', 'Cpd2_0.5', 'Cpd2_0.75', 'Cpd2_1', 'Cpd2_2']
(4477, 3)
['Cpd3_0.1', 'Cpd3_0.25', 'Cpd3_0.5']
(5701, 3)
['Cpd3_0.1', 'Cpd3_0.25', 'Cpd3_0.5']
(4477, 1)
['Cpd4_200']
(5701, 1)
['Cpd4_200']
(4489, 5)
['Vori_0.05', 'Vori_0.1', 'Vori_0.25', 'Vori_0.5', 'Vori_0.53']
(5724, 5)
['Vori_0.05', 'Vori_0.1', 'Vori_0.25', 'Vori_0.5', 'Vori_0.53']


In [16]:
# Put the per-compound matrices side by side (aligned on ORF):
# original_data1 holds the HOP scores, original_data2 the HIP scores.
hop_matrices = original_data_list['HOP']
hip_matrices = original_data_list['HIP']
original_data1 = pd.concat(hop_matrices, axis='columns')
original_data2 = pd.concat(hip_matrices, axis='columns')

In [17]:
# Headerless sheet, so the columns are addressed by position below:
# 0 = compound label, 1 = experiment type, 2 = dose, 3 = dataset id.
doses_path = 'raw_data/doses_datasetids.xlsx'
doses = pd.read_excel(
    doses_path,
    sheet_name='Sheet1',
    header=None,
)

In [18]:
def _dose_to_label(dose):
    """Render a dose as fixed-point text without trailing zeros or a dangling dot (12.0 -> '12')."""
    fixed_point = '%f' % dose
    return fixed_point.rstrip('0').rstrip('.')


# Column 4: textual form of the dose in column 2, used to build the column labels.
doses[4] = doses[2].apply(_dose_to_label)

In [20]:
# Column 5: '<compound>_<dose>' label, the same format as the score matrix column names.
compound_labels = doses[0]
dose_labels = doses[4].astype(str)
doses[5] = compound_labels + '_' + dose_labels

In [23]:
# Independent copies of the dose rows for each experiment type (column 1):
# doses1 = HOP, doses2 = HIP.
doses1, doses2 = (
    doses.loc[doses[1] == experiment_type, :].copy()
    for experiment_type in ('HOP', 'HIP')
)

In [28]:
# Index both dose tables by the '<compound>_<dose>' label stored in column 5.
doses1 = doses1.set_index(5)
doses2 = doses2.set_index(5)

In [30]:
# Put the dose rows in exactly the same order as the score matrix columns.
hop_column_labels = original_data1.columns.values
hip_column_labels = original_data2.columns.values
doses1 = doses1.reindex(index=hop_column_labels)
doses2 = doses2.reindex(index=hip_column_labels)

In [34]:
# Replace the '<compound>_<dose>' column labels with the dataset ids held in column 3
# of the dose tables (which were reindexed to the column order in the previous step).
# A label that is absent from the dose sheet comes out of that reindex as an all-NaN
# row, which would silently become a NaN dataset id here, so fail loudly instead.
for experiment_type, score_matrix, dose_table in (('HOP', original_data1, doses1),
                                                  ('HIP', original_data2, doses2)):
    unmatched = score_matrix.columns[dose_table[3].isnull().values]
    if len(unmatched) > 0:
        raise ValueError('No dataset id found for %s columns: %s'
                         % (experiment_type, list(unmatched)))

original_data1.columns = doses1[3].values
original_data2.columns = doses2[3].values

In [35]:
# Combine HOP and HIP scores on ORF, keeping ORFs present in either matrix.
# The suffixes only apply if the two matrices share a column label.
original_data = original_data1.join(
    original_data2,
    how='outer',
    lsuffix='_1',
    rsuffix='_2',
)

In [37]:
# Sanity check: the columns should now be labelled with dataset ids.
original_data.columns

Int64Index([16216, 16300, 16301, 16217, 16302, 16303, 16304, 16305, 16218,
            16306, 16219, 16220, 16307, 16308, 16309, 16310, 16221, 16288,
            16289, 16290, 16222, 16291, 16292, 16293, 16294, 16223, 16295,
            16224, 16225, 16296, 16297, 16298, 16299],
           dtype='int64')

In [44]:
# Label the row index; the name 'orf' is relied upon by later set_index/droplevel calls.
original_data.index.rename('orf', inplace=True)

# Prepare the final dataset

In [45]:
# Work on an independent copy so original_data keeps its flat dataset-id columns.
data = original_data.copy(deep=True)

In [46]:
# Order the dataset list exactly like the data columns so that names and ids line up.
dataset_ids = original_data.columns.to_numpy()
datasets = datasets.reindex(index=dataset_ids)

In [47]:
# Two-level column index: (dataset_id, data_type), with data_type 'value' for the raw scores.
value_labels = ['value'] * datasets.shape[0]
column_tuples = list(zip(datasets.index.values, value_labels))
data.columns = pd.MultiIndex.from_tuples(column_tuples, names=['dataset_id','data_type'])

In [48]:
# Preview the first rows to check the (dataset_id, data_type) column layout.
data.head()

dataset_id,16216,16300,16301,16217,16302,16303,16304,16305,16218,16306,...,16293,16294,16223,16295,16224,16225,16296,16297,16298,16299
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL001C,-0.058527,-0.141784,-0.035717,-0.655453,0.21784,-2.706992,-1.636977,-3.053695,-3.998915,-2.169862,...,-0.015957,0.279931,-1.838645,-0.041962,-1.517082,1.056342,-0.039514,0.069853,-0.066318,-0.033435
YAL002W,3.087028,-0.097408,2.19268,0.999218,0.049157,-0.25828,0.890135,-0.081305,0.754535,-0.005043,...,-2.304102,0.909857,-0.783131,-2.053965,-1.495601,-0.079287,-0.029005,-1.314205,-0.090096,-1.465725
YAL003W,,,,,,,,,,,...,0.523414,0.743197,-0.643358,0.09743,-1.38335,-0.716209,-0.035322,0.020131,0.024539,-0.084485
YAL004W,-0.052807,-0.160334,-0.039348,-0.502587,0.515579,-1.150181,-1.204039,-0.61704,-0.033897,-1.395892,...,-3.982115,-0.021937,-3.412733,-4.881368,-0.651329,1.015251,-0.066219,-0.382607,-0.001485,-1.094197
YAL005C,-0.006626,0.019759,0.144404,0.392702,1.057087,0.088645,-0.21034,-0.000973,-0.004581,0.004985,...,6.215828,1.396069,4.90809,6.443834,0.461616,5.834788,-0.5772,-0.761591,4.099758,0.869126


## Subset to the genes currently in SGD

In [49]:
# Gene table keyed by systematic name, with the original 'id' kept as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the id of every ORF in the data; ORFs absent from the table come back as NaN.
orfs_in_data = data.index.values
gene_ids = genes.reindex(index=orfs_in_data)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 7


In [50]:
# Attach the gene id looked up for each ORF (NaN where the ORF was not found).
data['gene_id'] = gene_ids

# Keep only the ORFs that were found. The explicit .copy() makes the filtered frame
# independent of the original, so the column assignment below does not write to a
# slice (which triggers pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)

# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16216,16300,16301,16217,16302,16303,16304,16305,16218,16306,...,16293,16294,16223,16295,16224,16225,16296,16297,16298,16299
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,-0.058527,-0.141784,-0.035717,-0.655453,0.21784,-2.706992,-1.636977,-3.053695,-3.998915,-2.169862,...,-0.015957,0.279931,-1.838645,-0.041962,-1.517082,1.056342,-0.039514,0.069853,-0.066318,-0.033435
2,YAL002W,3.087028,-0.097408,2.19268,0.999218,0.049157,-0.25828,0.890135,-0.081305,0.754535,-0.005043,...,-2.304102,0.909857,-0.783131,-2.053965,-1.495601,-0.079287,-0.029005,-1.314205,-0.090096,-1.465725
3,YAL003W,,,,,,,,,,,...,0.523414,0.743197,-0.643358,0.09743,-1.38335,-0.716209,-0.035322,0.020131,0.024539,-0.084485
1863,YAL004W,-0.052807,-0.160334,-0.039348,-0.502587,0.515579,-1.150181,-1.204039,-0.61704,-0.033897,-1.395892,...,-3.982115,-0.021937,-3.412733,-4.881368,-0.651329,1.015251,-0.066219,-0.382607,-0.001485,-1.094197
4,YAL005C,-0.006626,0.019759,0.144404,0.392702,1.057087,0.088645,-0.21034,-0.000973,-0.004581,0.004985,...,6.215828,1.396069,4.90809,6.443834,0.461616,5.834788,-0.5772,-0.761591,4.099758,0.869126


# Normalize

In [51]:
# Normalize the scores with a helper that is not defined in this notebook.
# NOTE(review): the effect of has_tested=True is not visible here — confirm in the helper's source.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [52]:
# Assign proper column names
lst = [datasets.index.values, ['valuez']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [53]:
# Blank out the normalized score wherever the raw score is missing.
# Assignment through a boolean DataFrame is aligned on row AND column labels. The raw
# columns are labelled (dataset_id, 'value') while the normalized ones are
# (dataset_id, 'valuez'), so an unrelabelled mask matches no column and nothing is blanked.
# Both frames list the datasets in the same order, so the mask is relabelled
# positionally; this raises if the number of columns differs.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Raw ('value') and normalized ('valuez') columns side by side, joined on (gene_id, orf).
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16216,16300,16301,16217,16302,16303,16304,16305,16218,16306,...,16293,16294,16223,16295,16224,16225,16296,16297,16298,16299
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,-0.058527,-0.141784,-0.035717,-0.655453,0.21784,-2.706992,-1.636977,-3.053695,-3.998915,-2.169862,...,-0.013818,0.409101,-1.269656,0.048974,-1.985288,0.81461,-0.072332,0.198471,-0.069725,-0.128398
2,YAL002W,3.087028,-0.097408,2.19268,0.999218,0.049157,-0.25828,0.890135,-0.081305,0.754535,-0.005043,...,-1.189126,1.010352,-0.494409,-0.902822,-1.958202,-0.058169,-0.060722,-1.042948,-0.083187,-2.103482
3,YAL003W,,,,,,,,,,,...,0.263231,0.851279,-0.39175,0.114916,-1.81666,-0.547671,-0.067701,0.153874,-0.018285,-0.198795
1863,YAL004W,-0.052807,-0.160334,-0.039348,-0.502587,0.515579,-1.150181,-1.204039,-0.61704,-0.033897,-1.395892,...,-2.051039,0.120974,-2.42578,-2.240352,-0.893628,0.78303,-0.101837,-0.207359,-0.033019,-1.591157
4,YAL005C,-0.006626,0.019759,0.144404,0.392702,1.057087,0.088645,-0.21034,-0.000973,-0.004581,0.004985,...,3.187145,1.474432,3.685638,3.117142,0.509727,4.487048,-0.666374,-0.547285,2.288941,1.116207


# Print out

In [54]:
# Write one tab-delimited file per data type, with dataset names as column headers
# and the ORF as the only row label.
for data_type in ('value', 'valuez'):
    export = data_all.xs(data_type, axis=1, level='data_type').copy()
    export.columns = datasets['name'].values
    export = export.droplevel('gene_id', axis=0)
    output_path = paper_name + '_' + data_type + '.txt'
    export.to_csv(output_path, sep='\t')

# Save to DB

In [55]:
# NOTE(review): wildcard import — presumably this is what provides save_data_to_db
# used in the next cell; confirm, and consider importing that name explicitly.
from IO.save_data_to_db3 import *

In [56]:
# Store the combined raw + normalized table under this paper's PMID.
# Per the log printed below, the existing datasets for the PMID are deleted
# before the new data are inserted.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 22615293...


  0%|          | 0/33 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 33/33 [04:50<00:00,  8.80s/it]

Updating the data_modified_on field...



