In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to locate the datasets list file and passed to save_data_to_db
paper_pmid = 23478965
# Short paper label; used as the prefix of the exported .txt files
paper_name = 'richie_hoepfner_2013' 

In [3]:
# Two-column, tab-separated, header-less list of the datasets curated for this paper
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset ID so it can be reordered by ID later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [29]:
# Raw score files, read from large_files/raw_data/ in this order (HOP first, then HIP);
# the order matters because the results are later combined by list position
files = ['HOP-exp-scores-annotation.txt','HIP-exp-scores-annotation.txt']

In [30]:
# Load each raw score file (long format) and reshape it into a strain x condition matrix
# indexed by ORF. One processed table per input file is collected in original_data_list.
original_data_list = []
for f in files:
    original_data = pd.read_csv('large_files/raw_data/' + f, sep='\t')
    print('Original data dimensions: %d x %d' % (original_data.shape))

    # One row per systematic name, one column per compound/concentration;
    # pivot_table averages repeated (name, condition) pairs (default aggfunc)
    original_data = pd.pivot_table(original_data, index='Systematic Names', columns='Compound_conc', values='Score', fill_value=np.nan)
    print(original_data.shape)

    # Derive a cleaned ORF identifier for every row
    # (clean_orf / translate_sc are shared helpers defined outside this notebook)
    original_data['orf'] = original_data.index.values
    original_data['orf'] = clean_orf(original_data['orf'])
    original_data['orf'] = translate_sc(original_data['orf'], to='orf')
    # Manual override: keep YPR099C under its own name (presumably mistranslated by
    # translate_sc -- TODO confirm). Only applied when the row exists, because assigning
    # through .loc with an unknown row label would silently append an all-NaN row.
    if 'YPR099C' in original_data.index:
        original_data.loc['YPR099C','orf'] = 'YPR099C'

    # Show, then drop, the rows whose identifier does not look like an ORF
    t = looks_like_orf(original_data['orf'])
    print(original_data.loc[~t,])
    original_data = original_data.loc[t,:].set_index('orf')

    # Average rows that ended up with the same ORF
    original_data = original_data.groupby(original_data.index).mean()
    print(original_data.shape)

    original_data_list.append(original_data)

Original data dimensions: 51262 x 28
(4523, 11)
Compound_conc  Chlorimuron_10  Compound1_3  Compound1_4  Compound2_6  \
index_input                                                            
YAR040C              0.133576    -0.016191     0.138273     0.062301   
YAR043C              0.042087     0.229566    -1.105603    -0.143871   
YCL006C             -0.023174     0.749857     0.010612     0.235295   
YCL074W             -0.050601    -0.012967     0.053645     0.043111   
YCL075W              0.030026    -0.002683     0.016945     0.027549   
YER108C             -1.123969    -0.001264    -0.065715    -0.010246   
YER109C             -0.034964    -0.151764    -0.050662    -0.320496   
YFL056C              0.298193     0.009800     1.304053     0.888849   
YIL170W             -2.080307    -6.010080    -5.886944    -9.026390   
YOR031W              0.630187    -0.006942    -0.202416     0.637861   

Compound_conc  Compound3_1.5  Compound4_10  Compound5_8  Compound6_0.5  \
index_input  

In [31]:
# Combine the two processed tables (same order as `files`: HOP, then HIP) on their ORF index.
# Outer join keeps ORFs present in only one table; overlapping column names get _1 / _2 suffixes.
hop_scores = original_data_list[0]
hip_scores = original_data_list[1]
original_data = hop_scores.join(
    hip_scores,
    how='outer',
    lsuffix='_1',
    rsuffix='_2',
)

In [32]:
# Preview the joined table
original_data.head()

Compound_conc,Chlorimuron_10_1,Compound1_3_1,Compound1_4_1,Compound2_6_1,Compound3_1.5_1,Compound4_10_1,Compound5_8_1,Compound6_0.5_1,Compound7_4_1,Compound8_4_1,...,Compound1_3_2,Compound1_4_2,Compound2_6_2,Compound3_1.5_2,Compound4_10_2,Compound5_8_2,Compound6_0.5_2,Compound7_4_2,Compound8_4_2,Sulfometuron_30_2
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YAL001C,0.038991,-0.056542,-0.294365,-0.104996,-1.559175,-1.422066,-0.524621,0.104708,-0.091587,-1.362973,...,0.989844,-0.000673,0.475843,0.109321,0.016547,0.53654,-0.02103,-0.139963,-0.010535,0.397133
YAL002W,0.806608,0.820446,0.010005,0.119045,-0.193387,0.031665,1.723395,-0.296865,-0.061796,1.077705,...,-3.417776,-2.1648,-2.048566,0.017055,-0.020341,-2.35408,-0.007522,0.055593,0.068183,-1.123647
YAL003W,,,,,,,,,,,...,0.08097,0.040677,-0.255405,0.534364,0.00045,0.325886,0.099893,0.071327,0.089027,0.092639
YAL004W,-0.97644,-1.507391,-0.867939,-0.288604,-0.21431,-0.041421,-0.971548,-0.132192,-0.216003,-0.03166,...,-2.620577,-2.515891,-2.120231,-3.099178,-3.512203,-2.966133,-1.741943,-0.393509,-1.741337,-5.223883
YAL005C,0.11598,0.005257,-0.052888,-0.169719,-0.320672,-0.152344,-0.554363,0.615376,-0.005581,-0.620674,...,0.257501,-1.169363,7.246556,2.860275,9.259456,6.642405,1.020024,5.084951,3.035065,4.89057


# Prepare the final dataset

In [33]:
# Work on a copy so the joined table in original_data stays untouched by the steps below
data = original_data.copy()

In [34]:
# Dataset IDs listed in the column order of the joined table (HOP columns first, then HIP).
# In each group the highest ID comes first, followed by the remaining ten in ascending order.
dataset_ids_hop = [16018, *range(16008, 16018)]   # 16018, 16008, ..., 16017
dataset_ids_hip = [16029, *range(16019, 16029)]   # 16029, 16019, ..., 16028
dataset_ids = [*dataset_ids_hop, *dataset_ids_hip]

# Reorder the datasets table to follow dataset_ids, so that its rows line up
# position-by-position with the data columns.
# NOTE: reindex does not fail on an ID that is absent from the table; it inserts a NaN row instead.
datasets = datasets.reindex(index=dataset_ids)

In [35]:
# Relabel the data columns, by position, with (dataset_id, 'value') pairs
value_labels = ['value'] * datasets.shape[0]
data.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, value_labels],
    names=['dataset_id', 'data_type'],
)

In [36]:
# Preview the table after relabelling the columns
data.head()

dataset_id,16018,16008,16009,16010,16011,16012,16013,16014,16015,16016,...,16019,16020,16021,16022,16023,16024,16025,16026,16027,16028
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL001C,0.038991,-0.056542,-0.294365,-0.104996,-1.559175,-1.422066,-0.524621,0.104708,-0.091587,-1.362973,...,0.989844,-0.000673,0.475843,0.109321,0.016547,0.53654,-0.02103,-0.139963,-0.010535,0.397133
YAL002W,0.806608,0.820446,0.010005,0.119045,-0.193387,0.031665,1.723395,-0.296865,-0.061796,1.077705,...,-3.417776,-2.1648,-2.048566,0.017055,-0.020341,-2.35408,-0.007522,0.055593,0.068183,-1.123647
YAL003W,,,,,,,,,,,...,0.08097,0.040677,-0.255405,0.534364,0.00045,0.325886,0.099893,0.071327,0.089027,0.092639
YAL004W,-0.97644,-1.507391,-0.867939,-0.288604,-0.21431,-0.041421,-0.971548,-0.132192,-0.216003,-0.03166,...,-2.620577,-2.515891,-2.120231,-3.099178,-3.512203,-2.966133,-1.741943,-0.393509,-1.741337,-5.223883
YAL005C,0.11598,0.005257,-0.052888,-0.169719,-0.320672,-0.152344,-0.554363,0.615376,-0.005581,-0.620674,...,0.257501,-1.169363,7.246556,2.860275,9.259456,6.642405,1.020024,5.084951,3.035065,4.89057


## Subset to the genes currently in SGD

In [37]:
# Gene table keyed by systematic name, with the numeric 'id' kept as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the gene ID of every ORF in the data; ORFs not found in the table come back as NaN
genes_for_data = genes.reindex(index=data.index.values)
gene_ids = genes_for_data['id'].values

num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [38]:
# Attach the gene IDs looked up for each ORF
data['gene_id'] = gene_ids

# Keep only the ORFs that have a gene ID. Take an explicit copy so the column
# assignment below acts on an independent frame rather than on a filtered slice
# (which triggers SettingWithCopyWarning whenever rows are actually dropped).
data = data.loc[data['gene_id'].notnull()].copy()

# With the NaNs gone, the IDs can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)

# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16018,16008,16009,16010,16011,16012,16013,16014,16015,16016,...,16019,16020,16021,16022,16023,16024,16025,16026,16027,16028
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,0.038991,-0.056542,-0.294365,-0.104996,-1.559175,-1.422066,-0.524621,0.104708,-0.091587,-1.362973,...,0.989844,-0.000673,0.475843,0.109321,0.016547,0.53654,-0.02103,-0.139963,-0.010535,0.397133
2,YAL002W,0.806608,0.820446,0.010005,0.119045,-0.193387,0.031665,1.723395,-0.296865,-0.061796,1.077705,...,-3.417776,-2.1648,-2.048566,0.017055,-0.020341,-2.35408,-0.007522,0.055593,0.068183,-1.123647
3,YAL003W,,,,,,,,,,,...,0.08097,0.040677,-0.255405,0.534364,0.00045,0.325886,0.099893,0.071327,0.089027,0.092639
1863,YAL004W,-0.97644,-1.507391,-0.867939,-0.288604,-0.21431,-0.041421,-0.971548,-0.132192,-0.216003,-0.03166,...,-2.620577,-2.515891,-2.120231,-3.099178,-3.512203,-2.966133,-1.741943,-0.393509,-1.741337,-5.223883
4,YAL005C,0.11598,0.005257,-0.052888,-0.169719,-0.320672,-0.152344,-0.554363,0.615376,-0.005581,-0.620674,...,0.257501,-1.169363,7.246556,2.860275,9.259456,6.642405,1.020024,5.084951,3.035065,4.89057


# Normalize

In [39]:
# Normalize the scores with the shared helper (defined outside this notebook).
# NOTE(review): the meaning of has_tested=True is not visible here -- confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [40]:
# Label the normalized columns, by position, with (dataset_id, 'valuez') pairs
valuez_labels = ['valuez'] * datasets.shape[0]
data_norm.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, valuez_labels],
    names=['dataset_id', 'data_type'],
)

In [41]:
# Blank out the normalized score wherever the raw score is missing.
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'), and a
# boolean DataFrame mask is aligned on its labels before being applied, so passing
# `data.isnull()` directly matches no column and masks nothing. Relabel the mask with
# the columns of `data_norm` (assumed to be in the same dataset order as `data`, as the
# positional column assignment on data_norm already requires) so that it lines up;
# rows are still matched by their index labels.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16018,16008,16009,16010,16011,16012,16013,16014,16015,16016,...,16019,16020,16021,16022,16023,16024,16025,16026,16027,16028
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,0.038991,-0.056542,-0.294365,-0.104996,-1.559175,-1.422066,-0.524621,0.104708,-0.091587,-1.362973,...,0.857126,0.03637,0.267268,0.132893,-0.056603,0.149708,0.072561,-0.007153,0.033299,0.175264
2,YAL002W,0.806608,0.820446,0.010005,0.119045,-0.193387,0.031665,1.723395,-0.296865,-0.061796,1.077705,...,-2.7324,-1.777897,-1.24103,0.088661,-0.072249,-1.188308,0.081683,0.097933,0.077784,-0.538998
3,YAL003W,,,,,,,,,,,...,0.116948,0.071036,-0.169642,0.33666,-0.063431,0.0522,0.154217,0.106388,0.089563,0.032253
1863,YAL004W,-0.97644,-1.507391,-0.867939,-0.288604,-0.21431,-0.041421,-0.971548,-0.132192,-0.216003,-0.03166,...,-2.083168,-2.07223,-1.283849,-1.40527,-1.553343,-1.471616,-1.089515,-0.1434,-0.944796,-2.464748
4,YAL005C,0.11598,0.005257,-0.052888,-0.169719,-0.320672,-0.152344,-0.554363,0.615376,-0.005581,-0.620674,...,0.260713,-0.943385,4.31267,1.451708,3.863832,2.976003,0.775551,2.800546,1.754401,2.285687


# Print out

In [42]:
# Write one tab-separated file per data type, with dataset names as column headers
# and the ORF as the only row label
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [43]:
from IO.save_data_to_db3 import *

In [44]:
# Store the raw + normalized table in the database under this paper's PMID.
# Per the recorded output, this first deletes all existing datasets for the PMID and then
# inserts the new data, so re-running the cell replaces what was stored before.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 23478965...


  0%|          | 0/22 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 22/22 [03:28<00:00,  9.48s/it]

Updating the data_modified_on field...



