In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to build the datasets-list filename and
# passed to save_data_to_db when the results are stored.
paper_pmid = 24040173
# Short paper label; used as the filename prefix of the exported value/valuez tables.
paper_name = 'novo_gonzalez_2013' 

In [3]:
# Read the headerless, tab-separated list of datasets for this paper:
# first column is the dataset id, second column is the dataset name.
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by dataset id so it can be reindexed by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data (hom)

In [32]:
# HOP fold-change workbooks (paths are relative to raw_data/), in loading order.
# The following cells label the four resulting columns data1..data4 and compute
# data1 - data2 and data3 - data4, so entries 1-2 (Phase I) and 3-4 (Phase II)
# are treated as pairs.
files = ['Only original foldchanges/Direct comparison/Phase I/HOP_t0vsYPD10 (dir) PhI.xlsx',
         'Only original foldchanges/Direct comparison/Phase I/HOP_t0vsMS10 (dir) PhI.xlsx',
         'Only original foldchanges/Direct comparison/Phase II/HOP_t0vsYPD10 (dir) PhII.xlsx',
         'Only original foldchanges/Direct comparison/Phase II/HOPt0vsHOP10 (dir) PhII.xlsx']

# Worksheet to read from each workbook; sheets[i] belongs to files[i].
sheets = ['HOPt0vsYPD10_t0','HOPt0vsMS10_t0','HOPt0vsYPD10_t0','HOPt0vsHOP10_out0']

In [33]:
# Load every workbook listed in `files`, keep the strains flagged as
# non-essential, map strain labels to ORFs and reduce each table to a single
# 'data' column holding the mean Log2Ratio per ORF.
original_data_list = []
for ixf, f in enumerate(files):
    original_data = pd.read_excel('raw_data/' + f, sheet_name=sheets[ixf])
    print('Original data dimensions: %d x %d' % (original_data.shape))

    # Keep only rows whose essential_gene flag is 'no'. The explicit copy makes
    # the column assignments below operate on an independent frame rather than
    # on a filtered slice (which triggers pandas' SettingWithCopyWarning).
    original_data = original_data.loc[original_data['essential_gene']=='no',:].copy()

    # The ORF is the part of the strain label before the first ':'
    # (e.g. 'YLL017W::chr12_1' -> 'YLL017W').
    original_data['orf'] = original_data['strain'].apply(lambda x: x.split(':')[0])
    original_data['orf'] = clean_orf(original_data['orf'])
    original_data['orf'] = translate_sc(original_data['orf'], to='orf')
    # Make sure everything translated ok: show the rows that do not look like
    # an ORF, then drop them.
    t = looks_like_orf(original_data['orf'])
    print(original_data.loc[~t,])
    original_data = original_data.loc[t,:].copy()

    original_data['data'] = original_data['Log2Ratio']
    original_data = original_data.set_index('orf')[['data']]
    # ORFs that occur more than once are collapsed to their mean value.
    original_data = original_data.groupby(original_data.index).mean()
    print(original_data.shape)

    original_data_list.append(original_data)

Original data dimensions: 6431 x 9
                               strain  Log2Ratio  p-value        gene  \
index_input                                                             
5732                 YLL017W::chr12_1  -0.660960   0.85492    YLL017W   
5742         YCL006C::chr3_1/chr00_12  -0.662117   0.90562  YCL006C_d   
5878                 YOR031W::chr15_1  -0.702252   0.98758       CRS5   
6084                  YAR037W::chr1_1  -0.781383   0.91388  YAR037W_d   
6103                  YFL056C::chr6_1  -0.789630   0.88873       AAD6   
6153                  YIL167W::chr9_2  -0.810263   0.89513       SDL1   
6344                  YAR043C::chr1_1  -0.988869   0.95602  YAR043C_d   

            essential_gene   feature_type  \
index_input                                 
5732                    no            NaN   
5742                    no   ORF::Deleted   
5878                    no  ORF::Verified   
6084                    no            NaN   
6103                    no  ORF::Veri

In [34]:
# Place the per-file tables side by side, one column per file, aligned on ORF.
original_data1 = pd.concat(original_data_list, axis='columns')

In [35]:
# Name the row index and give the four per-file columns positional labels
# (same order as the `files` list).
original_data1.index.name = 'orf'
original_data1.columns = ['data1', 'data2', 'data3', 'data4']

In [36]:
# Within each pair of files, subtract the second column from the first
# (data1 - data2, data3 - data4). The differences overwrite data1 and data3.
original_data1['data1'] = original_data1['data1'].sub(original_data1['data2'])
original_data1['data3'] = original_data1['data3'].sub(original_data1['data4'])

In [37]:
# The subtracted columns are no longer needed once the differences are stored.
original_data1 = original_data1.drop(columns=['data2', 'data4'])

In [38]:
# Preview the first rows of the two difference columns (data1, data3).
original_data1.head()

Unnamed: 0_level_0,data1,data3
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
YAL002W,0.335177,0.767759
YAL004W,-0.472267,-2.102164
YAL005C,0.147738,-2.807503
YAL007C,0.027676,-0.571977
YAL008W,0.221088,1.626694


# Load & process data (het)

In [23]:
# HIP fold-change workbooks (paths are relative to raw_data/), in loading order.
# The following cells label the four resulting columns data1..data4 and compute
# data1 - data2 and data3 - data4, so entries 1-2 (Phase I) and 3-4 (Phase II)
# are treated as pairs.
# NOTE: this reuses (overwrites) the `files` and `sheets` names of the previous section.
files = ['Only original foldchanges/Direct comparison/Phase I/HIP_t0vsYPD20 (dir) PhI.xlsx',
         'Only original foldchanges/Direct comparison/Phase I/HIP_t0vsMS20 (dir) PhI.xlsx',
         'Only original foldchanges/Direct comparison/Phase II/HIP_t0vsYPD20 (dir) PhII.xlsx',
         'Only original foldchanges/Direct comparison/Phase II/HIPt0vsHIP10 (dir) PhII.xlsx']

# Worksheet to read from each workbook; sheets[i] belongs to files[i].
sheets = ['HIPt0vsYPD20_t0','HIPt0vsMS20_tr0','HIPt0vsYPD20_t0','HIPt0vsHIP10_out0']

In [24]:
# Load every workbook listed in `files`, map strain labels to ORFs and reduce
# each table to a single 'data' column holding the mean Log2Ratio per ORF.
# Unlike the previous section, rows are not filtered on the essential_gene flag.
original_data_list = []
for ixf, f in enumerate(files):
    original_data = pd.read_excel('raw_data/' + f, sheet_name=sheets[ixf])
    print('Original data dimensions: %d x %d' % (original_data.shape))

    # The ORF is the part of the strain label before the first ':'
    # (e.g. 'YBR160W_AS::shawn' -> 'YBR160W_AS').
    original_data['orf'] = original_data['strain'].apply(lambda x: x.split(':')[0])
    original_data['orf'] = clean_orf(original_data['orf'])
    original_data['orf'] = translate_sc(original_data['orf'], to='orf')
    # Make sure everything translated ok: show the rows that do not look like
    # an ORF, then drop them. The explicit copy makes the column assignment
    # below operate on an independent frame rather than on a filtered slice
    # (which triggers pandas' SettingWithCopyWarning).
    t = looks_like_orf(original_data['orf'])
    print(original_data.loc[~t,])
    original_data = original_data.loc[t,:].copy()

    original_data['data'] = original_data['Log2Ratio']
    original_data = original_data.set_index('orf')[['data']]
    # ORFs that occur more than once are collapsed to their mean value.
    original_data = original_data.groupby(original_data.index).mean()
    print(original_data.shape)

    original_data_list.append(original_data)

Original data dimensions: 6431 x 9
                        strain  Log2Ratio  p-value         gene  \
index_input                                                       
5761         YBR160W_AS::shawn   0.035693   0.57269  YBR160W_AS   

            essential_gene feature_type GO_process GO_function GO_component  \
index_input                                                                   
5761          undetermined          NaN        NaN         NaN          NaN   

                   orf  
index_input             
5761         YBR160WAS  
(6116, 1)
Original data dimensions: 6431 x 14
                        strain  Log2Ratio  p-value         gene  \
index_input                                                       
5121         YBR160W_AS::shawn   0.063321   0.49504  YBR160W_AS   

            essential_gene feature_type GO_process GO_function GO_component  \
index_input                                                                   
5121          undetermined          NaN     

In [25]:
# Place the per-file tables side by side, one column per file, aligned on ORF.
original_data2 = pd.concat(original_data_list, axis='columns')

In [26]:
# Name the row index and give the four per-file columns positional labels
# (same order as the `files` list).
original_data2.index.name = 'orf'
original_data2.columns = ['data1', 'data2', 'data3', 'data4']

In [27]:
# Within each pair of files, subtract the second column from the first
# (data1 - data2, data3 - data4). The differences overwrite data1 and data3.
original_data2['data1'] = original_data2['data1'].sub(original_data2['data2'])
original_data2['data3'] = original_data2['data3'].sub(original_data2['data4'])

In [28]:
# The subtracted columns are no longer needed once the differences are stored.
original_data2 = original_data2.drop(columns=['data2', 'data4'])

In [31]:
# Preview the first rows of the two difference columns (data1, data3).
original_data2.head()

Unnamed: 0_level_0,data1,data3
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
YAL001C,-0.005866,0.952995
YAL002W,-0.072421,0.317301
YAL003W,-0.439218,1.511621
YAL004W,-2.083094,-0.466739
YAL005C,1.189116,1.875475


# Merge all

In [39]:
# Combine both tables on ORF, keeping ORFs present in either one. The shared
# column names are disambiguated with '_1' (original_data1) and '_2' (original_data2).
original_data = original_data1.join(
    original_data2,
    how='outer',
    lsuffix='_1',
    rsuffix='_2',
)

In [40]:
# Preview the merged table (columns data1_1, data3_1, data1_2, data3_2).
original_data.head()

Unnamed: 0_level_0,data1_1,data3_1,data1_2,data3_2
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
YAL001C,,,-0.005866,0.952995
YAL002W,0.335177,0.767759,-0.072421,0.317301
YAL003W,,,-0.439218,1.511621
YAL004W,-0.472267,-2.102164,-2.083094,-0.466739
YAL005C,0.147738,-2.807503,1.189116,1.875475


# Prepare the final dataset

In [41]:
# Work on a copy so the merged table stays untouched by the relabelling below.
data = original_data.copy()

In [42]:
# Dataset ids in the same order as the columns of `data`
# (data1_1, data3_1, data1_2, data3_2).
dataset_ids = [
    197,
    198,
    15993,
    15994,
]
# Keep only these datasets, in this order.
datasets = datasets.reindex(dataset_ids)

In [43]:
# Relabel the columns as (dataset_id, 'value') pairs, one per dataset and in
# the order of the `datasets` index.
value_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data.columns = value_columns

In [44]:
# Preview the table with its (dataset_id, data_type) column labels.
data.head()

dataset_id,197,198,15993,15994
data_type,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
YAL001C,,,-0.005866,0.952995
YAL002W,0.335177,0.767759,-0.072421,0.317301
YAL003W,,,-0.439218,1.511621
YAL004W,-0.472267,-2.102164,-2.083094,-0.466739
YAL005C,0.147738,-2.807503,1.189116,1.875475


## Subset to the genes currently in SGD

In [45]:
# Load the gene table and key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in `data`; ORFs absent from the gene table get NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 27


In [46]:
# Attach the looked-up gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
# Explicit copy: the dtype conversion below then assigns into an independent
# frame rather than into a filtered slice (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were float only because of the NaN placeholders removed above.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,197,198,15993,15994
Unnamed: 0_level_1,data_type,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,YAL001C,,,-0.005866,0.952995
2,YAL002W,0.335177,0.767759,-0.072421,0.317301
3,YAL003W,,,-0.439218,1.511621
1863,YAL004W,-0.472267,-2.102164,-2.083094,-0.466739
4,YAL005C,0.147738,-2.807503,1.189116,1.875475


# Normalize

In [47]:
# normalize_phenotypic_scores is not defined in this notebook (presumably it
# comes from yp_utils.py, loaded via %run at the top -- TODO confirm).
# Its result is relabelled as the 'valuez' columns in the next cell.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [48]:
# Assign proper column names: (dataset_id, 'valuez') pairs, one per dataset and
# in the order of the `datasets` index.
valuez_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = valuez_columns

In [49]:
# Blank out normalized scores wherever the raw value is missing.
# A boolean DataFrame mask is aligned on row AND column labels before it is
# applied. `data` is labelled (dataset_id, 'value') while `data_norm` is
# labelled (dataset_id, 'valuez'), so the raw mask matches no column and would
# leave data_norm unchanged. Relabel the mask columns (same positional order)
# so it actually applies.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan
# Put raw values and normalized values side by side.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,197,198,15993,15994,197,198,15993,15994
Unnamed: 0_level_1,data_type,value,value,value,value,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1,YAL001C,,,-0.005866,0.952995,,,-0.039525,0.309928
2,YAL002W,0.335177,0.767759,-0.072421,0.317301,0.660212,0.975482,-0.149877,-0.290474
3,YAL003W,,,-0.439218,1.511621,,,-0.758051,0.837541
1863,YAL004W,-0.472267,-2.102164,-2.083094,-0.466739,-0.605812,-1.87685,-3.483708,-1.030988
4,YAL005C,0.147738,-2.807503,1.189116,1.875475,0.366319,-2.577866,1.941835,1.181195


# Print out

In [50]:
# Write one tab-separated table per data type ('value', 'valuez'): rows are
# ORFs, columns are dataset names.
for f in ('value', 'valuez'):
    df = (
        data_all.xs(f, level='data_type', axis=1)
        .copy()
        .droplevel('gene_id', axis=0)
    )
    # Replace the dataset ids in the header by the dataset names.
    df.columns = datasets['name'].values
    df.to_csv(paper_name + '_' + f + '.txt', sep='\t')

# Save to DB

In [51]:
from IO.save_data_to_db3 import *

In [52]:
# Store raw and normalized values for this paper in the database.
# Per the log output below, this first deletes all existing datasets for the
# PMID, then inserts the new data and updates the data_modified_on field.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/4 [00:00<?, ?it/s]

Deleting all datasets for PMID 24040173...
Inserting the new data...


100%|██████████| 4/4 [00:34<00:00,  8.66s/it]

Updating the data_modified_on field...



