In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
paper_pmid = 18157128
paper_name = 'delneri_oliver_2008' 

In [3]:
datasets = pd.read_csv('extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt', sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
datasets.set_index('dataset_id', inplace=True)

# Load & process the data

In [91]:
original_data = pd.read_excel('raw_data/Table 4s_NewForPublishing.xlsx', sheet_name='CL, NL, PL & GJ  data', skiprows=10)

In [92]:
print('Original data dimensions: %d x %d' % (original_data.shape))

Original data dimensions: 1247 x 49


In [93]:
original_data.head()

Unnamed: 0,ORF,Growth Rate,FDR(BH),Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,ORF.1,Growth Rate.1,FDR(BH).1,...,Unnamed: 39,ORF.6,Growth Rate.6,FDR(BH).6,Unnamed: 43,Unnamed: 44,Unnamed: 45,ORF.7,Growth Rate.7,FDR(BH).7
0,,,,,,,,,,,...,,,,,,,,,,
1,YEL027W,-0.107896,0.00096997,,,,,YOR182C,0.081684,0.00097,...,,YIL086C,-0.097003,0.000855,,,,YPL202C,0.078349,0.000855
2,YAL005C,-0.097399,0.00096997,,,,,YGL087C,0.076773,0.00097,...,,YCR037C,-0.057828,0.000855,,,,YJR154W,0.059105,0.000855
3,YNL039W,-0.094801,0.00096997,,,,,YGR257C,0.076172,0.00097,...,,YBR300C,-0.052981,0.000855,,,,YLR417W,0.032742,0.000855
4,YMR033W,-0.092898,0.00096997,,,,,YLR315W,0.069649,0.00097,...,,YCR006C,-0.052366,0.000855,,,,YKR036C,0.031759,0.000855


In [94]:
orf_cols = [c for c in original_data.columns if 'ORF' in c]
data_cols = [c for c in original_data.columns if 'Growth Rate' in c]

In [95]:
original_data_list = []
for o,d in zip(orf_cols, data_cols):
    t = original_data.loc[:,[o,d]].copy()
    t.columns = ['orf','data']
    original_data_list.append(t)

In [96]:
original_data_list2 = []
for d in np.arange(4):
    df = pd.concat(original_data_list[d*2:d*2+1], axis=0)
    
    df['orf'] = df['orf'].astype(str)
    df['orf'] = clean_orf(df['orf'])
    df['orf'] = translate_sc(df['orf'].values, to='orf')
    
    # Make sure everything translated ok
    t = looks_like_orf(df['orf'])
    print(df.loc[~t,])
    df = df.loc[t,:]
    
    df.set_index('orf', inplace=True)
    df = df[['data']].copy()
    df['data'] = pd.to_numeric(df['data'], errors='coerce')
    
    df = df.groupby(df.index).mean()
    print(df.shape)
    
    original_data_list2.append(df)

      orf data
0     NAN  NaN
665   NAN  NaN
666   NAN  NaN
667   NAN  NaN
668   NAN  NaN
...   ...  ...
1242  NAN  NaN
1243  NAN  NaN
1244  NAN  NaN
1245  NAN  NaN
1246  NAN  NaN

[583 rows x 2 columns]
(663, 1)
      orf  data
0     NAN   NaN
748   NAN   NaN
749   NAN   NaN
750   NAN   NaN
751   NAN   NaN
...   ...   ...
1242  NAN   NaN
1243  NAN   NaN
1244  NAN   NaN
1245  NAN   NaN
1246  NAN   NaN

[500 rows x 2 columns]
(745, 1)
   orf  data
0  NAN   NaN
(1244, 1)
      orf  data
0     NAN   NaN
208   NAN   NaN
209   NAN   NaN
210   NAN   NaN
211   NAN   NaN
...   ...   ...
1242  NAN   NaN
1243  NAN   NaN
1244  NAN   NaN
1245  NAN   NaN
1246  NAN   NaN

[1040 rows x 2 columns]
(207, 1)


In [97]:
original_data = pd.concat(original_data_list2, axis=1)

In [98]:
original_data.head()

Unnamed: 0,data,data.1,data.2,data.3
YAL002W,-0.014847,-0.021817,,
YAL005C,-0.097399,-0.085828,-0.046165,
YAL021C,-0.021303,-0.013752,-0.052153,
YAL035W,-0.073469,-0.048445,-0.044732,
YAL036C,-0.082845,-0.065893,-0.124891,-0.0283


In [99]:
original_data.index.name = 'orf'

In [100]:
original_data.shape

(1622, 4)

# Prepare the final dataset

In [101]:
data = original_data.copy()

In [102]:
dataset_ids = [11813,11815,11816,11814]
datasets = datasets.reindex(index=dataset_ids)

In [103]:
lst = [datasets.index.values, ['value']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [104]:
data.head()

dataset_id,11813,11815,11816,11814
data_type,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
YAL002W,-0.014847,-0.021817,,
YAL005C,-0.097399,-0.085828,-0.046165,
YAL021C,-0.021303,-0.013752,-0.052153,
YAL035W,-0.073469,-0.048445,-0.044732,
YAL036C,-0.082845,-0.065893,-0.124891,-0.0283


## Subset to the genes currently in SGD

In [105]:
genes = pd.read_csv(path_to_genes, sep='\t', index_col='id')
genes = genes.reset_index().set_index('systematic_name')
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.sum(np.isnan(gene_ids))
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 5


In [106]:
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()]
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,11813,11815,11816,11814
Unnamed: 0_level_1,data_type,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,-0.014847,-0.021817,,
4,YAL005C,-0.097399,-0.085828,-0.046165,
19,YAL021C,-0.021303,-0.013752,-0.052153,
33,YAL035W,-0.073469,-0.048445,-0.044732,
34,YAL036C,-0.082845,-0.065893,-0.124891,-0.0283


# Normalize

In [107]:
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [108]:
# Assign proper column names
lst = [datasets.index.values, ['valuez']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [109]:
data_norm[data.isnull()] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,11813,11815,11816,11814,11813,11815,11816,11814
Unnamed: 0_level_1,data_type,value,value,value,value,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2,YAL002W,-0.014847,-0.021817,,,-0.984363,-1.353954,,
4,YAL005C,-0.097399,-0.085828,-0.046165,,-6.457598,-5.326449,-2.251078,
19,YAL021C,-0.021303,-0.013752,-0.052153,,-1.412399,-0.853443,-2.543062,
33,YAL035W,-0.073469,-0.048445,-0.044732,,-4.871028,-3.006476,-2.181203,
34,YAL036C,-0.082845,-0.065893,-0.124891,-0.0283,-5.492662,-4.089292,-6.089882,-4.422099


# Print out

In [110]:
for f in ['value','valuez']:
    df = data_all.xs(f, level='data_type', axis=1).copy()
    df.columns = datasets['name'].values
    df = df.droplevel('gene_id', axis=0)
    df.to_csv(paper_name + '_' + f + '.txt', sep='\t')

# Save to DB

In [111]:
from IO.save_data_to_db3 import *

In [112]:
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/4 [00:00<?, ?it/s]

Deleting all datasets for PMID 18157128...
Inserting the new data...


100%|██████████| 4/4 [00:09<00:00,  2.42s/it]

Updating the data_modified_on field...



