In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used below to locate the dataset list file
# and passed to save_data_to_db when the results are stored.
paper_pmid = 20944018
# Short label for the paper; used as the prefix of the exported text files.
paper_name = 'gresham_botstein_2011' 

In [3]:
# Tab-separated, header-less file listing (dataset_id, name) for this paper.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset table by dataset_id so it can be looked up by ID later.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Workbooks to read from raw_data/ and, position by position, the sheet to
# load from each of them (files[i] is paired with sheets[i]).
files = ['TABLES4.xlsx','TABLES5.xlsx']
sheets = ['phoAbs.txt','leuAbs.txt']

In [6]:
# Turn every (workbook, sheet) pair into a single-column ('data') frame that is
# indexed by ORF and holds the mean half-life per ORF.
# files and sheets are parallel lists; refuse to run if they are out of step.
assert len(files) == len(sheets), 'files and sheets must be paired one-to-one'

original_data_list = []
for f, sheet in zip(files, sheets):
    raw = pd.read_excel('raw_data/' + f, sheet_name=sheet)
    print('Original data dimensions: %d x %d' % (raw.shape))

    # Gene labels: force to str, keep the part before the first underscore,
    # pass through clean_genename, then translate to ORF names.
    raw['gene'] = raw['gene'].astype(str)
    raw['gene'] = raw['gene'].apply(lambda x: x.split('_')[0])
    raw['gene'] = clean_genename(raw['gene'])
    raw['orf'] = translate_sc(raw['gene'], to='orf')

    # Show the rows whose translated name is rejected by looks_like_orf
    # before they are dropped.
    t = looks_like_orf(raw['orf'])
    print(raw.loc[~t,])

    # Take an explicit copy of the kept rows: adding a column to a .loc slice
    # is chained assignment and triggers SettingWithCopyWarning.
    valid = raw.loc[t,:].copy()
    valid['data'] = valid['halflife'].astype(float)
    valid.set_index('orf', inplace=True)

    # An ORF can occur on several rows; collapse them to their mean.
    per_orf = valid[['data']].copy()
    per_orf = per_orf.groupby(per_orf.index).mean()
    print(per_orf.shape)

    original_data_list.append(per_orf)

Original data dimensions: 6806 x 10
              gene  deathrate   p-value   q-value        SD  \
index_input                                                   
156           FLO8   0.003366  0.000041  0.000187  0.000251   
186           FLO8   0.002343  0.000050  0.000190  0.000181   
240           YSN1   0.001897  0.000067  0.000195  0.000156   
322           WSP1   0.002659  0.000093  0.000205  0.000234   
629           CRS5   0.002406  0.000197  0.000226  0.000248   
763           SDL1   0.002106  0.000258  0.000244  0.000230   
800           WSP1   0.002943  0.000275  0.000248  0.000325   
913          SDC25   0.002334  0.000331  0.000261  0.000268   
1412          MNI2   0.003401  0.000599  0.000306  0.000443   
1440         FMP35   0.007090  0.000616  0.000309  0.000929   
1559         TMA29   0.002595  0.000718  0.000332  0.000352   
1567          YSN1   0.002056  0.000727  0.000334  0.000279   
1891         FMP31   0.002714  0.001030  0.000393  0.000398   
2722         SDC25 

In [7]:
# Put the per-file frames side by side (one column per file), aligned on the ORF index.
original_data = pd.concat(original_data_list, axis=1)

In [8]:
# Name the index so it shows up as 'orf' when the index is reset later on.
original_data.index.name='orf'

In [9]:
# Sanity check: (rows = ORFs, columns = one per input file).
original_data.shape

(4317, 2)

In [10]:
# Preview the first rows of the combined table.
original_data.head()

Unnamed: 0_level_0,data,data
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
YAL002W,54.827553,28.600328
YAL004W,376.148747,15.677022
YAL005C,255.345049,18.101803
YAL007C,453.525695,28.61849
YAL008W,555.615837,45.128964


# Prepare the final dataset

In [11]:
# Work on a copy so that original_data is not modified by the steps below.
data = original_data.copy()

In [12]:
# Dataset IDs for the columns of `data`, listed in column order.
dataset_ids = [93,451]
# Keep only these datasets, in this order.
# NOTE(review): reindex does not raise for an ID that is absent from the list
# file; it produces a row of NaN instead — confirm both IDs are present.
datasets = datasets.reindex(index=dataset_ids)

In [13]:
# Label every column with the pair (dataset_id, 'value').
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [14]:
# Preview the table with its new (dataset_id, data_type) column labels.
data.head()

dataset_id,93,451
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL002W,54.827553,28.600328
YAL004W,376.148747,15.677022
YAL005C,255.345049,18.101803
YAL007C,453.525695,28.61849
YAL008W,555.615837,45.128964


## Subset to the genes currently in SGD

In [15]:
# Gene table re-keyed by systematic name, with 'id' kept as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the id of every ORF in `data`; ORFs absent from the gene table give NaN.
orf_labels = data.index.values
gene_ids = genes.reindex(index=orf_labels)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 12


In [16]:
# Attach the gene id to every row (gene_ids was built in the row order of data).
data['gene_id'] = gene_ids
# Drop the ORFs for which no gene id was found.
data = data.loc[data['gene_id'].notnull()]
# With the missing entries gone, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by the pair (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,93,451
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,54.827553,28.600328
1863,YAL004W,376.148747,15.677022
4,YAL005C,255.345049,18.101803
5,YAL007C,453.525695,28.61849
6,YAL008W,555.615837,45.128964


# Normalize

In [38]:
# Clamp all values to the range [-1e9, 1e9] (due to the database field limitations).
# NOTE(review): this comment used to say 1e10 while the code clamps at 1e9 —
# confirm which limit the database field actually requires.
data[data > 1e9] = 1e9
data[data < -1e9] = -1e9

In [39]:
# Derive normalized scores from the raw values. normalize_phenotypic_scores is
# not defined in this notebook (presumably it comes from yp_utils.py).
# NOTE(review): the meaning of has_tested=True is not visible here — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [40]:
# Assign proper column names
lst = [datasets.index.values, ['valuez']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [41]:
# Blank out the normalized score wherever the raw value is missing.
# A boolean DataFrame mask is aligned on column labels before it is applied,
# and data (data_type 'value') shares no column label with data_norm
# (data_type 'valuez'), so using data.isnull() directly selects no cell at all.
# Both frames take their column order from datasets.index.values, so the mask
# is relabelled with data_norm's columns; rows are still aligned by index.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan
# Raw and normalized columns side by side, joined on the (gene_id, orf) index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,93,451,93,451
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,54.827553,28.600328,-0.164015,-0.356347
1863,YAL004W,376.148747,15.677022,-0.09149,-0.356348
4,YAL005C,255.345049,18.101803,-0.118757,-0.356348
5,YAL007C,453.525695,28.61849,-0.074026,-0.356347
6,YAL008W,555.615837,45.128964,-0.050984,-0.356347


# Print out

In [42]:
# Write one tab-separated file per data type: rows are ORFs, columns are the
# dataset names.
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [43]:
from IO.save_data_to_db3 import *

In [44]:
# Store the raw and normalized values in the database under this paper's PMID.
# NOTE: judging by the logged output below, the call first deletes all existing
# datasets for the PMID and then inserts the new data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 20944018...
Inserting the new data...


100%|██████████| 2/2 [00:12<00:00,  6.33s/it]

Updating the data_modified_on field...



