In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed in this notebook
paper_name = 'bockhorn_kinzy_2008'
paper_pmid = 19004804

In [3]:
# Headerless, tab-separated list of (dataset_id, name) pairs for this paper
datasets = pd.read_csv(
    'extras/YeastPhenome_%d_datasets_list.txt' % paper_pmid,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by its numeric dataset id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw supplementary table as distributed with the paper
original_data = pd.read_excel(
    'raw_data/0805642105_0805642105SI.xlsx',
    sheet_name='Sheet1',
)

In [6]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 53 x 3


In [7]:
# Preview the first five rows of the raw table
original_data.head(n=5)

Unnamed: 0,Telomere/chromatin maintenance,Unnamed: 1,Unnamed: 2
0,PDX3,YBR035C,6.5
1,,YOR008C-A,4.6
2,SSN2,YDR443C,4.2
3,REC107,YJR021C,4.1
4,SSN8,YNL025C,3.5


In [8]:
# The second spreadsheet column ('Unnamed: 1') holds the systematic names;
# casting to str turns missing cells into the literal string 'nan'.
orf_raw = original_data['Unnamed: 1']
original_data['orf'] = orf_raw.astype(str)

In [9]:
# Strip all white space and upper-case the identifiers
orf_cleaned = clean_orf(original_data['orf'])
original_data['orf'] = orf_cleaned

In [10]:
# Map whatever identifiers were supplied onto systematic ORF names
orf_translated = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = orf_translated

In [11]:
# Flag the entries that look like ORFs and list the rows that do not
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t, :]
print(not_orf_rows)

                       Telomere/chromatin maintenance  Unnamed: 1  Unnamed: 2  \
index_input                                                                     
8                                           Transport         NaN         NaN   
17                                 Response to stress         NaN         NaN   
26           RNA polymerase II promoter transcription         NaN         NaN   
33                               Cell ion homeostasis         NaN         NaN   
41                              Amino acid metabolism         NaN         NaN   
47                                        Translation         NaN         NaN   

             orf  
index_input       
8            NAN  
17           NAN  
26           NAN  
33           NAN  
41           NAN  
47           NAN  


In [12]:
# Keep only the rows whose identifier passed the ORF check
# (the rows dropped here are the category-header rows printed above)
original_data = original_data.loc[t]

In [13]:
# The third spreadsheet column ('Unnamed: 2') holds the numeric score
score_raw = original_data['Unnamed: 2']
original_data['data'] = score_raw.astype(float)

In [14]:
# Use the ORF as the row label
original_data = original_data.set_index('orf')

In [15]:
# Drop every helper column; only the numeric score is carried forward
original_data = original_data.loc[:, ['data']].copy()

In [16]:
# Average the scores of ORFs that occur more than once (grouping on the row index)
original_data = original_data.groupby(level=0).mean()

In [17]:
# Dimensions after averaging duplicate ORFs: (number of unique ORFs, 1 data column)
original_data.shape

(43, 1)

# Prepare the final dataset

In [18]:
# Work on an independent copy so the processed table above stays untouched
data = original_data.copy(deep=True)

In [19]:
# Dataset id(s) matching, in order, the data column(s) of `data`
dataset_ids = [120]
datasets = datasets.reindex(dataset_ids, axis='index')

In [20]:
# Label every data column with a (dataset_id, 'value') pair
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [21]:
# Preview the relabelled table
data.head(n=5)

dataset_id,120
data_type,value
orf,Unnamed: 1_level_2
YAL012W,7.0
YBR035C,6.5
YBR036C,3.1
YBR043C,3.1
YBR069C,3.9


## Subset to the genes currently in SGD

In [22]:
# Gene table re-keyed by systematic name, with the numeric id kept as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the id of every ORF in the dataset; ORFs absent from the gene table yield NaN
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [23]:
# Attach the gene ids and drop the ORFs that could not be mapped
data['gene_id'] = gene_ids
has_gene_id = data['gene_id'].notnull()
data = data.loc[has_gene_id]

# With the NaNs gone the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)

# Rows are keyed by (gene_id, orf) from here on
data = data.reset_index()
data = data.set_index(['gene_id', 'orf'])

data.head()

Unnamed: 0_level_0,dataset_id,120
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
10,YAL012W,7.0
231,YBR035C,6.5
232,YBR036C,3.1
239,YBR043C,3.1
265,YBR069C,3.9


# Normalize

In [24]:
# Compute normalized scores from the raw values with the shared helper.
# NOTE(review): the helper is defined outside this notebook; has_tested=False presumably
# means no list of tested strains is available for this dataset — confirm against its definition.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [25]:
# Label every normalized column with a (dataset_id, 'valuez') pair
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [26]:
# Propagate missing raw values into the normalized table.
# Assigning through a boolean DataFrame aligns on column labels, and `data` is labelled
# (dataset_id, 'value') while `data_norm` is labelled (dataset_id, 'valuez'). Using
# data.isnull() directly therefore matches no column and masks nothing. Relabel the mask
# with data_norm's columns first; both tables carry one column per dataset, in the same
# order, so the relabelling is positional. Rows are still aligned on the index.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Place raw and normalized values side by side, joined on the shared (gene_id, orf) row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,120,120
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
10,YAL012W,7.0,18.894316
231,YBR035C,6.5,17.544722
232,YBR036C,3.1,8.367483
239,YBR043C,3.1,8.367483
265,YBR069C,3.9,10.526833


# Print out

In [27]:
# Write one tab-separated file per data type, with columns named after the datasets
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    # Only the ORF is kept as the row label in the exported file
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [28]:
from IO.save_data_to_db3 import *

In [29]:
# Store the raw and normalized values for this paper in the database.
# According to the recorded output, the existing datasets for this PMID are deleted
# before the new data are inserted, so this step is destructive.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 19004804...
Inserting the new data...


100%|██████████| 1/1 [00:00<00:00,  6.09it/s]


Updating the data_modified_on field...
