In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identify the publication: its PMID and the short name used for output files
paper_pmid, paper_name = 15575969, 'haugen_vanhouten_2004'

In [3]:
# Header-less, tab-separated list of this paper's datasets, read as the
# columns dataset_id and name
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    names=['dataset_id', 'name'],
    header=None,
    sep='\t',
)

In [4]:
# Key the dataset table by its dataset_id column
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw measurements come from the 'Query2' sheet of the raw-data workbook
original_data = pd.read_excel(
    'raw_data/gb-2004-5-12-r95-s5.xlsx',
    sheet_name='Query2',
)

In [6]:
# Report the (rows, columns) size of the freshly loaded table
print('Original data dimensions: %d x %d' % original_data.shape)

Original data dimensions: 4762 x 26


In [7]:
# Preview the first five rows of the raw table
original_data.head(n=5)

Unnamed: 0,ORF,"""record""",Gene,Desc,Unnamed: 4,Arsenite/rank,e,n,F,arn1,...,F.2,arn3,SD.2,n.3,F.3,arn4,SD.3,n.4,F.4,FITNESS pval
0,YCR053W,37186.0,THR4,threonine metabolism,1.0,0.339088,4.0,16.0,4.0,-0.330869,...,1.0,-1.409924,0.182272,4.0,3.0,-1.299462,0.140078,4.0,,0.000232
1,YOR184W,32440.0,SER1,purine base biosynthesis*,2.0,0.476169,4.0,16.0,0.0,-0.586149,...,,-0.869176,0.141069,4.0,,-0.640489,0.078476,4.0,,0.000464
2,YGR208W,34838.0,SER2,serine family amino acid biosynthesis,3.0,0.48446,4.0,12.0,0.0,-0.667408,...,,-0.777414,0.052166,3.0,,-0.578673,0.023409,3.0,,0.000696
3,YJR109C,36916.0,CPA2,arginine biosynthesis,4.0,0.485299,4.0,16.0,3.0,-0.327605,...,,-0.923412,0.058895,4.0,,-1.018208,0.056578,4.0,3.0,0.000928
4,YOR303W,37337.0,CPA1,arginine biosynthesis,5.0,0.485795,4.0,16.0,3.0,-0.434978,...,,-0.848264,0.094693,4.0,1.0,-0.89074,0.105424,4.0,1.0,0.00116


In [8]:
# Work on a string version of the ORF column; the raw 'ORF' column is kept as is
original_data['orf'] = original_data.loc[:, 'ORF'].astype(str)

In [9]:
# Normalize the ORF strings with clean_orf (a helper not defined in this
# notebook; per the original note it eliminates all white space and capitalizes)
original_data['orf'] = clean_orf(original_data['orf'])

In [10]:
# Translate the identifiers to ORFs. translate_sc is a helper defined outside
# this notebook. NOTE(review): presumably maps gene names/aliases to systematic
# ORF names when to='orf' — confirm against its definition.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [11]:
# Sanity check: flag entries that do not look like an ORF and print those rows.
# `t` is reused afterwards to filter the table.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t, :])

                                                           ORF  "record" Gene  \
index_input                                                                     
4757                                                       NaN       NaN  NaN   
4758         Additional data file 5.  The ranked arsenite s...       NaN  NaN   
4759         These were ranked based on four experiments, 1...       NaN  NaN   
4760         Every gene in this table has a percentile rank...       NaN  NaN   
4761         The rankings on this table were used for the m...       NaN  NaN   

            Desc  Unnamed: 4  Arsenite/rank   e   n   F  arn1  ...  arn3  \
index_input                                                    ...         
4757         NaN         NaN            NaN NaN NaN NaN   NaN  ...   NaN   
4758         NaN         NaN            NaN NaN NaN NaN   NaN  ...   NaN   
4759         NaN         NaN            NaN NaN NaN NaN   NaN  ...   NaN   
4760         NaN         NaN            NaN NaN NaN 

In [12]:
# Keep only the rows whose identifier passed the ORF check. The explicit copy
# makes the result an independent frame, so adding columns to it afterwards
# does not trigger pandas' SettingWithCopyWarning.
original_data = original_data.loc[t, :].copy()

In [17]:
# Use the 'FITNESS pval' column as the data; non-numeric entries become NaN
original_data['data'] = pd.to_numeric(
    original_data['FITNESS pval'],
    errors='coerce',
)

In [18]:
# Index the rows by ORF
original_data = original_data.set_index('orf')

In [19]:
# Drop everything except the numeric 'data' column (kept as a one-column frame)
original_data = original_data.loc[:, ['data']].copy()

In [20]:
# Collapse ORFs that appear more than once by averaging their values
original_data = original_data.groupby(level=0).mean()

In [21]:
# Size of the table after grouping by ORF: (rows, columns)
original_data.shape

(4716, 1)

# Prepare the final dataset

In [22]:
# Start the final dataset from an independent copy of the processed table
data = original_data.copy(deep=True)

In [23]:
# Restrict the dataset table to the dataset id(s) this notebook populates
dataset_ids = [485]
datasets = datasets.reindex(dataset_ids, axis='index')

In [24]:
# Label the data columns with (dataset_id, 'value') pairs
lst = [datasets.index.values, ['value' for _ in range(datasets.shape[0])]]
tuples = [pair for pair in zip(lst[0], lst[1])]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id', 'data_type'])
data.columns = idx

In [25]:
# Preview the first five rows of the relabelled table
data.head(n=5)

dataset_id,485
data_type,value
orf,Unnamed: 1_level_2
YAL002W,0.849617
YAL004W,0.859828
YAL005C,0.092829
YAL007C,0.116268
YAL008W,0.233233


## Subset to the genes currently in SGD

In [26]:
# Look up the gene id of every ORF in the reference gene table
genes = pd.read_csv(path_to_genes, index_col='id', sep='\t')
genes = genes.reset_index().set_index('systematic_name')
gene_ids = genes.reindex(data.index.values).loc[:, 'id'].values
# ORFs absent from the gene table come back as NaN after the reindex
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [27]:
# Attach the gene ids, drop ORFs without a match, and index rows by (gene_id, orf)
data['gene_id'] = gene_ids
# .copy() yields an independent frame, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning on a filtered slice
data = data.loc[data['gene_id'].notnull()].copy()
# With the missing ids removed, the column can be cast to integers
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,485
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,0.849617
1863,YAL004W,0.859828
4,YAL005C,0.092829
5,YAL007C,0.116268
6,YAL008W,0.233233


# Normalize

In [28]:
# Normalize the phenotypic scores with a helper defined outside this notebook.
# NOTE(review): the meaning of has_tested=True is not visible here — confirm
# against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [29]:
# Label the normalized columns with (dataset_id, 'valuez') pairs
lst = [datasets.index.values, ['valuez' for _ in range(datasets.shape[0])]]
tuples = [pair for pair in zip(lst[0], lst[1])]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id', 'data_type'])
data_norm.columns = idx

In [30]:
# Blank out the normalized scores wherever the raw value is missing.
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'),
# and a boolean DataFrame mask is aligned on labels before it is applied, so
# masking with the frame itself matches no column and changes nothing.
# Passing the underlying array applies the mask by position instead; this
# requires both frames to have the same shape and row order (pandas raises
# a ValueError if the shapes differ).
data_norm[data.isnull().values] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,485,485
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0.849617,1.255817
1863,YAL004W,0.859828,1.293032
4,YAL005C,0.092829,-1.502375
5,YAL007C,0.116268,-1.416948
6,YAL008W,0.233233,-0.990659


# Print out

In [31]:
# Write one tab-separated file per data type, with columns named after the datasets
for f in ('value', 'valuez'):
    df = data_all.xs(f, axis=1, level='data_type').copy()
    df.columns = datasets['name'].values
    # Only the ORF is kept as the row label in the exported file
    df.index = df.index.droplevel('gene_id')
    df.to_csv('_'.join([paper_name, f]) + '.txt', sep='\t')

# Save to DB

In [32]:
from IO.save_data_to_db3 import *

In [33]:
# Store the combined value/valuez table for this paper in the database.
# Per the log printed by this cell, the call first deletes all existing
# datasets for the PMID and then inserts the new data, so it replaces
# previously stored results for this paper.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 15575969...
Inserting the new data...


100%|██████████| 1/1 [00:08<00:00,  8.47s/it]

Updating the data_modified_on field...



