In [1]:
import numpy as np
import pandas as pd

import sys
import re

from os.path import expanduser
sys.path.append(expanduser('~') + '/Lab/Utils/Python/')

from Conversions.translate import *
from Strings.is_a import *
from Math.normalize import z_transform_mode

# Initial setup

In [2]:
paper_pmid = 16365294
paper_name = 'ohya_morishita_2005' 

In [3]:
datasets = pd.read_csv('extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt', sep='\t', header=None, names=['pmid', 'name'])

In [4]:
datasets.set_index('pmid', inplace=True)

In [5]:
path_to_genes = '../../Private-Utils/datasets_gene2.txt'
path_to_consensus_tested = '../../Private-Utils/yp_2020-09-01_orfs.txt'

# Load & process the data

In [6]:
original_data = pd.read_csv('raw_data/mutant_analysis_2011_10_20.tab', sep='\t')

In [7]:
original_data.head()

Unnamed: 0,name,A101_A,A101_A1B,A101_C,A102_A1B,A102_C,A103_A1B,A103_C,A104_A1B,A104_C,...,DCV192_C,DCV193_A1B,DCV193_C,DCV194_A,DCV194_C,DCV195_C,DCV196_A1B,DCV196_C,DCV197_C,DCV198_C
0,YAL002W,0.241397,0.203596,0.265229,0.674227,0.634847,0.047427,0.038288,0.147991,0.114015,...,0.242811,0.265602,0.223247,0.100176,0.102437,0.136084,0.086192,0.112866,0.276915,0.347655
1,YAL004W,0.176769,0.171474,0.215702,0.698654,0.651637,0.042315,0.050097,0.127242,0.130586,...,0.334429,0.320537,0.326231,0.081514,0.08105,0.097439,0.075716,0.094022,0.186023,0.238855
2,YAL005C,0.419048,0.380287,0.386104,0.447943,0.531675,0.092326,0.073756,0.092021,0.089312,...,0.414805,0.424925,0.401604,0.098086,0.104103,0.113818,0.083131,0.088795,0.221838,0.314937
3,YAL007C,0.253365,0.220201,0.241407,0.584807,0.594684,0.041358,0.039212,0.113065,0.100998,...,0.429069,0.605917,0.414244,0.092948,0.12672,0.126254,0.107495,0.126775,0.228581,0.330954
4,YAL008W,0.198676,0.188937,0.213454,0.70533,0.666906,0.035833,0.026512,0.131503,0.120423,...,0.314824,0.358865,0.298404,0.107541,0.144381,0.161155,0.10024,0.14084,0.240061,0.318372


In [8]:
print('Original data dimensions: %d x %d' % (original_data.shape))

Original data dimensions: 4718 x 502


In [9]:
original_data['name'] = original_data['name'].astype(str)

In [10]:
# Eliminate all white spaces & capitalize
original_data['name'] = clean_orf(original_data['name'])

In [11]:
# Translate to ORFs 
original_data['name'] = translate_sc(original_data['name'], to='orf')

In [12]:
# Make sure everything translated ok
t = looks_like_orf(original_data['name'])
print(original_data.loc[~t,])

Empty DataFrame
Columns: [name, A101_A, A101_A1B, A101_C, A102_A1B, A102_C, A103_A1B, A103_C, A104_A1B, A104_C, A105, A105_A, A106, A106_A, A107, A107_A1B, A107_C, A108, A108_A1B, A108_C, A109, A109_A1B, A109_C, A110, A110_A1B, A110_C, A111, A112, A112_A1B, A112_C, A113, A113_A, A113_A1B, A113_C, A114, A115, A116, A117, A118, A119, A120_A, A120_A1B, A120_C, A121_A, A121_A1B, A121_C, A122_A, A122_A1B, A122_C, A123_A, A123_A1B, A123_C, A7-1_A, A7-1_A1B, A7-1_C, A7-2_A1B, A7-2_C, A8-1_A, A8-1_A1B, A8-1_C, A8-2_A1B, A8-2_C, A9_A1B, A9_C, ACV101_A, ACV101_A1B, ACV101_C, ACV102_A1B, ACV102_C, ACV103_A1B, ACV103_C, ACV104_A1B, ACV104_C, ACV120_A, ACV120_A1B, ACV120_C, ACV121_A, ACV121_A1B, ACV121_C, ACV122_A, ACV122_A1B, ACV122_C, ACV123_A, ACV123_A1B, ACV123_C, ACV7-1_A, ACV7-1_A1B, ACV7-1_C, ACV7-2_A1B, ACV7-2_C, ACV8-1_A, ACV8-1_A1B, ACV8-1_C, ACV8-2_A1B, ACV8-2_C, ACV9_A1B, ACV9_C, C101_A1B, C101_C, C102_A1B, ...]
Index: []

[0 rows x 502 columns]


In [13]:
original_data.set_index('name', inplace=True)
original_data.index.name='orf'

In [14]:
original_data = original_data.groupby(original_data.index).mean()

In [15]:
original_data.shape

(4715, 501)

# Process parameters

In [16]:
datasets.head()

Unnamed: 0_level_0,name
pmid,Unnamed: 1_level_1
5405,hap a | Length of the long axis of the nucleus...
5406,hap a | Length of the long axis of the nucleus...
5407,hap a | Length of the short axis of the cell i...
5408,hap a | Length of the short axis of the daugth...
5409,hap a | Length of the short axis of the daugth...


In [17]:
datasets['phenotype_name'] = ''
datasets['phenotype_id'] = ''
for d in datasets.index.values:
    t = datasets.loc[d,'name'].split('|')[1]
    matches = re.findall('\(([A-Z0-9_\-]*?)\)', t)
    
    datasets.loc[d,'phenotype_name'] = t
    datasets.loc[d,'phenotype_id'] = matches[0]

In [18]:
# Exclude the CV parameters and the ones that are not easily interpretable
original_data = original_data.reindex(columns=datasets['phenotype_id'].values)

# Prepare the final dataset

In [19]:
data = original_data.copy()

In [20]:
lst = [datasets.index.values, ['value']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [21]:
data.head()

dataset_id,5405,5406,5407,5408,5409,5410,5411,5412,5413,5414,...,5647,5648,5649,5650,5651,5652,5653,5654,5655,5656
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL002W,10.760599,12.093151,28.587365,18.875788,27.630382,31.276813,31.189958,141.775,144.410448,139.309278,...,0.291139,33.279666,15.705237,27.323638,36.134226,35.849083,11.206696,13.043053,107.620552,104.79323
YAL004W,10.198289,11.566228,27.898675,17.140943,25.662899,29.756041,29.990882,170.021739,184.736111,189.127273,...,0.295455,34.080315,15.140221,25.87816,36.172433,36.510306,11.36222,12.338489,130.39264,118.121328
YAL005C,9.39264,10.667163,24.999431,16.894838,25.123329,28.395172,27.862895,113.470588,126.586466,124.925,...,0.339056,31.042787,14.454373,24.72785,34.522438,34.838594,10.628542,13.590983,91.6536,82.918831
YAL007C,9.652285,11.673625,28.042097,19.529432,26.086144,30.465722,29.918406,94.535714,92.268966,116.564103,...,0.233108,34.102268,17.926,26.965144,36.89271,37.060877,11.165609,14.11461,67.751113,68.21355
YAL008W,9.183315,11.184889,28.679312,18.826658,26.075371,31.173199,30.523229,108.119658,112.297101,135.786667,...,0.292237,34.633543,16.949242,26.381888,37.79021,37.355775,10.533115,13.599617,84.127483,82.816581


## Subset to the genes currently in SGD

In [22]:
genes = pd.read_csv(path_to_genes, sep='\t', index_col='id')
genes = genes.reset_index().set_index('systematic_name')
gene_ids = genes.reindex(index=original_data.index.values)['id'].values
num_missing = np.sum(np.isnan(gene_ids))
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 20


In [23]:
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()]
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

In [24]:
data.head()

Unnamed: 0_level_0,dataset_id,5405,5406,5407,5408,5409,5410,5411,5412,5413,5414,...,5647,5648,5649,5650,5651,5652,5653,5654,5655,5656
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,10.760599,12.093151,28.587365,18.875788,27.630382,31.276813,31.189958,141.775,144.410448,139.309278,...,0.291139,33.279666,15.705237,27.323638,36.134226,35.849083,11.206696,13.043053,107.620552,104.79323
1863,YAL004W,10.198289,11.566228,27.898675,17.140943,25.662899,29.756041,29.990882,170.021739,184.736111,189.127273,...,0.295455,34.080315,15.140221,25.87816,36.172433,36.510306,11.36222,12.338489,130.39264,118.121328
4,YAL005C,9.39264,10.667163,24.999431,16.894838,25.123329,28.395172,27.862895,113.470588,126.586466,124.925,...,0.339056,31.042787,14.454373,24.72785,34.522438,34.838594,10.628542,13.590983,91.6536,82.918831
5,YAL007C,9.652285,11.673625,28.042097,19.529432,26.086144,30.465722,29.918406,94.535714,92.268966,116.564103,...,0.233108,34.102268,17.926,26.965144,36.89271,37.060877,11.165609,14.11461,67.751113,68.21355
6,YAL008W,9.183315,11.184889,28.679312,18.826658,26.075371,31.173199,30.523229,108.119658,112.297101,135.786667,...,0.292237,34.633543,16.949242,26.381888,37.79021,37.355775,10.533115,13.599617,84.127483,82.816581


# Normalize


In [25]:
def normalize_phenotypic_scores(df, has_tested=False):
    
    if not has_tested:
        
        genes = pd.read_csv(path_to_genes, sep='\t', index_col='id')
        genes = genes.reset_index().set_index('systematic_name', drop=False)
        
        yp_orfs = pd.read_csv(path_to_consensus_tested, header=None)
        yp_orfs = yp_orfs[1].values
        consensus_tested = [tuple(genes.loc[orf,['id','systematic_name']]) for orf in yp_orfs]
        
        df_index = [tuple(x) for x in df.index]
        
        consensus_tested = list(set(consensus_tested + df_index))
        
        df = df.reindex(index=consensus_tested, fill_value=0)
        
    df_norm = z_transform_mode(data)
    
    return df_norm

In [26]:
data_norm = normalize_phenotypic_scores(data)

In [27]:
# Assign proper column names
lst = [datasets.index.values, ['valuez']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [28]:
data_norm[data.isnull()] = np.nan

In [29]:
data_all = data.join(data_norm)

In [30]:
data_all.head()

Unnamed: 0_level_0,dataset_id,5405,5406,5407,5408,5409,5410,5411,5412,5413,5414,...,5647,5648,5649,5650,5651,5652,5653,5654,5655,5656
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,10.760599,12.093151,28.587365,18.875788,27.630382,31.276813,31.189958,141.775,144.410448,139.309278,...,-0.371945,-0.070331,-0.159067,0.594541,0.064916,-0.17434,0.231727,0.050617,0.577823,0.969929
1863,YAL004W,10.198289,11.566228,27.898675,17.140943,25.662899,29.756041,29.990882,170.021739,184.736111,189.127273,...,-0.297492,0.278901,-0.54003,-0.14847,0.081119,0.108348,0.370839,-0.392437,1.723406,1.843186
4,YAL005C,9.39264,10.667163,24.999431,16.894838,25.123329,28.395172,27.862895,113.470588,126.586466,124.925,...,0.454767,-1.046025,-1.002466,-0.739757,-0.618641,-0.606348,-0.285411,0.395175,-0.225418,-0.463282
5,YAL007C,9.652285,11.673625,28.042097,19.529432,26.086144,30.465722,29.918406,94.535714,92.268966,116.564103,...,-1.373165,0.288476,1.338291,0.410266,0.386588,0.343731,0.194977,0.72445,-1.427867,-1.426772
6,YAL008W,9.183315,11.184889,28.679312,18.826658,26.075371,31.173199,30.523229,108.119658,112.297101,135.786667,...,-0.352997,0.520211,0.679708,0.110458,0.767216,0.469807,-0.370767,0.400604,-0.60403,-0.469982


# Print out

In [31]:
for f in ['value','valuez']:
    df = data_all.xs('value', level='data_type', axis=1).copy()
    df.columns = datasets['name'].values
    df = df.droplevel('gene_id', axis=0)
    df.to_csv(paper_name + '_' + f + '.txt', sep='\t')

# Save to DB

In [32]:
from IO.save_data_to_db3 import *

In [33]:
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 16365294...


  0%|          | 0/252 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 252/252 [28:29<00:00,  6.78s/it]
