In [2]:
import numpy as np
import pandas as pd

import sys

from os.path import expanduser
# Add ~/Lab/Utils/Python/ to the module search path. The project-local imports that follow
# (Conversions, Strings, and later IO) are presumably resolved from this directory.
sys.path.append(expanduser('~') + '/Lab/Utils/Python/')

from Conversions.translate import *
from Strings.is_a import *

# Initial setup

In [34]:
# Identifiers of the paper handled by this notebook:
#   paper_name -> stem of the tab-separated output file written at the end
#   paper_pmid -> used to locate the dataset list file and passed to the DB save step
paper_name = 'sinha_steinmetz_2008'
paper_pmid = 18780730

In [35]:
# Header-less, tab-separated list of the datasets belonging to this paper.
# NOTE(review): the first column is labelled 'pmid' here, but it is later matched
# against dataset ids — confirm the label.
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['pmid', 'name'],
)

In [36]:
# Use the first column as the row index so that datasets can be selected by id.
datasets = datasets.set_index('pmid')

# Load & process the data

In [3]:
# Raw measurements, read from a tab-separated text file.
original_data = pd.read_csv(
    'raw_data/deletion_pool_data.txt',
    sep='\t',
)

In [4]:
# .shape is already a (rows, columns) tuple, so it fills both %d slots directly.
print('Original data dimensions: %d x %d' % original_data.shape)

Original data dimensions: 12489 x 23


In [6]:
# Display the column labels of the raw table.
original_data.columns

Index(['orf::batch:tagtype', 'Gene', 'T0', '30C_T1', '30C_T2', '30C_T3',
       '30C_T4', '30C_T5', '37C_T1', '37C_T2', '37C_T3', '37C_T4', '37C_T5',
       '30C_RAPA_T1', '30C_RAPA_T2', '30C_RAPA_T3', 'Description',
       'feature_qualifier', 'GO_process', 'GO_function', 'GO_component',
       'essential_gene', 'zygosity'],
      dtype='object')

In [7]:
# Preview the first rows of the raw table.
original_data.head()

Unnamed: 0,orf::batch:tagtype,Gene,T0,30C_T1,30C_T2,30C_T3,30C_T4,30C_T5,37C_T1,37C_T2,...,30C_RAPA_T1,30C_RAPA_T2,30C_RAPA_T3,Description,feature_qualifier,GO_process,GO_function,GO_component,essential_gene,zygosity
0,YAL001C::chr1_1:uptag,TFC3,1192.6,1539.8,1190.4,1085.8,937.8,822.0,1214.4,1229.2,...,1284.0,1589.8,770.6,Largest of six subunits of the RNA polymerase ...,Verified,transcription initiation from RNA polymerase I...,RNA polymerase III transcription factor activity,mitochondrion*,yes,het
1,YAL002W::chr1_1:uptag,VPS8,1791.0,2296.4,1861.4,1181.6,1616.2,908.4,1305.2,1121.6,...,1228.2,1436.4,45.4,Membrane-associated hydrophilic protein that i...,Verified,late endosome to vacuole transport,molecular function unknown,membrane fraction,no,hom
2,YAL003W::chr1_1:uptag,EFB1,1521.4,2024.0,1517.4,1541.2,1606.0,1387.6,1469.6,1626.2,...,1473.0,1495.8,920.8,Translation elongation factor 1 beta; stimulat...,Verified,translational elongation,translation elongation factor activity,ribosome*,yes,het
3,YAL004W::chr1_1:uptag,YAL004W,1467.2,2204.6,1925.0,1705.0,2196.6,1450.4,1635.8,1936.6,...,2182.4,2748.0,662.4,,Dubious,,,,no,hom
4,YAL005C::chr1_1:uptag,SSA1,2280.2,3030.6,2477.4,2351.2,2602.0,2255.4,2484.0,2270.0,...,2334.0,3145.2,1515.4,ATPase involved in protein folding and nuclear...,Verified,translation*,ATPase activity*,cytoplasm*,no,hom


In [10]:
# The identifier is the text before the first ':' of the 'orf::batch:tagtype' field
# (e.g. 'YAL001C::chr1_1:uptag' -> 'YAL001C').
original_data['orfs'] = original_data['orf::batch:tagtype'].map(lambda tag: tag.partition(':')[0])

In [12]:
# Cast the identifiers to str (presumably so the string helpers applied next always receive text).
original_data['orfs'] = original_data['orfs'].astype(str)

In [13]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf arrives via one of the wildcard imports; the behaviour stated above
# is the original author's description and is not verifiable from this notebook.
original_data['orfs'] = clean_orf(original_data['orfs'])

In [14]:
# Translate to ORFs
# NOTE(review): translate_sc arrives via one of the wildcard imports; presumably it maps
# gene names/aliases to systematic ORF names when to='orf' — confirm against the helper.
original_data['orfs'] = translate_sc(original_data['orfs'], to='orf')

In [15]:
# Sanity check on the translation: show every row whose identifier is rejected by
# looks_like_orf. An empty frame means nothing was rejected.
t = looks_like_orf(original_data['orfs'])
print(original_data[~t])

Empty DataFrame
Columns: [orf::batch:tagtype, Gene, T0, 30C_T1, 30C_T2, 30C_T3, 30C_T4, 30C_T5, 37C_T1, 37C_T2, 37C_T3, 37C_T4, 37C_T5, 30C_RAPA_T1, 30C_RAPA_T2, 30C_RAPA_T3, Description, feature_qualifier, GO_process, GO_function, GO_component, essential_gene, zygosity, orfs]
Index: []

[0 rows x 24 columns]


In [16]:
# 37C phenotype: the T5 reading at 37C relative to the T5 reading at 30C,
# with each reading first divided by the T0 reading.
original_data['37C'] = (
    original_data['37C_T5'].div(original_data['T0'])
    .div(original_data['30C_T5'].div(original_data['T0']))
)

In [22]:
# Inspect the rows with the largest 37C ratio, alongside the raw 30C / 37C time courses.
(
    original_data
    .sort_values(by='37C', ascending=False)
    .loc[:, ['orfs', 'T0',
             '30C_T1', '30C_T2', '30C_T3', '30C_T4', '30C_T5',
             '37C_T1', '37C_T2', '37C_T3', '37C_T4', '37C_T5']]
    .head()
)

Unnamed: 0_level_0,orfs,T0,30C_T1,30C_T2,30C_T3,30C_T4,30C_T5,37C_T1,37C_T2,37C_T3,37C_T4,37C_T5
index_input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
9977,YNL226W,1875.4,1432.4,313.6,41.0,40.0,52.4,1620.6,1793.0,1867.8,1615.4,1710.4
11935,YPL205C,2316.2,2031.0,1082.5,407.6,114.4,52.0,2537.6,2129.6,2385.8,1439.6,1357.4
9264,YMR193C-A,2642.25,2391.8,914.4,341.6,271.2,61.4,2075.4,2055.6,1808.2,1684.4,1554.8
197,YBL039C,1082.4,606.8,289.4,79.6,211.8,49.2,895.4,1087.0,1005.6,1451.2,1093.0
2918,YDR463W,1007.2,581.2,169.0,41.0,63.0,54.8,1410.0,1701.0,1867.4,1408.2,1057.8


In [23]:
# Rapamycin phenotypes: each RAPA time point divided by the T0 reading.
for ratio_col, raw_col in [('rapa_12h', '30C_RAPA_T1'),
                           ('rapa_24h', '30C_RAPA_T2'),
                           ('rapa_36h', '30C_RAPA_T3')]:
    original_data[ratio_col] = original_data[raw_col] / original_data['T0']

In [24]:
# Index the table by ORF so that later grouping and joining operate on ORF names.
original_data = original_data.set_index('orfs')

In [25]:
# Separate homozygous ('hom') and heterozygous ('het') mutants into independent copies.
# Rows with any other zygosity value end up in neither frame.
original_data_hom = original_data[original_data['zygosity'].eq('hom')].copy()
original_data_het = original_data[original_data['zygosity'].eq('het')].copy()

In [28]:
# Average all rows that share the same ORF index label.
# numeric_only=True is required: the frame still carries text columns (Gene, Description,
# GO_*, zygosity, ...). Older pandas silently dropped them from the mean, but
# pandas >= 2.0 raises TypeError unless they are excluded explicitly.
original_data_hom = original_data_hom.groupby(original_data_hom.index).mean(numeric_only=True)
original_data_hom.shape

(4956, 18)

In [29]:
# Average all rows that share the same ORF index label.
# numeric_only=True is required: the frame still carries text columns (Gene, Description,
# GO_*, zygosity, ...). Older pandas silently dropped them from the mean, but
# pandas >= 2.0 raises TypeError unless they are excluded explicitly.
original_data_het = original_data_het.groupby(original_data_het.index).mean(numeric_only=True)
original_data_het.shape

(1142, 18)

In [42]:
# Pull them back together: one row per ORF seen in either frame (outer join), with the
# four phenotype columns of each zygosity distinguished by a '_hom' / '_het' suffix.
phenotype_cols = ['37C', 'rapa_12h', 'rapa_24h', 'rapa_36h']
data = original_data_hom[phenotype_cols].join(
    original_data_het[phenotype_cols],
    how='outer',
    lsuffix='_hom',
    rsuffix='_het',
)

# Prepare the final dataset

In [43]:
# Dataset ids, listed in the same order as the columns of `data`
# (names are assigned to the columns by position further down).
dataset_ids = [
    16511, 16639, 16640, 16641,
    16638, 16642, 16643, 16644,
]

In [44]:
# Keep only the listed datasets, in the listed order.
datasets = datasets.reindex(dataset_ids)

In [45]:
# Rename the columns to the dataset names. The assignment is positional, so it relies on
# `datasets` being ordered exactly like the columns of `data`.
data.columns = datasets['name'].values

In [46]:
# Collapse any rows that share an index label by averaging them.
data = data.groupby(level=0).mean()

In [47]:
# Create row index: label the index 'orf'
data.index = data.index.rename('orf')

In [48]:
# .shape is already a (rows, columns) tuple, so it fills both %d slots directly.
print('Final data dimensions: %d x %d' % data.shape)

Final data dimensions: 6074 x 8


# Print out

In [51]:
# Write the final table as a tab-separated file named after the paper.
data.to_csv(path_or_buf=paper_name + '.txt', sep='\t')

# Save to DB

In [50]:
from IO.save_data_to_db2 import *

In [52]:
# Create column index
# Pair each dataset id (the index of `datasets`) with its name and use the pairs as a
# two-level column index ('dataset_id', 'dataset_name'). The assignment is positional, so
# `datasets` must still be ordered like the columns of `data`.
lst = [datasets.index.values, datasets['name'].values]
tuples = list(zip(*lst))  # [(id, name), ...]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','dataset_name'])
data.columns = idx

In [53]:
# Hand the final table and the paper's PMID to the project's DB helper.
# NOTE(review): save_data_to_db comes from a wildcard import (IO.save_data_to_db2); its side
# effects are not visible here — confirm before re-running.
save_data_to_db(data, paper_pmid)