In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used below to build the datasets-list file
# name and passed to save_data_to_db.
paper_pmid = 19633105
# Prefix of the tab-separated output files written in the "Print out" section.
paper_name = 'teixeira_sa_correia_2009'

In [3]:
# Dataset IDs and names for this paper, read from a headerless tab-separated file.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the datasets table by dataset ID (rebinding instead of mutating in place).
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [11]:
# Read the supplementary table, skipping the first three rows of the sheet.
original_data = pd.read_excel(
    'raw_data/TableS1_suplementary_material.xlsx',
    sheet_name='Sheet1',
    skiprows=3,
)

In [12]:
# Report the size of the table as loaded, before any filtering.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 282 x 2


In [13]:
original_data.head()

Unnamed: 0,Gene/ORF name,Descripion
0,VACUOLE,
1,V-ATPase organization and biogenesis,
2,CUP5,Proteolipid subunit of the vacuolar H(+)-ATPas...
3,PPA1,"Subunit c'' of the vacuolar ATPase, which func..."
4,RAV1,"Subunit of the RAVE complex (Rav1p, Rav2p, Skp..."


In [14]:
original_data['gene'] = original_data['Gene/ORF name'].astype(str)

In [15]:
# Eliminate all white spaces & capitalize
original_data['gene'] = clean_genename(original_data['gene'])

In [16]:
# Translate to ORFs 
original_data['orf'] = translate_sc(original_data['gene'], to='orf')

In [17]:
# Make sure everything translated ok: show the rows whose 'orf' value does not
# look like an ORF (the output below lists the table's section headings).
t = looks_like_orf(original_data['orf'])
not_translated = original_data.loc[~t]
print(not_translated)

                                                 Gene/ORF name Descripion  \
index_input                                                                 
0                                                      VACUOLE        NaN   
1                         V-ATPase organization and biogenesis        NaN   
16                                                  PEROXISOME        NaN   
17                      Peroxisome organization and biogenesis        NaN   
29                                                MITOCHONDRIA        NaN   
30                                           Protein Synthesis        NaN   
44                                   Oxidative phosphorylation        NaN   
49                                                 F0F1 ATPase        NaN   
54                                           Protein Transport        NaN   
58                                  DNA maintenance and repair        NaN   
63                                                      Others        NaN   

In [18]:
# Keep only the rows that translated to an ORF. The explicit copy makes the
# result an independent frame, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
original_data = original_data.loc[t,:].copy()

In [19]:
# Every remaining row is an entry of the published hit list and gets the same
# score of -1 (tested ORFs absent from the list are filled with 0 further down).
original_data['data'] = -1

In [20]:
# Index the hit list by ORF (rebinding instead of mutating in place).
original_data = original_data.set_index('orf')

In [21]:
original_data = original_data[['data']].copy()

In [22]:
# Collapse rows that share the same ORF into a single row by averaging them.
original_data = original_data.groupby(level=0).mean()

In [23]:
original_data.shape

(253, 1)

# Load & process tested strains

In [24]:
# List of strains that were tested in the screen.
tested = pd.read_excel(
    'raw_data/List of strains tested.xlsx',
    sheet_name='Tabelle2',
)

In [25]:
tested.head()

Unnamed: 0,ORF,slow growth?,Unnamed: 2
0,YAL068C,,
1,YAL067C,,
2,YAL066W,,Note: when a strain had slow growth in control...
3,YAL065C,,
4,YAL062W,,


In [26]:
tested['orf'] = tested['ORF'].astype(str)

In [27]:
tested['orf'] = clean_orf(tested['orf'])

In [28]:
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [29]:
# Make sure everything translated ok: show the tested rows whose 'orf' value
# does not look like an ORF (an empty frame means all of them passed).
t = looks_like_orf(tested['orf'])
not_translated = tested.loc[~t]
print(not_translated)

Empty DataFrame
Columns: [ORF, slow growth?, Unnamed: 2, orf]
Index: []


In [30]:
tested_orfs = tested['orf'].unique()

In [31]:
# Hits that do not appear in the list of tested strains, in hit-list order.
# Membership is checked against a set: `in` on the NumPy array returned by
# .unique() rescans the whole array for every hit.
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_orf_set]
missing

['YBR011C',
 'YBR123C',
 'YBR132C',
 'YBR152W',
 'YBR193C',
 'YDL043C',
 'YFR029W',
 'YGR147C',
 'YHR065C',
 'YIL026C',
 'YIR011C',
 'YJL026W',
 'YKL059C',
 'YMR038C',
 'YMR203W',
 'YNL006W',
 'YNL007C',
 'YNL039W',
 'YNL232W',
 'YNL245C',
 'YNR026C',
 'YNR035C',
 'YPL117C',
 'YPR033C',
 'YPR104C']

In [32]:
len(missing)

25

In [33]:
# 25 ORFs eliminated from the hit list (the list contains hits from both the hap and the het collection, but the list of tested strains is only available for the hap collection)

In [34]:
# Align the hit list to the tested ORFs: hits keep their score, tested ORFs
# that are not hits are filled with 0, and hits absent from tested_orfs are
# dropped.
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [35]:
data = original_data.copy()

In [36]:
# Dataset ID(s) that the column(s) of `data` correspond to, in column order.
dataset_ids = [155]
# Restrict (and order) the datasets table to exactly those IDs.
datasets = datasets.reindex(index=dataset_ids)

In [37]:
# Label each data column with a (dataset_id, data_type) pair; these columns
# carry the 'value' data type.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [38]:
data.head()

dataset_id,155
data_type,value
orf,Unnamed: 1_level_2
YAL068C,0
YAL067C,0
YAL066W,0
YAL065C,0
YAL062W,0


## Subset to the genes currently in SGD

In [39]:
# Look up the gene ID of every ORF in the data via its systematic name.
# ORFs that are not in the genes table come back as NaN and are counted.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 20


In [40]:
# Attach the gene IDs and drop the ORFs that have none.
data['gene_id'] = gene_ids
# Copy the filtered rows so that the column assignment below writes to an
# independent frame rather than to a slice (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone, the IDs can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,155
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1869,YAL068C,0
61,YAL067C,0
60,YAL066W,0
1727,YAL065C,0
57,YAL062W,0


# Normalize

In [41]:
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [42]:
# Assign proper column names: the output of normalize_phenotypic_scores is
# labelled with the 'valuez' data type.
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [43]:
# Intended to blank out the normalized score wherever the raw score is missing.
# NOTE(review): the mask's columns are labelled 'value' while data_norm's are
# labelled 'valuez'. If pandas aligns a boolean-frame mask on column labels,
# this assignment masks nothing -- confirm (a positional mask may be needed).
data_norm[data.isnull()] = np.nan
# Put the 'value' and 'valuez' columns side by side, matched on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,155,155
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1869,YAL068C,0,0.0
61,YAL067C,0,0.0
60,YAL066W,0,0.0
1727,YAL065C,0,0.0
57,YAL062W,0,0.0


# Print out

In [44]:
# Write one tab-separated file per data type, with dataset names as column
# headers and ORFs as the only row index.
for data_type in ('value', 'valuez'):
    scores = data_all.xs(data_type, level='data_type', axis=1).copy()
    scores.columns = datasets['name'].values
    out_path = paper_name + '_' + data_type + '.txt'
    scores.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [45]:
from IO.save_data_to_db3 import *

In [46]:
# Replaces what is stored for this PMID: the log below shows the existing
# datasets for the PMID being deleted before the new data are inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 19633105...
Inserting the new data...


100%|██████████| 1/1 [00:10<00:00, 10.00s/it]

Updating the data_modified_on field...



