In [1]:
# Execute the shared helper script in this notebook's namespace.
# NOTE(review): later cells use pd, np, clean_genename, translate_sc, etc. without
# importing them -- presumably they are all provided by this script; confirm.
%run ../yp_utils.py

# Initial setup

In [2]:
# Numeric paper identifier: used to build the datasets-list file name and passed to save_data_to_db
paper_pmid = 20973990
# Prefix of the tab-separated output files written at the end of the notebook
paper_name = 'mira_sa_correia_2010' 

In [3]:
# Read the tab-separated, header-less list of datasets registered for this paper
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by its dataset_id column
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [44]:
# Read Sheet1 of the raw workbook, skipping its first 4 rows
raw_data_path = 'raw_data/1475-2859-9-79-s1.xlsx'
original_data = pd.read_excel(raw_data_path, sheet_name='Sheet1', skiprows=4)

In [45]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 692 x 4


In [46]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,Gene,Function1,Susceptibility to acetic acid2,Unnamed: 3
0,,,,
1,Lipid metabolism,,,
2,,,,
3,CHO2,Phosphatidylethanolamine methyltransferase (PE...,+,
4,CRD1,"Cardiolipin synthase; produces cardiolipin, wh...",++,


In [47]:
# Add a string-typed working copy of the gene names (missing values become the text 'nan')
original_data = original_data.assign(gene=original_data['Gene'].astype(str))

In [48]:
# Normalize the gene names with the shared clean_genename helper
# (eliminates white space and capitalizes, per the original author's note)
gene_names_clean = clean_genename(original_data['gene'])
original_data['gene'] = gene_names_clean

In [49]:
# Map the cleaned gene names to ORF identifiers
orf_names = translate_sc(original_data['gene'], to='orf')
original_data['orf'] = orf_names

In [50]:
# Show every row whose translated identifier does not look like an ORF
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t]
print(not_orf_rows)

                                                          Gene Function1  \
index_input                                                                
0                                                          NaN       NaN   
1                                             Lipid metabolism       NaN   
2                                                          NaN       NaN   
20                                                         NaN       NaN   
21                                          Response to stress       NaN   
29                                                         NaN       NaN   
30                                      Mitochondrial function       NaN   
31                                                         NaN       NaN   
71                                                         NaN       NaN   
72           Chromatin remodelling, nucleic acid metabolism...       NaN   
171                                                        NaN       NaN   
172         

In [51]:
# Keep only the rows flagged by t (identifier looks like an ORF).
# The explicit copy makes the result an independent frame, so adding columns to it
# afterwards is a plain assignment and does not trigger SettingWithCopyWarning.
original_data = original_data.loc[t, :].copy()

In [52]:
# Encode susceptibility as a negative score, one point per '+' sign
# ('+' -> -1, '++' -> -2, ...). Counting '+' characters rather than taking the string
# length keeps stray whitespace or other characters in a cell from inflating the score.
# Non-string cells (e.g. NaN) are scored 0.
original_data['data'] = original_data['Susceptibility to acetic acid2'].apply(
    lambda x: -x.count('+') if isinstance(x, str) else 0
)

In [53]:
# Use the ORF identifier as the row index
original_data = original_data.set_index('orf')

In [54]:
# Keep only the numeric score column, as an independent single-column frame
original_data = original_data.loc[:, ['data']].copy()

In [55]:
# Collapse rows that share an ORF (the row index) by averaging their scores
original_data = original_data.groupby(level=0).mean()

In [56]:
# Number of unique ORFs with data (rows) and number of value columns
original_data.shape

(625, 1)

# Load & process tested strains

In [57]:
# Read the list of tested strains from sheet 'Tabelle2'
tested_path = 'raw_data/List of strains tested.xlsx'
tested = pd.read_excel(tested_path, sheet_name='Tabelle2')

In [58]:
# Preview the first rows of the tested-strains table
tested.head()

Unnamed: 0,ORF,slow growth?,Unnamed: 2
0,YAL068C,,
1,YAL067C,,
2,YAL066W,,Note: when a strain had slow growth in control...
3,YAL065C,,
4,YAL062W,,


In [59]:
# Add a string-typed working copy of the ORF column
tested = tested.assign(orf=tested['ORF'].astype(str))

In [60]:
# Normalize the ORF identifiers with the shared clean_orf helper
tested_orf_clean = clean_orf(tested['orf'])
tested['orf'] = tested_orf_clean

In [61]:
# Run the identifiers through translate_sc with ORFs as the target naming
tested_orf_translated = translate_sc(tested['orf'], to='orf')
tested['orf'] = tested_orf_translated

In [62]:
# Show every tested strain whose identifier does not look like an ORF
t = looks_like_orf(tested['orf'])
not_orf_rows = tested.loc[~t]
print(not_orf_rows)

Empty DataFrame
Columns: [ORF, slow growth?, Unnamed: 2, orf]
Index: []


In [63]:
# Distinct ORFs among the tested strains, in order of first appearance
tested_orfs = pd.unique(tested['orf'])

In [64]:
# ORFs that have data but are absent from the tested-strains list.
# A set gives constant-time membership checks instead of scanning the whole
# tested_orfs array once per ORF; the order of the result is unchanged.
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_orf_set]
missing

['YBR011C',
 'YBR132C',
 'YBR253W',
 'YDR168W',
 'YDR271C',
 'YER044C',
 'YER087W',
 'YMR038C',
 'YMR213W']

In [65]:
# Treat ORFs that have data but were not listed as tested strains as tested too
tested_orfs = [*tested_orfs, *missing]

In [66]:
# Expand the table to every tested ORF; ORFs without a reported score get 0
original_data = original_data.reindex(tested_orfs, fill_value=0)

# Prepare the final dataset

In [67]:
# Work on an independent (deep) copy from here on
data = original_data.copy(deep=True)

In [68]:
# Dataset ids matching the columns of `data`, in column order
dataset_ids = [101]
# Restrict (and order) the datasets table to exactly those ids
datasets = datasets.reindex(dataset_ids)

In [69]:
# Label each column with a (dataset_id, 'value') pair
value_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_labels, names=['dataset_id','data_type'])

In [70]:
# Preview the table with its two-level (dataset_id, data_type) column labels
data.head()

dataset_id,101
data_type,value
orf,Unnamed: 1_level_2
YAL068C,0.0
YAL067C,0.0
YAL066W,0.0
YAL065C,0.0
YAL062W,0.0


## Subset to the genes currently in SGD

In [71]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in `data`; ORFs not found in the table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 20


In [72]:
# Attach the gene ids and drop the ORFs that have none.
# The explicit copy makes the filtered frame independent, so the dtype conversion
# on the next line is a plain column assignment rather than a write to a slice
# (which raises SettingWithCopyWarning).
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by the (gene_id, orf) pair
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,101
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1869,YAL068C,0.0
61,YAL067C,0.0
60,YAL066W,0.0
1727,YAL065C,0.0
57,YAL062W,0.0


# Normalize

In [73]:
# Normalize the scores with the shared helper.
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook;
# has_tested=True presumably signals that `data` covers all tested strains -- confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [74]:
# Assign proper column names: each normalized column becomes (dataset_id, 'valuez')
valuez_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_labels, names=['dataset_id','data_type'])

In [75]:
# Blank out normalized values wherever the raw value is missing.
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'), and a
# boolean DataFrame mask is aligned on labels before it is applied. Using
# data.isnull() directly would therefore match no column of data_norm and the
# assignment would silently do nothing. Relabel the mask with data_norm's columns
# (matched by position) so that it lines up; rows are still matched by index label.
# NOTE(review): assumes data and data_norm have the same number of columns -- confirm.
null_mask = pd.DataFrame(data.isnull().values, index=data.index, columns=data_norm.columns)
data_norm[null_mask] = np.nan
# Put the raw and normalized columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,101,101
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1869,YAL068C,0.0,0.0
61,YAL067C,0.0,0.0
60,YAL066W,0.0,0.0
1727,YAL065C,0.0,0.0
57,YAL062W,0.0,0.0


# Print out

In [76]:
# Write one tab-separated file per data type (raw and normalized values)
for data_type in ('value', 'valuez'):
    # Select the columns of this data type and label them with the dataset names
    table = data_all.xs(data_type, axis=1, level='data_type').copy()
    table.columns = datasets['name'].values
    # Only the ORF stays as the row label in the output file
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [77]:
# Import only the function that is used, instead of a wildcard import, so other
# names from the module cannot silently overwrite variables in the notebook namespace.
from IO.save_data_to_db3 import save_data_to_db

In [78]:
# Write the combined table to the database under this paper's PMID.
# Per the log printed below, this deletes all existing datasets for the PMID,
# inserts the new data and updates the data_modified_on field.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 20973990...
Inserting the new data...


100%|██████████| 1/1 [00:08<00:00,  8.21s/it]

Updating the data_modified_on field...



