In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used below to locate the datasets list file and passed to the DB save step
paper_pmid = 26267134
paper_name = 'costa_texeira_2015'  # prefix of the exported '<paper_name>_<data type>.txt' files

In [3]:
# Load the list of datasets for this paper: a tab-separated file without a
# header row, whose two columns are labeled 'pmid' and 'name' on read.
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['pmid', 'name'],
)

In [4]:
# Index the datasets table by its first column (labeled 'pmid' on load).
# NOTE(review): this index is later looked up with dataset IDs (e.g. 16462),
# so the 'pmid' label looks like a misnomer — confirm.
datasets = datasets.set_index('pmid')

# Load & process the data

In [44]:
# Read the 'Resistance determinants' sheet of the supplementary workbook.
# The first 4 rows are skipped and no row is used as a header, so the
# columns get integer labels.
original_data = pd.read_excel(
    'raw_data/journal.pone.0135110.s002.XLSX',
    sheet_name='Resistance determinants',
    skiprows=4,
    header=None,
)

In [45]:
# Report rows x columns of the sheet as loaded (shape is already a 2-tuple)
print('Original data dimensions: %d x %d' % original_data.shape)

Original data dimensions: 203 x 4


In [46]:
# Preview the first rows to check how the sheet was parsed
original_data.head()

Unnamed: 0,0,1,2,3
0,,YLL043W,FPS1,"Plasma membrane glycerol channel, member of th..."
1,,YGR240C,PFK1,Alpha subunit of heterooctameric phosphofructo...
2,,YFL033C,RIM15,Glucose-repressible protein kinase involved in...
3,,YGR192C,TDH3,"Glyceraldehyde-3-phosphate dehydrogenase, isoz..."
4,,YMR261C,TPS3,Regulatory subunit of trehalose-6-phosphate sy...


In [47]:
# Column 1 holds the systematic ORF names (plus the sheet's category header rows);
# copy it, as strings, into a working column 'orfs'
original_data['orfs'] = original_data[1].astype(str)

In [48]:
# Eliminate all white spaces & capitalize
# (clean_orf is defined outside this notebook, presumably in yp_utils.py)
original_data['orfs'] = clean_orf(original_data['orfs'])

In [49]:
# Translate to ORFs
# (translate_sc is defined outside this notebook; presumably it maps gene
# names to systematic ORF names — confirm)
original_data['orfs'] = translate_sc(original_data['orfs'], to='orf')

In [50]:
# Make sure everything translated ok: print every row whose 'orfs' value fails
# the looks_like_orf check. In the saved output these are the category header
# rows of the sheet, not genes.
t = looks_like_orf(original_data['orfs'])
print(original_data.loc[~t, :])

              0                                                  1    2    3  \
index_input                                                                    
7           NaN                          Oxidative phosphorylation  NaN  NaN   
12          NaN              Mitochondrial biogenesis and function  NaN  NaN   
15          NaN  Peroxisome biogenesis and fatty acid beta-oxid...  NaN  NaN   
19          NaN  Nitrogen metabolism (ammonium, amino acid, pol...  NaN  NaN   
37          NaN                               Phosphate metabolism  NaN  NaN   
40          NaN             Ergosterol and phospholipid metabolism  NaN  NaN   
49          NaN                Cell wall biogenesis and remodeling  NaN  NaN   
58          NaN                                      Ion transport  NaN  NaN   
64          NaN     Stress response (osmotic, oxidative, alkaline)  NaN  NaN   
71          NaN                                         Cell cycle  NaN  NaN   
87          NaN                         

In [51]:
# Keep only the rows whose 'orfs' value passed the looks_like_orf check.
# The explicit .copy() makes the result an independent frame, so the column
# assignment that follows does not write into a slice of the unfiltered table
# (which is what triggers pandas' SettingWithCopyWarning).
original_data = original_data.loc[t, :].copy()

In [52]:
# Binary score: every row still in the table gets the value 1
original_data = original_data.assign(data=1)

In [53]:
# Use the ORF names as the row index and name that index 'orf'
original_data = original_data.set_index('orfs').rename_axis('orf')

In [54]:
# Keep only the 'data' column (still as a one-column frame), dropping the sheet's own columns
original_data = original_data.loc[:, ['data']].copy()

In [55]:
# Collapse ORFs that occur more than once into a single row by averaging their values
original_data = original_data.groupby(level=0).mean()

In [56]:
# Dimensions after collapsing duplicates: (number of distinct ORFs, 1)
original_data.shape

(184, 1)

# Load & process tested strains

In [57]:
# The tested strains are listed in three delivery spreadsheets; one sheet is read from each.
tested1 = pd.read_excel(
    io='raw_data/BY4741-1stDelivery.xls',
    sheet_name='Tabelle1',
)
tested2 = pd.read_excel(
    io='raw_data/BY4741-2nd Delivery.xls',
    sheet_name='chr11_1yes',
)
tested3 = pd.read_excel(
    io='raw_data/BY4741-3rd Delivery.xls',
    sheet_name='Tabelle1',
)


In [58]:
# Stack the ORF columns of the three deliveries into one single-column frame.
# The column is labeled 0 explicitly. Without name=0 the label is 0 only because
# the concatenated Series carry different names ('ORF' vs 'orf'), which makes
# concat drop the name; if the names ever matched, the label would change and
# every later tested[0] lookup would break.
tested = pd.concat(
    (tested1['ORF'], tested2['ORF'], tested3['orf']),
    axis=0,
).to_frame(name=0)

In [59]:
# Clean the tested ORF strings with the same clean_orf helper that was applied to the hits
tested[0] = clean_orf(tested[0])

In [60]:
# Remove repeated ORFs and renumber the remaining rows from 0
tested = tested.drop_duplicates(ignore_index=True)

In [61]:
# Preview the combined list of tested ORFs
tested.head()

Unnamed: 0,0
0,YAL068C
1,YAL067C
2,YAL066W
3,YAL065C
4,YAL062W


In [62]:
# Translate to ORFs, with the same translate_sc call that was applied to the hits
tested[0] = translate_sc(tested[0], to='orf')

In [63]:
# Make sure everything translated ok
t = looks_like_orf(tested[0])
print(tested.loc[~t,])

Empty DataFrame
Columns: [0]
Index: []


In [64]:
# Sorted array of the distinct tested ORFs
tested_orfs = np.unique(tested[0].to_numpy())

In [65]:
# Hits that do not appear among the tested strains. The reindex onto
# tested_orfs that follows keeps only tested ORFs, so any hit listed here
# would be dropped silently; the list is displayed for inspection.
# Index.isin uses hashed lookups instead of scanning the whole tested_orfs
# array once per hit; the result is the same list in the same order.
missing = original_data.index[~original_data.index.isin(tested_orfs)].tolist()
missing

[]

In [66]:
# Put the hits on the full list of tested ORFs: tested ORFs that are not hits get 0,
# and rows whose ORF is not in tested_orfs are dropped
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [67]:
# Work on a copy so original_data stays as it is
data = original_data.copy()

In [68]:
# ID of the single dataset produced from this paper
# (presumably the dataset's ID in the YeastPhenome database — confirm)
dataset_ids = [16462]
# Restrict the datasets table to these IDs, in this order
datasets = datasets.reindex(index=dataset_ids)

In [69]:
# Label each data column with a (dataset_id, 'value') pair, one per dataset
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [70]:
# Preview the table with its two-level (dataset_id, data_type) column labels
data.head()

dataset_id,16462
data_type,value
orf,Unnamed: 1_level_2
YAL002W,0
YAL004W,0
YAL005C,0
YAL007C,0
YAL008W,0


## Subset to the genes currently in SGD

In [71]:
# Look up the gene 'id' of every ORF in the genes table (tab-separated file at
# path_to_genes), keyed by systematic name. ORFs that are not in the table get NaN.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes['id'].reindex(index=data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [72]:
# Attach the gene IDs and drop the ORFs that have none.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the column assignment on the
# next line does not write into a slice (pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone, the IDs can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Rows are keyed by (gene_id, orf) from here on
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16462
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,0
1863,YAL004W,0
4,YAL005C,0
5,YAL007C,0
6,YAL008W,0


# Normalize

In [73]:
# Normalize the scores with the project helper normalize_phenotypic_scores
# (defined outside this notebook).
# NOTE(review): has_tested=True presumably signals that tested-but-unaffected
# strains are included (the 0 rows) — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [74]:
# Assign proper column names: one (dataset_id, 'valuez') pair per dataset
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [75]:
# Blank out the normalized score wherever the raw value is missing.
# A boolean DataFrame used as a setitem key is aligned on row AND column labels.
# data is labeled (dataset_id, 'value') while data_norm is labeled
# (dataset_id, 'valuez'), so the raw mask would match no column of data_norm
# and nothing would be masked. Relabel the mask with data_norm's columns first
# (both frames have one column per dataset, labeled positionally from `datasets`).
null_mask = data.isnull()
null_mask.columns = data_norm.columns
data_norm[null_mask] = np.nan

# Raw and normalized values side by side, on the shared (gene_id, orf) index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16462,16462
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0,0.0
1863,YAL004W,0,0.0
4,YAL005C,0,0.0
5,YAL007C,0,0.0
6,YAL008W,0,0.0


# Print out

In [76]:
# Write one tab-separated file per data type: columns are the dataset names,
# rows are keyed by ORF only (the gene_id index level is dropped).
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    out_path = paper_name + '_' + data_type + '.txt'
    table.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [77]:
from IO.save_data_to_db3 import *

In [78]:
# Store the combined value/valuez table in the database under this paper's PMID.
# Per the logged output, existing datasets for the PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 26267134...
Inserting the new data...


100%|██████████| 1/1 [00:08<00:00,  9.00s/it]

Updating the data_modified_on field...



