In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used to build the datasets-list file name and passed to save_data_to_db
paper_pmid = 28428821
# Short paper identifier; used as the prefix of the exported value/valuez text files
paper_name = 'henriques_sa_correia_2017' 

In [3]:
# Read the tab-separated, header-less list of datasets (id and name) for this paper
datasets_file = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_file,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the datasets table by dataset id so that datasets can be selected by id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data (part 1)

In [40]:
# Supplementary Table S1; the first two rows of the sheet are skipped
table_s1_file = 'raw_data/13068_2017_781_MOESM1_ESM.xlsx'
original_data1 = pd.read_excel(table_s1_file, sheet_name='Table S1', skiprows=2)
original_data1.head()

Unnamed: 0,Carbohydrate and energy metabolism,Unnamed: 1,Unnamed: 2
0,HXK2,Hexokinase isoenzyme 2 that catalyzes phosphor...,++
1,VID24,GID Complex regulatory subunit that binds the ...,+
2,TPS2,Phosphatase subunit of the trehalose-6-phospha...,++
3,ZWF1,"Glucose-6-phosphate dehydrogenase (G6PD), cata...",+
4,GPD2,NAD-dependent glycerol 3-phosphate dehydrogena...,+


In [41]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data1.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 182 x 3


In [42]:
# Gene names sit in the first column of the sheet; store them as strings
first_column = original_data1.iloc[:, 0]
original_data1['genes'] = first_column.astype(str)

In [43]:
# Eliminate all white spaces & capitalize, so that the names are in a uniform
# format before being translated to ORFs
# (clean_genename is not defined in this notebook — presumably provided by yp_utils.py; confirm)
original_data1['genes'] = clean_genename(original_data1['genes'])

In [44]:
# Translate to ORFs 
# The result is stored in a separate 'orfs' column and validated in the next cell
original_data1['orfs'] = translate_sc(original_data1['genes'], to='orf')

In [45]:
# Make sure everything translated ok
# t flags the entries of 'orfs' accepted by looks_like_orf; the rejected rows are printed for manual inspection.
# In the saved output they are the functional-category header rows of the table, not genes.
t = looks_like_orf(original_data1['orfs'])
print(original_data1.loc[~t,])

                            Carbohydrate and energy metabolism Unnamed: 1  \
index_input                                                                 
11                          Amino acids and vitamin metabolism        NaN   
24                                            Lipid metabolism        NaN   
32                     Cell wall and cytoskeleton organization        NaN   
47                                     Internal pH homeostasis        NaN   
51                                             Ion homeostasis        NaN   
59                                          Response to stress        NaN   
73           Chromatin remodelling, nucleic acid metabolism...        NaN   
102                                          Protein synthesis        NaN   
119               Intracellular traficking and protein sorting        NaN   
154                                           Unknown function        NaN   

            Unnamed: 2                                              genes  

In [46]:
# Keep only the rows whose 'orfs' entry passed the check above (drops the
# category header rows). An explicit copy is taken so that the columns added
# afterwards are written to an independent DataFrame rather than to a slice of
# the original, which is what triggers pandas' SettingWithCopyWarning.
original_data1 = original_data1.loc[t,:].copy()

In [47]:
# Score = minus the length of the symbol string in the third column ('+' -> -1, '++' -> -2)
# NOTE(review): presumably more '+' means a stronger phenotype — confirm against the paper
symbol_lengths = original_data1.iloc[:, 2].apply(len)
original_data1['data'] = -symbol_lengths

In [48]:
# Use the translated ORFs as the row index, and call that index 'orf'
original_data1 = original_data1.set_index('orfs')
original_data1.index.name = 'orf'

In [49]:
# Drop everything except the numeric score column
original_data1 = original_data1.loc[:, ['data']].copy()

In [50]:
# Collapse repeated ORFs into a single row by averaging their scores
original_data1 = original_data1.groupby(level=0).mean()

In [51]:
# Sanity check: one row per unique ORF and a single 'data' column
original_data1.shape

(172, 1)

# Load & process the data (part 2)

In [52]:
# Supplementary Table S2; the first two rows of the sheet are skipped
table_s2_file = 'raw_data/13068_2017_781_MOESM2_ESM.xlsx'
original_data2 = pd.read_excel(table_s2_file, sheet_name='Table S2', skiprows=2)
original_data2.head()

Unnamed: 0,Carbohydrate and energy metabolism,Unnamed: 1
0,PTC6,Mitochondrial type 2C protein phosphatase (PP2...
1,CYC3,Cytochrome c heme lyase (holocytochrome c synt...
2,YMC2,"Mitochondrial protein, putative inner membrane..."
3,Amino acids metabolism,
4,HIS7,Imidazole glycerol phosphate synthase (glutami...


In [54]:
# Gene names sit in the first column of the sheet; store them as strings
first_column = original_data2.iloc[:, 0]
original_data2['genes'] = first_column.astype(str)

In [55]:
# Eliminate all white spaces & capitalize
# (e.g. the header row 'Amino acids metabolism' becomes 'AMINOACIDSMETABOLISM' in the saved output)
original_data2['genes'] = clean_genename(original_data2['genes'])

In [56]:
# Translate to ORFs 
# The result is stored in a separate 'orfs' column and validated in the next cell
original_data2['orfs'] = translate_sc(original_data2['genes'], to='orf')

In [57]:
## Make sure everything translated ok
# t flags the entries of 'orfs' accepted by looks_like_orf; the rejected rows are printed for manual inspection.
# In the saved output they are the functional-category header rows of the table, not genes.
t = looks_like_orf(original_data2['orfs'])
print(original_data2.loc[~t,])

                            Carbohydrate and energy metabolism Unnamed: 1  \
index_input                                                                 
3                                       Amino acids metabolism        NaN   
5                                              Ion homeostasis        NaN   
9                                           Response to stress        NaN   
11           Chromatin remodelling, nucleic acid metabolism...        NaN   
25                                           Protein synthesis        NaN   
29                Intracellular traficking and protein sorting        NaN   
37                                            Unknown function        NaN   

                                                         genes  \
index_input                                                      
3                                         AMINOACIDSMETABOLISM   
5                                               IONHOMEOSTASIS   
9                                         

In [58]:
# Keep only the rows whose 'orfs' entry passed the check above (drops the
# category header rows). An explicit copy is taken so that the 'data' column
# added afterwards is written to an independent DataFrame rather than to a
# slice of the original, which is what triggers pandas' SettingWithCopyWarning.
original_data2 = original_data2.loc[t,:].copy()

In [59]:
# This table has no score column, so every listed gene is assigned the same value (1)
# NOTE(review): presumably the opposite phenotype direction to Table S1, which is scored negatively — confirm against the paper
original_data2['data'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [60]:
# Use the translated ORFs as the row index, and call that index 'orf'
original_data2 = original_data2.set_index('orfs')
original_data2.index.name = 'orf'

In [61]:
# Collapse repeated ORFs into a single row by averaging their scores.
# Only the numeric 'data' column is kept (as is done for part 1): the frame
# still carries the text columns read from the spreadsheet, and recent pandas
# versions raise a TypeError when asked to average them instead of silently
# dropping them.
original_data2 = original_data2[['data']].groupby(level=0).mean()

In [62]:
# Sanity check: one row per unique ORF and a single 'data' column
original_data2.shape

(41, 1)

# Merge

In [63]:
# Stack the two tables on top of each other (row-wise)
original_data = pd.concat([original_data1, original_data2], axis=0)

In [66]:
# An ORF listed in both tables would end up with the average of its two scores
original_data = original_data.groupby(level=0).mean()

In [67]:
# Sanity check: number of unique ORFs across both tables
original_data.shape

(213, 1)

# Load & process tested strains

In [68]:
# Spreadsheet listing the strains that were tested in the screen
tested_file = 'raw_data/List of strains tested.xlsx'
tested = pd.read_excel(tested_file, sheet_name='Tabelle2')

In [69]:
# Work on a string copy of the 'ORF' column, leaving the original column untouched
orf_column = tested['ORF']
tested['orf'] = orf_column.astype(str)

In [70]:
# Normalize the ORF strings before translation
# (clean_orf is not defined in this notebook — presumably provided by yp_utils.py; confirm what it strips)
tested['orf'] = clean_orf(tested['orf'])

In [71]:
# Run the ORFs through the same translation step used for the data tables
# NOTE(review): presumably this maps aliases/old names to current systematic names — confirm in translate_sc
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [72]:
# Print the tested entries that do not look like ORFs (none in the saved output, so nothing is filtered out)
t = looks_like_orf(tested['orf'])
print(tested.loc[~t,])

Empty DataFrame
Columns: [ORF, slow growth?, Unnamed: 2, orf]
Index: []


In [73]:
# Sorted array of the distinct tested ORFs (np.unique removes duplicates and sorts)
tested_orfs = np.unique(tested['orf'].values)

In [74]:
# ORFs that have a score in the paper's tables but are absent from the list of
# tested strains. Membership is checked against a set (constant-time lookup)
# rather than scanning the whole NumPy array once per ORF.
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_orf_set]
missing

['YER087W']

In [75]:
# Add the ORFs with data that were not in the tested list, so they are kept by the reindexing below
tested_orfs = [*tested_orfs, *missing]

In [76]:
# Expand the data to every tested ORF; ORFs that have no score in the tables get 0
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

In [77]:
# Sanity check: one row per entry of tested_orfs
original_data.shape

(4933, 1)

# Prepare the final dataset

In [78]:
# Start the final dataset from an independent copy of the score column
data = original_data.loc[:, ['data']].copy()

In [79]:
# Dataset id(s) that the column(s) of `data` correspond to;
# restrict the datasets table to exactly these ids, in this order
dataset_ids = [16264]
datasets = datasets.reindex(index=dataset_ids)

In [80]:
# Label each data column with a (dataset_id, data_type) pair; raw scores use data_type 'value'
dataset_id_values = datasets.index.values
idx = pd.MultiIndex.from_arrays(
    [dataset_id_values, ['value'] * len(dataset_id_values)],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [81]:
# Preview the relabelled data (columns are now a dataset_id/data_type MultiIndex)
data.head()

dataset_id,16264
data_type,value
orf,Unnamed: 1_level_2
YAL002W,-1
YAL004W,0
YAL005C,0
YAL007C,0
YAL008W,0


## Subset to the genes currently in SGD

In [82]:
# Load the genes table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the genes table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 20


In [83]:
# Attach the gene ids, drop the ORFs that have none, and move (gene_id, orf) into the row index
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()]
# The ids can be cast to int only now that the rows with missing ids are gone
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

In [84]:
# Preview: rows are now indexed by (gene_id, orf)
data.head()

Unnamed: 0_level_0,dataset_id,16264
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,-1
1863,YAL004W,0
4,YAL005C,0
5,YAL007C,0
6,YAL008W,0


# Normalize

In [85]:
# Compute the normalized scores from the raw values
# (normalize_phenotypic_scores is not defined in this notebook — presumably provided by yp_utils.py)
# NOTE(review): has_tested=True presumably indicates that `data` covers all tested strains — confirm in the helper
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [86]:
# Assign proper column names: same dataset ids as the raw data, with data_type 'valuez'
dataset_id_values = datasets.index.values
idx = pd.MultiIndex.from_arrays(
    [dataset_id_values, ['valuez'] * len(dataset_id_values)],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [87]:
# Propagate missing raw values to the normalized scores, then combine both.
# Assigning through a boolean DataFrame aligns the mask with the target on
# labels; the raw columns are labelled 'value' and the normalized ones
# 'valuez', so an unmodified mask would match no column and the assignment
# would silently do nothing. The mask is therefore relabelled with
# data_norm's columns (the two frames list the datasets in the same order).
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan
# Put the raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

In [88]:
# Preview the combined table (a 'value' and a 'valuez' column per dataset)
data_all.head()


Unnamed: 0_level_0,dataset_id,16264,16264
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,-1,-4.130258
1863,YAL004W,0,0.0
4,YAL005C,0,0.0
5,YAL007C,0,0.0
6,YAL008W,0,0.0


# Print out

In [89]:
# Write one tab-separated file per data type: columns are named after the datasets
# and rows are keyed by ORF only (the gene_id index level is dropped)
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_file = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_file, sep='\t')

# Save to DB

In [90]:
# NOTE(review): wildcard import — only save_data_to_db appears to be used afterwards;
# consider importing it explicitly (and moving the import to the top of the notebook)
from IO.save_data_to_db3 import *

In [91]:
# Store the raw and normalized data in the database under this paper's PMID.
# According to the log it prints, the existing datasets for the PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 28428821...
Inserting the new data...


100%|██████████| 1/1 [00:08<00:00,  8.24s/it]

Updating the data_modified_on field...



