In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PMID of the paper being processed: used to locate the datasets list file
# and as the key when saving the results to the database.
paper_pmid = 31904504
# Short paper label used as the prefix of the output file names.
paper_name = 'zhao_deng_2020' 

In [3]:
# Load the tab-separated, header-less list of datasets associated with this paper
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['pmid', 'name'],
)

In [4]:
# Key the dataset list by its first column so rows can later be selected by id
datasets = datasets.set_index('pmid')

# Load & process the data

In [5]:
# Read the supplementary table, skipping the first two rows of the sheet
original_data = pd.read_excel(
    'raw_data/TableS2.xlsx',
    sheet_name='Sheet1',
    skiprows=2,
)

In [6]:
# Report the size of the table as loaded
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 160 x 4


In [7]:
# Replace the spreadsheet headers with short, fixed column names
column_names = ['orf', 'gene', 'cfu', 'spot']
original_data.columns = column_names

In [8]:
# Force the identifier column to string type (missing entries become text as well)
orf_as_text = original_data['orf'].astype(str)
original_data['orf'] = orf_as_text

In [9]:
# Strip all white space from the identifiers and upper-case them
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [10]:
# Convert every identifier to its ORF name
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [11]:
# Flag the entries that look like ORFs and display the rows that do not,
# so anything unexpected can be inspected before it is dropped
t = looks_like_orf(original_data['orf'])
unrecognized_rows = original_data.loc[~t]
print(unrecognized_rows)

                                                           orf gene     cfu  \
index_input                                                                   
16                                 CELLCYCLEANDDNAPROCESSING20  NaN     NaN   
37                                             TRANSCRIPTION22  NaN     NaN   
60           PROTEINSYNTHESISFOLDINGMODIFICATIONANDDESTINAT...  NaN     NaN   
84           PROTEINWITHBINDINGFUNCTIONORCOFACTORREQUIREMEN...  NaN     NaN   
98                                         CELLULARTRANSPORT41  NaN     NaN   
138                                     UNCLASSIFIEDPROTEINS20  NaN     NaN   
159                                                        NAN   WT  0.7911   

             spot  
index_input        
16            NaN  
37            NaN  
60            NaN  
84            NaN  
98            NaN  
138           NaN  
159           1.0  


In [12]:
# Keep only the rows whose identifier looks like an ORF; in this table the
# rejected rows are the category header lines and the WT entry.
# Take an explicit copy so that columns added afterwards are written to an
# independent frame rather than to a slice (avoids SettingWithCopyWarning).
original_data = original_data.loc[t, :].copy()

In [13]:
# Every remaining ORF receives the same score.
# NOTE(review): presumably -1 encodes a qualitative "hit" direction; confirm against the paper.
uniform_score = -1
original_data['data'] = uniform_score

In [14]:
# Use the ORF as the row label
original_data = original_data.set_index('orf')

In [15]:
# Collapse duplicated ORFs by averaging their values. Restrict the mean to
# numeric columns: the frame still carries the text 'gene' column, and
# pandas >= 2.0 raises a TypeError when asked to average it (older versions
# silently dropped it, which is the behavior preserved here).
original_data = original_data.groupby(original_data.index).mean(numeric_only=True)

# Prepare the final dataset

In [16]:
# Start the final table from an independent one-column copy of the scores
data = original_data.loc[:, ['data']].copy()

In [17]:
# Dataset ids, one per column of `data`, in column order
dataset_ids = [16433]
# Restrict (and order) the dataset list to exactly those ids
datasets = datasets.reindex(labels=dataset_ids, axis='index')

In [18]:
# Label each column with a (dataset_id, data_type) pair; raw scores use 'value'
value_pairs = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_pairs, names=['dataset_id','data_type'])

In [19]:
# Report the size of the processed table
final_rows, final_cols = data.shape
print('Final data dimensions: %d x %d' % (final_rows, final_cols))

Final data dimensions: 153 x 1


## Subset to the genes currently in SGD

In [20]:
# Load the gene table and key it by systematic name, keeping 'id' as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs without a match come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [21]:
# Attach the gene ids and drop the ORFs that could not be matched.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the dtype conversion below
# is a plain column assignment rather than a write to a slice of the
# unfiltered frame (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the unmatched rows gone the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

# Normalize

In [22]:
# Compute the normalized version of the scores with the shared helper.
# NOTE(review): has_tested=False presumably tells the helper that the list of
# tested strains is not part of `data`; confirm against yp_utils.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [23]:
# Label the normalized columns like the raw ones, but with data type 'valuez'
valuez_pairs = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_pairs, names=['dataset_id','data_type'])

In [24]:
# Blank out the normalized score wherever the raw score is missing.
# A boolean DataFrame mask is aligned on row AND column labels before it is
# applied. The raw columns are labeled (id, 'value') and the normalized ones
# (id, 'valuez'), so the raw mask would never line up and the assignment
# would silently do nothing. Relabel the mask with the normalized column
# labels (same number and order of datasets) so it actually takes effect.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

In [25]:
# Put the raw and normalized columns side by side, keeping the rows of `data`
data_all = data.join(data_norm, how='left')

# Write the output files

In [26]:
# Write one tab-separated file per data type, with columns named after the datasets
dataset_names = datasets['name'].values
for data_type in ['value','valuez']:
    # Pull out the columns of this data type (drops the 'data_type' column level)
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = dataset_names
    # Rows are labeled by ORF only in the exported files
    table = table.droplevel('gene_id', axis=0)
    output_file = paper_name + '_' + data_type + '.txt'
    table.to_csv(output_file, sep='\t')

# Save to DB

In [27]:
from IO.save_data_to_db3 import *

In [28]:
# Store the combined raw and normalized table in the database under this
# paper's PMID. Per the log printed by the call, all existing datasets for the
# PMID are deleted first, then the new data is inserted and the
# data_modified_on field is updated.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 31904504...
Inserting the new data...


100%|██████████| 1/1 [00:00<00:00,  2.76it/s]

Updating the data_modified_on field...



