In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed by this notebook.
# paper_name prefixes the exported text files; paper_pmid locates the
# datasets list and is passed to the database loader.
paper_name = 'zeidler_denfert_2013'
paper_pmid = 23378416

In [3]:
# Tab-separated, header-less list of the datasets attached to this paper.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['pmid', 'name'],
)

In [4]:
# Key the datasets table by its first column so rows can later be
# selected and ordered by id with reindex().
datasets = datasets.set_index('pmid')

# Load & process the data

In [19]:
# Raw table as distributed with the paper (first sheet of the workbook).
original_data = pd.read_excel(
    'raw_data/Table1.xlsx',
    sheet_name='Sheet1',
)

In [20]:
# Report the size of the raw table (rows x columns) as a quick sanity check.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 45 x 5


In [21]:
# Work on a string copy of the identifier column; the raw 'ORF' column
# is left untouched.
orf_strings = original_data['ORF'].astype(str)
original_data['orfs'] = orf_strings

In [22]:
# Normalize the identifiers with the shared clean_orf helper
# (removes all white space and upper-cases the names).
cleaned_orfs = clean_orf(original_data['orfs'])
original_data['orfs'] = cleaned_orfs

In [23]:
# Convert whatever names were supplied into systematic ORF names
# using the shared translate_sc helper.
translated_orfs = translate_sc(original_data['orfs'], to='orf')
original_data['orfs'] = translated_orfs

In [24]:
# Verify the translation: t marks the entries that look like ORF names.
# The rows printed here are the ones that do NOT; in the captured output
# they are the category headings of the table rather than genes.
t = looks_like_orf(original_data['orfs'])
unrecognized_rows = original_data.loc[~t, :]
print(unrecognized_rows)

                                     ORF Name  Colistin  Aminocandin  \
index_input                                                            
0                        Glucansynthesis  NaN       NaN          NaN   
5                        Chitinsynthesis  NaN       NaN          NaN   
11                            PKCpathway  NaN       NaN          NaN   
14                 Sphingolipidsynthesis  NaN       NaN          NaN   
18                         Golgifunction  NaN       NaN          NaN   
26           Aromaticaminoacidmetabolism  NaN       NaN          NaN   
30                            Otherroles  NaN       NaN          NaN   

             Combined                         orfs  
index_input                                         
0                 NaN              GLUCANSYNTHESIS  
5                 NaN              CHITINSYNTHESIS  
11                NaN                   PKCPATHWAY  
14                NaN        SPHINGOLIPIDSYNTHESIS  
18                NaN            

In [25]:
# Keep only the rows whose identifier passed the ORF check above.
original_data = original_data.loc[t]

In [26]:
# Index the table by the translated identifiers and label that index 'orf'.
original_data = original_data.set_index('orfs').rename_axis('orf')

In [27]:
# Keep only the three measurement columns; copy() detaches the result
# from the frame it was sliced from.
value_columns = ['Colistin','Aminocandin','Combined']
original_data = original_data[value_columns].copy()

In [28]:
# Force every measurement column to numeric; anything unparseable
# becomes NaN (errors='coerce') instead of raising.
for column in ['Colistin', 'Aminocandin', 'Combined']:
    original_data[column] = pd.to_numeric(original_data[column], errors='coerce')

In [29]:
# Collapse rows that share the same ORF by averaging their values.
original_data = original_data.groupby(level=0).mean()

In [30]:
# Size of the processed table: (unique ORFs, measurement columns).
original_data.shape

(38, 3)

# Prepare the final dataset

In [31]:
# Continue on an independent copy so original_data stays available as-is.
data = original_data.copy(deep=True)

In [32]:
# Dataset ids, listed in the same order as the columns of `data`
# (they are assigned to the columns positionally further down).
dataset_ids = [
    16521,
    16519,
    16522,
]

In [33]:
# Restrict the datasets table to the ids used here, in that exact order.
datasets = datasets.reindex(dataset_ids, axis='index')

In [34]:
# Relabel the columns as (dataset_id, 'value') pairs; the assignment is
# positional, so the column order must match the order of the dataset ids.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [35]:
# Preview the first rows after relabelling the columns.
data.head(n=5)

dataset_id,16521,16519,16522
data_type,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
YAL026C,-0.6,0.0,-1.3
YBL061C,0.2,-0.6,-2.6
YBL062W,-0.1,-0.6,-1.4
YBR023C,0.1,-0.6,-2.2
YBR036C,-1.4,0.0,-1.2


## Subset to the genes currently in SGD

In [36]:
# Load the gene table and key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the id of every ORF in the data; ORFs absent from the gene
# table come back as NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [37]:
# Attach the gene ids looked up above (same row order as data.index).
data['gene_id'] = gene_ids

# Drop ORFs that have no gene id. The explicit copy() makes the filtered
# frame independent, so the column assignment below modifies a real frame
# rather than a slice (avoids SettingWithCopyWarning when rows are dropped).
data = data.loc[data['gene_id'].notnull()].copy()

# With the NaNs gone the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)

# Final row index: (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16521,16519,16522
Unnamed: 0_level_1,data_type,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
24,YAL026C,-0.6,0.0,-1.3
149,YBL061C,0.2,-0.6,-2.6
150,YBL062W,-0.1,-0.6,-1.4
219,YBR023C,0.1,-0.6,-2.2
232,YBR036C,-1.4,0.0,-1.2


# Normalize

In [38]:
# Normalize the scores with the shared helper loaded by %run at the top.
# NOTE(review): has_tested=False presumably means the list of tested strains
# is not available for this paper -- confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [39]:
# Label the normalized columns as (dataset_id, 'valuez') pairs so they can
# sit next to the raw 'value' columns; the assignment is positional.
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [40]:
# Blank out the normalized scores wherever the raw value is missing.
# A boolean DataFrame mask is aligned on row AND column labels before it is
# applied. `data` is labelled 'value' while `data_norm` is labelled 'valuez',
# so the raw mask matches no column and the assignment silently does nothing.
# Relabelling the mask's data_type level makes the columns line up.
missing_mask = data.isnull().rename(columns={'value': 'valuez'}, level='data_type')
data_norm[missing_mask] = np.nan

# Raw values and normalized values side by side, joined on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16521,16519,16522,16521,16519,16522
Unnamed: 0_level_1,data_type,value,value,value,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
24,YAL026C,-0.6,0.0,-1.3,-9.151691,0.112941,-10.349772
149,YBL061C,0.2,-0.6,-2.6,3.238291,-11.06822,-20.77243
150,YBL062W,-0.1,-0.6,-1.4,-1.407952,-11.06822,-11.151515
219,YBR023C,0.1,-0.6,-2.2,1.689543,-11.06822,-17.565458
232,YBR036C,-1.4,0.0,-1.2,-21.541672,0.112941,-9.548029


# Print out

In [41]:
# Write one tab-separated file per data type: <paper_name>_value.txt and
# <paper_name>_valuez.txt.
for data_type in ['value','valuez']:
    # Pick the columns of this data type; xs() drops the data_type level.
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Replace the dataset ids by the dataset names (positional assignment).
    table.columns = datasets['name'].values
    # The exported files are keyed by ORF only.
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [42]:
from IO.save_data_to_db3 import *

In [43]:
# Push the combined table to the database. Per the log printed below, this
# deletes all existing datasets for the PMID, inserts the new data and then
# updates the data_modified_on field -- re-running it replaces stored data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/3 [00:00<?, ?it/s]

Deleting all datasets for PMID 23378416...
Inserting the new data...


100%|██████████| 3/3 [00:00<00:00,  7.84it/s]

Updating the data_modified_on field...



