In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed. The PMID is used to build the
# datasets-list file name and is passed to the DB save step; the name is the
# prefix of the exported text files.
paper_pmid = 18471310
paper_name = 'endo_shima_2008' 

In [3]:
# Tab-separated, header-less list of the datasets defined for this paper.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset table by its ID so it can later be reindexed by dataset_id.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [9]:
# Skip the first two rows of the sheet so the third row supplies the column names.
original_data = pd.read_excel(
    'raw_data/13068_2007_3_MOESM1_ESM.xlsx',
    sheet_name='Sheet1',
    skiprows=2,
)

In [10]:
# Report the size of the table as loaded (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 87 x 5


In [11]:
# Preview the first rows to check how the sheet was parsed.
original_data.head()

Unnamed: 0,Unnamed: 1,ORF,Gene,Sensitivitya,Description
0,Metabolism,,,,
1,,YML115C,VAN1,0.25,Vanadate resistance protein
2,,YJR105W,ADO1,0.15,Adenosine kinase
3,,YKL211C,TRP3,0.14,Anthranilate synthase component II
4,,YLR056W,ERG3,0.2,C-5 sterol desaturase


In [12]:
# Keep the raw 'ORF' column untouched and do all cleaning on a string copy named 'orf'.
original_data['orf'] = original_data['ORF'].astype(str)

In [13]:
# Eliminate all white spaces & capitalize
# (clean_orf is a helper that is not defined in the cells shown here).
original_data['orf'] = clean_orf(original_data['orf'])

In [14]:
# Translate to ORFs
# NOTE(review): presumably maps gene names to systematic ORF names — confirm against translate_sc.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [15]:
# Make sure everything translated ok: print the rows whose 'orf' fails the check.
# `t` is kept as a global because the next cell filters on it.
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t, :]
print(not_orf_rows)

                                                             　  ORF Gene  \
index_input                                                                
0                                                   Metabolism  NaN  NaN   
13                               Cell cycle and DNA processing  NaN  NaN   
31                                               Transcription  NaN  NaN   
37                                           Protein synthesis  NaN  NaN   
44           Protein fate (folding, modification, destination)  NaN  NaN   
50           Cellular transport, transport facilitation and...  NaN  NaN   
69                                                      Others  NaN  NaN   
73                                       Unclassified proteins  NaN  NaN   
84           aValues of sensitivity were defined in methods...  NaN  NaN   
85           bThese genes are involved in chromatin remodel...  NaN  NaN   
86             cThese genes are involved in vesicle transport.  NaN  NaN   

           

In [16]:
# Keep only the rows whose 'orf' passed the check above. The explicit copy makes
# the result an independent frame, so adding the 'data' column afterwards does
# not write into a slice of the unfiltered table (SettingWithCopyWarning).
original_data = original_data.loc[t, :].copy()

In [17]:
# Numeric copy of the sensitivity score.
# NOTE(review): the trailing 'a' in 'Sensitivitya' looks like a footnote marker
# fused into the column header — confirm against the spreadsheet.
original_data['data'] = original_data['Sensitivitya'].astype(float)

In [18]:
# Index the table by ORF.
original_data = original_data.set_index('orf')

In [19]:
# Keep only the numeric 'data' column, as a one-column frame.
original_data = original_data.loc[:, ['data']].copy()

In [20]:
# Average the values of ORFs that appear more than once (one row per ORF afterwards).
original_data = original_data.groupby(level=0).mean()

In [21]:
# Number of unique ORFs with a value after filtering and averaging.
original_data.shape

(76, 1)

# Load & process tested strains

In [31]:
# Strain list used to define which ORFs were tested.
tested = pd.read_excel(
    'raw_data/strainlist.xlsx',
    sheet_name='data',
)

In [32]:
# Preview the first rows of the strain list.
tested.head()

Unnamed: 0,plate,row,col,record #,ORF,comments,slow growth?,Unnamed: 7,削除(replaced with),Unnamed: 9,ENTRY,Unnamed: 11,GENE,DESCRIPTION (MIPS),Unnamed: 14,4706全株リスト
0,00-1,A,1,,,,,,blank,,,,,,,
1,00-1,A,2,35714.0,YAL064C-A,,,,,,YAL064c-a,FUNCTIONAL UNCLASSIFIED PROTEINS,,"strong similarity to Flo1p, Flo5p, pseudogene",,YAL064C-A
2,00-1,A,3,,,,,,blank,,,,,,,
3,00-1,A,4,35716.0,YBL091C-A,,,,,,YBL091c-a,classified,SCS22,"Suppressor of Choline Sensitivity, homologous ...",,YBL091C-A
4,00-1,A,5,35717.0,YBR269C,,,,,,YBR269c,FUNCTIONAL UNCLASSIFIED PROTEINS,FMP21,Found in Mitochondrial Proteome,,YBR269C


In [33]:
# Keep the raw 'ORF' column untouched and do all cleaning on a string copy named 'orf'.
tested['orf'] = tested['ORF'].astype(str)

In [34]:
# Apply the same clean_orf normalization that was used for the data table.
tested['orf'] = clean_orf(tested['orf'])

In [35]:
# Manual correction: strip the 'HOMDIP' suffix fused onto this one ORF name.
is_homdip_entry = tested['orf'] == 'YOR205CHOMDIP'
tested.loc[is_homdip_entry, 'orf'] = 'YOR205C'

In [36]:
# Same translation step as applied to the data table.
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [37]:
# Make sure everything translated ok: print the rows whose 'orf' fails the check.
# `t` is kept as a global because the next cell filters on it.
t = looks_like_orf(tested['orf'])
not_orf_rows = tested.loc[~t, :]
print(not_orf_rows)

                   plate row  col  record #  ORF comments slow growth?  \
index_input                                                              
0                   00-1   A    1       NaN  NaN      NaN          NaN   
2                   00-1   A    3       NaN  NaN      NaN          NaN   
5                   00-1   A    6       NaN  NaN      NaN          NaN   
7                   00-1   A    8       NaN  NaN      NaN          NaN   
10                  00-1   A   11       NaN  NaN      NaN          NaN   
...                  ...  ..  ...       ...  ...      ...          ...   
7195         Replacement   H    8       NaN  NaN      NaN          NaN   
7196         Replacement   H    9       NaN  NaN      NaN          NaN   
7197         Replacement   H   10       NaN  NaN      NaN          NaN   
7198         Replacement   H   11       NaN  NaN      NaN          NaN   
7199         Replacement   H   12       NaN  NaN      NaN          NaN   

             Unnamed: 7 削除(replaced w

In [38]:
# Drop the strain-list rows that failed the ORF check.
tested = tested.loc[t]

In [39]:
# Distinct tested ORFs, in order of first appearance in the strain list.
tested_orfs = tested['orf'].unique()

In [40]:
# ORFs that have a data value but do not appear in the tested strain list.
is_untested = ~original_data.index.isin(tested_orfs)
missing = original_data.index[is_untested].tolist()
missing

[]

In [41]:
# Expand to the full set of tested ORFs; tested ORFs with no value in the data
# table are filled with 0.
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

In [42]:
# One row per tested ORF after the reindex.
original_data.shape

(4688, 1)

# Prepare the final dataset

In [43]:
# Work on a copy so the steps below do not modify original_data.
data = original_data.copy()

In [44]:
# Dataset ID(s) that the data column(s) correspond to, in column order.
dataset_ids = [11827]
# Restrict (and order) the datasets table to exactly these IDs.
datasets = datasets.reindex(index=dataset_ids)

In [45]:
# Label each data column with a (dataset_id, 'value') pair.
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [46]:
# Check the relabelled columns.
data.head()

dataset_id,11827
data_type,value
orf,Unnamed: 1_level_2
YAL064C-A,0.0
YBL091C-A,0.0
YBR269C,0.0
YBR271W,0.0
YBR273C,0.0


## Subset to the genes currently in SGD

In [47]:
# Look up the gene ID of every ORF in the dataset via its systematic name.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
# ORFs without a match come back as NaN from the reindex.
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [48]:
# Attach the looked-up gene IDs, positionally aligned with the rows of `data`.
data['gene_id'] = gene_ids
# Drop ORFs with no gene ID. The explicit copy detaches the filtered frame so
# the integer cast below is written to it directly instead of to a slice of the
# unfiltered frame (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,11827
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1867,YAL064C-A,0.0
6108,YBL091C-A,0.0
463,YBR269C,0.0
465,YBR271W,0.0
467,YBR273C,0.0


# Normalize

In [49]:
# Compute normalized scores with the shared helper (not defined in the cells shown here).
# NOTE(review): has_tested=True presumably signals that untested strains are
# already excluded from `data` — confirm against normalize_phenotypic_scores.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [50]:
# Assign proper column names: each normalized column becomes (dataset_id, 'valuez').
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [51]:
# Blank out normalized scores wherever the raw value is missing.
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'),
# and boolean-frame assignment aligns the mask on labels, so the null mask of
# `data` must be relabelled first; applied as-is it matches no column of
# `data_norm` and nothing is ever set to NaN.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns  # both frames list datasets in the same order
data_norm[missing_mask] = np.nan
# Combine raw and normalized values side by side on the shared row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,11827,11827
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1867,YAL064C-A,0.0,0.0
6108,YBL091C-A,0.0,0.0
463,YBR269C,0.0,0.0
465,YBR271W,0.0,0.0
467,YBR273C,0.0,0.0


# Print out

In [52]:
# Write one tab-separated file per data type, with dataset names as column
# headers and rows labelled by ORF only.
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [53]:
# Import only the function used below instead of every public name of the
# module, so it is clear where save_data_to_db comes from and no other notebook
# variable can be silently overwritten.
from IO.save_data_to_db3 import save_data_to_db

In [54]:
# Store the combined table in the database for this PMID. According to the log
# printed below, existing datasets for the PMID are deleted before the insert.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 18471310...
Inserting the new data...


100%|██████████| 1/1 [00:08<00:00,  8.23s/it]

Updating the data_modified_on field...



