In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Publication being processed: short author/year tag (used for output file names)
# and its PubMed ID (used for the datasets list path and the database save).
paper_name = 'botet_santos_2007'
paper_pmid = 17873082

In [3]:
# Read the tab-separated, header-less datasets list for this PMID as (dataset_id, name) rows.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the list by dataset_id so datasets can later be selected and ordered by id.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [51]:
# Load the raw screen measurements from the 'DATA' sheet of the raw-data workbook.
original_data = pd.read_excel(
    'raw_data/1ScreenSULFA&MS&MS+PABA.xlsx',
    sheet_name='DATA',
)

In [52]:
# Report how many rows and columns were read from the sheet.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 7270 x 41


In [53]:
# Preview the first rows of the raw sheet to check how the columns were parsed.
original_data.head()

Unnamed: 0,Record,Strain,Screen,N Screening,Plate,Chrom.,Row,Col.,Pos.,ORF,...,"SULFA 0,1 mg/ml 120h","SULFA 0,2 mg/ml 120h",SMM 77h.2,SMM+PABA 2 µg/ml 77h,"SULFA 0,1 mg/ml 77h.2","SULFA 0,1 mg/ml+PABA 2 µg/ml 77h",SMM 122h,SMM+PABA 2 µg/ml 122h,"SULFA 0,1 mg/ml 122h","SULFA 0,1 mg/ml+PABA 2 µg/ml 122h"
0,,,S2,1.0,1.0,chr00_1,A,1.0,A01,,...,,,,,,,,,,
1,15714.0,BY4742,S2,2.0,1.0,chr00_1,A,2.0,A02,YAL064C-A,...,,,,,,,,,,
2,,,S2,3.0,1.0,chr00_1,A,3.0,A03,,...,,,,,,,,,,
3,15716.0,BY4742,S2,4.0,1.0,chr00_1,A,4.0,A04,YBL091C-A,...,,,,,,,,,,
4,15717.0,BY4742,S2,5.0,1.0,chr00_1,A,5.0,A05,YBR269C,...,,,,,,,,,,


In [54]:
# Leave the raw 'ORF' column untouched; all clean-up below happens on a string copy named 'orf'.
orf_text = original_data['ORF'].astype(str)
original_data['orf'] = orf_text

In [55]:
# Normalize the identifiers with the shared clean_orf helper
# (intended to remove all white space and upper-case the names).
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [56]:
# Manual correction: rows labelled 'YER050' are renamed to 'YER050C'
# (presumably a truncated name in the sheet — verify against the source if in doubt).
is_yer050 = original_data['orf'] == 'YER050'
original_data.loc[is_yer050, 'orf'] = 'YER050C'

In [57]:
# Translate the identifiers to ORF names using the shared translation helper (to='orf').
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [58]:
# Sanity check on the translation: flag identifiers that still do not look like ORFs
# and print the offending rows for review.
# The mask keeps the name `t` because the following cell filters on it.
t = looks_like_orf(original_data['orf'])
rows_not_orf = original_data.loc[~t, :]
print(rows_not_orf)

             Record Strain Screen  N Screening  Plate   Chrom.  Row  Col.  \
index_input                                                                 
0               NaN    NaN     S2          1.0    1.0  chr00_1    A   1.0   
2               NaN    NaN     S2          3.0    1.0  chr00_1    A   3.0   
5               NaN    NaN     S2          6.0    1.0  chr00_1    A   6.0   
7               NaN    NaN     S2          8.0    1.0  chr00_1    A   8.0   
10              1.0    NaN     S2         11.0    1.0  chr00_1    A  11.0   
...             ...    ...    ...          ...    ...      ...  ...   ...   
7265            NaN    NaN    NaN          NaN    NaN      NaN  NaN   NaN   
7266            NaN    NaN    NaN          NaN    NaN      NaN  NaN   NaN   
7267            NaN    NaN    NaN          NaN    NaN      NaN  NaN   NaN   
7268            NaN    NaN    NaN          NaN    NaN      NaN  NaN   NaN   
7269            NaN    NaN    NaN          NaN    NaN      NaN  NaN   NaN   

In [59]:
# Keep only the rows whose identifier passed the ORF check (boolean mask `t`).
original_data = original_data[t]

In [60]:
# Measurement columns used downstream: SMM and SULFA 0,1 mg/ml readings at 77h and 120h.
# NOTE(review): the last name contains two consecutive spaces before '120h'. This presumably
# matches the spreadsheet header exactly — check the sheet before "fixing" it.
col_data = [
    'SMM 77h',
    'SMM 120h',
    'SULFA 0,1 mg/ml 77h',
    'SULFA 0,1 mg/ml  120h',
]

In [61]:
# Use the translated ORF name as the row index.
original_data = original_data.set_index('orf')

In [62]:
# Narrow the table to the selected measurement columns; copy so later edits do not touch the wide frame.
original_data = original_data.loc[:, col_data].copy()

In [63]:
# Coerce every measurement column to numeric; entries that cannot be parsed become NaN.
# The conversion runs column-by-column (the default axis) rather than row-by-row: the coerced
# values are the same, but it avoids one Python-level call per row and gives each column a
# single numeric dtype.
original_data = original_data.apply(pd.to_numeric, errors='coerce')

In [64]:
# Collapse rows that share the same index label (ORF) by averaging their measurements.
original_data = original_data.groupby(level=0).mean()

In [65]:
# Table dimensions after averaging rows per index label: (rows, measurement columns).
original_data.shape

(4754, 4)

In [66]:
# Preview the averaged measurements.
original_data.head()

Unnamed: 0_level_0,SMM 77h,SMM 120h,"SULFA 0,1 mg/ml 77h","SULFA 0,1 mg/ml 120h"
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
YAL002W,0.445,0.459,0.488,0.498
YAL004W,0.45,0.479,0.525,0.554
YAL005C,0.442,0.439,0.545,0.569
YAL007C,0.35,0.343,0.321,0.413
YAL008W,0.452,0.47,0.503,0.536


In [67]:
# Ratio of the SULFA 0,1 mg/ml reading to the SMM reading at the matching time point:
# data1 uses the 77h columns, data2 the 120h columns.
sulfa_77h = original_data['SULFA 0,1 mg/ml 77h']
sulfa_120h = original_data['SULFA 0,1 mg/ml  120h']
original_data['data1'] = sulfa_77h.div(original_data['SMM 77h'])
original_data['data2'] = sulfa_120h.div(original_data['SMM 120h'])

In [68]:
# Keep only the two derived ratio columns.
ratio_columns = ['data1', 'data2']
original_data = original_data.loc[:, ratio_columns].copy()

# Prepare the final dataset

In [69]:
# Start the final dataset from an independent (deep) copy of the processed table.
data = original_data.copy(deep=True)

In [70]:
# Dataset ids in the same order as the data columns (data1, data2); the datasets
# list is re-ordered and restricted to exactly these ids.
dataset_ids = [137, 244]
datasets = datasets.reindex(dataset_ids, axis='index')

In [71]:
# Relabel the data columns as (dataset_id, 'value') pairs, one per dataset in order.
value_labels = ['value' for _ in range(datasets.shape[0])]
column_pairs = list(zip(datasets.index.values, value_labels))
data.columns = pd.MultiIndex.from_tuples(column_pairs, names=['dataset_id','data_type'])

In [72]:
# Preview the table with its (dataset_id, data_type) column labels.
data.head()

dataset_id,137,244
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL002W,1.096629,1.084967
YAL004W,1.166667,1.156576
YAL005C,1.233032,1.296128
YAL007C,0.917143,1.204082
YAL008W,1.112832,1.140426


## Subset to the genes currently in SGD

In [73]:
# Load the gene table (path_to_genes is defined outside this section) and key it by
# systematic name, keeping the numeric 'id' as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id for every ORF in the data; ORFs absent from the gene table come back as NaN.
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [74]:
# Attach the looked-up gene ids and drop ORFs that have no match in the gene table.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered frame: the next line assigns into it, and assigning
# into the result of a row filter otherwise triggers SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,137,244
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,1.096629,1.084967
1863,YAL004W,1.166667,1.156576
4,YAL005C,1.233032,1.296128
5,YAL007C,0.917143,1.204082
6,YAL008W,1.112832,1.140426


# Normalize

In [75]:
# Normalize the scores with the shared helper; the result's columns are relabelled as 'valuez' in the next cell.
# NOTE(review): the meaning of has_tested=True is defined inside the helper and is not visible here — confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [76]:
# Assign proper column names: (dataset_id, 'valuez') pairs, one per dataset in order.
valuez_labels = ['valuez' for _ in range(datasets.shape[0])]
norm_column_pairs = list(zip(datasets.index.values, valuez_labels))
data_norm.columns = pd.MultiIndex.from_tuples(norm_column_pairs, names=['dataset_id','data_type'])

In [77]:
# Blank out normalized scores wherever the raw value is missing.
# A DataFrame mask is aligned to the target by label. `data` is labelled (dataset_id, 'value')
# and `data_norm` (dataset_id, 'valuez'), so passing data.isnull() directly matches no columns
# and the assignment silently does nothing. Relabel the mask's columns (same order, same count)
# to data_norm's so the columns match; rows are still aligned by index label.
null_mask = data.isnull()
null_mask.columns = data_norm.columns
data_norm[null_mask] = np.nan

# Combine raw and normalized values side by side on the shared row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,137,244,137,244
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,1.096629,1.084967,0.213427,-0.245443
1863,YAL004W,1.166667,1.156576,0.561938,0.227904
4,YAL005C,1.233032,1.296128,0.892174,1.150362
5,YAL007C,0.917143,1.204082,-0.679707,0.541923
6,YAL008W,1.112832,1.140426,0.294053,0.121145


# Print out

In [78]:
# Write one tab-separated file per data type ('value' and 'valuez'), with dataset names
# as column headers and the ORF as the only row label.
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, axis=1, level='data_type').copy()
    table.columns = datasets['name'].values
    out_path = paper_name + '_' + data_type + '.txt'
    table.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [79]:
# Import only the function used below instead of a star import, so the notebook's
# namespace is not silently overwritten by whatever else the module defines.
from IO.save_data_to_db3 import save_data_to_db

In [80]:
# Save the raw and normalized values for this PMID to the database.
# Per the run log below, existing datasets for the PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 17873082...
Inserting the new data...


100%|██████████| 2/2 [00:13<00:00,  6.87s/it]

Updating the data_modified_on field...



