In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
paper_pmid = 22384326
paper_name = 'kloimwieder_winston_2011' 

In [3]:
datasets = pd.read_csv('extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt', sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
datasets.set_index('dataset_id', inplace=True)

# Load & process the data

In [8]:
original_data = pd.read_excel('raw_data/TableS1.xlsx', sheet_name='Table 1', skiprows=2)

In [9]:
print('Original data dimensions: %d x %d' % (original_data.shape))

Original data dimensions: 37 x 11


In [10]:
original_data.head()

Unnamed: 0.1,Unnamed: 0,Systematic \nname,Unnamed: 2,Common \nname,Unnamed: 5,Systematic \nname .1,Unnamed: 6,Common \nname .1,.1,Systematic \nname .2,Common \nname .2
0,,YAL026C,,DRS2,,YGL084C,,GUP1,,YML013C‐A,
1,,YAL056W,,GPB2,,YGL212W,,VAM7,,YML013W,SEL1
2,,YBL058W,,SHP1,,YGL246C,,RAI1,,YML014W,TRM9
3,,YBL083C,,,,YGR162W,,,,YMR077C,VPS20
4,,YBR106W,,,,YGR240C,,PFK1,,YMR125W,STO1


In [13]:
orfs = pd.concat([original_data.iloc[:,1], 
                  original_data.iloc[:,4],
                  original_data.iloc[:,7]], axis=0)

In [18]:
original_data = pd.DataFrame(data={'orf': orfs})

In [20]:
original_data['orf'] = original_data['orf'].astype(str)

In [21]:
# Eliminate all white spaces & capitalize
original_data['orf'] = clean_orf(original_data['orf'])

In [23]:
# Translate to ORFs 
original_data['orf'] = translate_sc(original_data['orf'].values, to='orf')

In [24]:
# Make sure everything translated ok
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t,])

             orf
index_input     
32              
33           NAN
34           NAN
35           NAN
36           NAN
0               
1               
2               
3               
4               
5               
6               
7               
8               
9               
10              
11              
12              
13              
14              
15              
16              
17              
18              
19              
20              
21              
22              
23              
24              
25              
26              
27              
28              
29              
30              
31              
32              
33           NAN
34           NAN
35           NAN
36           NAN
3               
5               
10              
17              
20              
33           NAN
34           NAN
35           NAN
36           NAN


In [25]:
original_data = original_data.loc[t,:]

In [26]:
original_data['data'] = -1

In [27]:
original_data.set_index('orf', inplace=True)

In [28]:
original_data = original_data[['data']].copy()

In [29]:
original_data = original_data.groupby(original_data.index).mean()

In [30]:
original_data.shape

(60, 1)

# Load & process tested strains

In [39]:
tested = pd.read_excel('raw_data/Homo_diploids_101501.xlsx', sheet_name='Homo_diploids_101501.txt', skiprows=1)

In [40]:
tested.head()

Unnamed: 0,record no.,ORF name,Name,Other names,strain,batch,plate,row,column,RG NOTES,QC NOTES
0,30338.0,YAL068C,,,HOM DIP,01_1,301.0,A,2,,
1,30339.0,YAL067C,SEO1,,HOM DIP,01_1,301.0,A,3,,
2,30340.0,YAL066W,,,HOM DIP,01_1,301.0,A,4,,
3,30341.0,YAL065C,,,HOM DIP,01_1,301.0,A,5,,
4,30345.0,YAL062W,GDH3,FUN51,HOM DIP,01_1,301.0,A,6,,


In [41]:
tested['orf'] = tested['ORF name'].astype(str)

In [42]:
tested['orf'] = clean_orf(tested['orf'])

In [43]:
tested.loc[tested['orf']=='YELOO1C','orf'] = 'YEL001C'

In [44]:
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [45]:
# Make sure everything translated ok
t = looks_like_orf(tested['orf'])
print(tested.loc[~t,])

             record no. ORF name Name Other names   strain batch  plate  row  \
index_input                                                                    
346             30827.0   YMR41W  NaN         NaN  HOM DIP  13_5  304.0    F   
3449                NaN      NaN  NaN         NaN      NaN   NaN    NaN  NaN   
4492                NaN      NaN  NaN         NaN      NaN   NaN    NaN  NaN   
4646                NaN      NaN  NaN         NaN      NaN   NaN    NaN  NaN   
4713                NaN      NaN  NaN         NaN      NaN   NaN    NaN  NaN   
4785                NaN      NaN  NaN         NaN      NaN   NaN    NaN  NaN   
4797                NaN      NaN  NaN         NaN      NaN   NaN    NaN  NaN   

              column RG NOTES QC NOTES     orf  
index_input                                     
346                6      NaN      NaN  YMR41W  
3449         End 337      NaN      NaN     NAN  
4492         end 349      NaN      NaN     NAN  
4646         end 371      NaN     

In [46]:
tested = tested.loc[t,:]

In [47]:
tested_orfs = tested['orf'].unique()

In [48]:
missing = [orf for orf in original_data.index.values if orf not in tested_orfs]
missing

[]

In [49]:
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [50]:
data = original_data.copy()

In [51]:
dataset_ids = [16136]
datasets = datasets.reindex(index=dataset_ids)

In [52]:
lst = [datasets.index.values, ['value']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [53]:
data.head()

dataset_id,16136
data_type,value
orf,Unnamed: 1_level_2
YAL068C,0
YAL067C,0
YAL066W,0
YAL065C,0
YAL062W,0


## Subset to the genes currently in SGD

In [54]:
genes = pd.read_csv(path_to_genes, sep='\t', index_col='id')
genes = genes.reset_index().set_index('systematic_name')
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.sum(np.isnan(gene_ids))
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [55]:
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()]
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16136
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1869,YAL068C,0
61,YAL067C,0
60,YAL066W,0
1727,YAL065C,0
57,YAL062W,0


# Normalize

In [56]:
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [57]:
# Assign proper column names
lst = [datasets.index.values, ['valuez']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [58]:
data_norm[data.isnull()] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16136,16136
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1869,YAL068C,0,0.0
61,YAL067C,0,0.0
60,YAL066W,0,0.0
1727,YAL065C,0,0.0
57,YAL062W,0,0.0


# Print out

In [59]:
for f in ['value','valuez']:
    df = data_all.xs(f, level='data_type', axis=1).copy()
    df.columns = datasets['name'].values
    df = df.droplevel('gene_id', axis=0)
    df.to_csv(paper_name + '_' + f + '.txt', sep='\t')

# Save to DB

In [60]:
from IO.save_data_to_db3 import *

In [61]:
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 22384326...
Inserting the new data...


100%|██████████| 1/1 [00:08<00:00,  8.31s/it]

Updating the data_modified_on field...



