In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Short paper label; used below as the prefix of the exported text files.
paper_name = 'matecic_smith_2010'
# PubMed ID of the paper; used to locate the datasets list and to key the DB upload.
paper_pmid = 20421943

In [3]:
# Tab-separated list of this paper's datasets (no header row): dataset id and dataset name.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset id so rows can be selected and ordered by id later.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [6]:
# Workbook sheets to load: one for the DN-tag and one for the UP-tag measurements.
sheet_names = [
    'DN-tag signal ratio, log2 ranks',
    'UP-tag signal ratio, log2 ranks',
]

In [9]:
# Inspect the workbook's column headers.
# This cell used to read `original_data`, which is only created by the loading
# loop in a later cell, so it failed with NameError on a fresh kernel
# (Restart & Run All). Reading just the header row keeps the cell self-contained.
# NOTE(review): the recorded output shows '(d9/d1) 2%.2'-style headers in the last
# column group, which the first sheet does not appear to have - presumably the
# output came from the other sheet, hence sheet_names[-1]; confirm.
pd.read_excel(
    'raw_data/journal.pgen.1000921.s002.xlsx',
    sheet_name=sheet_names[-1],
    skiprows=2,
    nrows=0,
).columns

Index(['ORF', 'GENE', '(d9/d1) 2%', '(d21/d1) 2%', '(d33/d1) 2%', 'Unnamed: 5',
       '(d9/d1) 0.5%', '(d21/d1) 0.5%', '(d33/d1) 0.5%', 'Unnamed: 9',
       '(d9/d1) 2%.1', '(d21/d1) 2%.1', '(d33/d1) 2%.1', 'Unnamed: 13',
       '(d9/d1) 0.5%.1', '(d21/d1) 0.5%.1', '(d33/d1) 0.5%.1', 'Unnamed: 17',
       'ORF.1', 'GENE.1', '(d9/d1) 2%.2', '(d21/d1) 2%.2', '(d33/d1) 2%.2',
       'Unnamed: 23', '(d9/d1) 0.5%.2', '(d21/d1) 0.5%.2', '(d33/d1) 0.5%.2'],
      dtype='object')

In [11]:
# Load each tag sheet, keep the rows whose ORF is valid, and collect one
# per-ORF table of the selected ratio columns per sheet in `original_data_list`.
# NOTE(review): clean_orf, translate_sc and looks_like_orf are not defined in this
# notebook - presumably they come from yp_utils.py, loaded via %run in the first cell.

# Excluding Day33 on 2% because (unlike other datasets) UP and DN tags show inconsistent results. The average seems to be meaningless for most strains.
value_columns = ['(d9/d1) 2%.1', '(d21/d1) 2%.1','(d9/d1) 0.5%.1', '(d21/d1) 0.5%.1', '(d33/d1) 0.5%.1']

original_data_list = []
for s in sheet_names:
    original_data = pd.read_excel('raw_data/journal.pgen.1000921.s002.xlsx', sheet_name=s, skiprows=2)
    print('Original data dimensions: %d x %d' % (original_data.shape))

    # Build a cleaned ORF column from the raw 'ORF' identifiers.
    original_data['orf'] = original_data['ORF'].astype(str)
    original_data['orf'] = clean_orf(original_data['orf'])
    original_data['orf'] = translate_sc(original_data['orf'], to='orf')

    # Print the rows that do not look like an ORF before dropping them,
    # so the discarded entries stay visible in the cell output.
    t = looks_like_orf(original_data['orf'])
    print(original_data.loc[~t,])
    original_data = original_data.loc[t,:].set_index('orf')

    # Convert column by column (non-numeric entries become NaN); this gives the
    # same values as the former row-wise apply without one call per row.
    original_data = original_data[value_columns].apply(pd.to_numeric, errors='coerce')

    # Average rows that share the same ORF within this sheet.
    original_data = original_data.groupby(original_data.index).mean()
    print(original_data.shape)
    original_data_list.append(original_data)

Original data dimensions: 2699 x 27
             ORF GENE  (d9/d1) 2%  (d21/d1) 2%  (d33/d1) 2%  Unnamed: 5  \
index_input                                                               
0            NaN  NaN         NaN          NaN          NaN         NaN   

             (d9/d1) 0.5%  (d21/d1) 0.5%  (d33/d1) 0.5%  Unnamed: 9  ...  \
index_input                                                          ...   
0                     NaN            NaN            NaN         NaN  ...   

             ORF.1  GENE.1  D9 2%  D21 2%  D33 2%  Unnamed: 23  D9 0.5%  \
index_input                                                               
0              NaN     NaN    NaN     NaN     NaN          NaN      NaN   

             D21 0.5% D33 0.5%  orf  
index_input                          
0                 NaN      NaN  NAN  

[1 rows x 28 columns]
(2654, 5)
Original data dimensions: 3479 x 27
             ORF GENE  (d9/d1) 2%  (d21/d1) 2%  (d33/d1) 2%  Unnamed: 5  \
index_input              

In [18]:
# Stack the per-sheet tables on top of each other (rows); ORFs present in both sheets appear twice.
original_data = pd.concat(original_data_list)

In [19]:
# Preview the stacked per-sheet tables before the tags are averaged.
original_data.head()

Unnamed: 0_level_0,(d9/d1) 2%.1,(d21/d1) 2%.1,(d9/d1) 0.5%.1,(d21/d1) 0.5%.1,(d33/d1) 0.5%.1
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
YAL012W,-0.202026,1.032526,-0.400359,1.073145,0.614606
YAL024C,0.368843,-0.335718,-0.016153,-0.493526,-2.113458
YAL049C,0.087702,-1.368829,-1.479619,-0.513816,-1.104796
YAL054C,0.269325,0.384789,0.23483,-2.081109,-6.148392
YAL056C-A,-0.483965,-0.330431,0.101902,0.101598,-2.736966


In [20]:
# Average UP and DN tags: rows sharing the same ORF index label are collapsed into their mean.
original_data = original_data.groupby(level=0).mean()

In [21]:
# Preview the table after averaging the rows that share an ORF.
original_data.head()

Unnamed: 0_level_0,(d9/d1) 2%.1,(d21/d1) 2%.1,(d9/d1) 0.5%.1,(d21/d1) 0.5%.1,(d33/d1) 0.5%.1
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
YAL002W,-0.663079,-2.636625,-2.893052,-2.072377,-2.947533
YAL004W,-0.973443,-3.299888,-0.447756,-1.763872,-3.775156
YAL005C,-0.451959,-1.992544,-0.394764,-1.229691,-4.904059
YAL007C,0.218488,0.100929,-0.085496,-0.082342,-2.096215
YAL008W,0.655704,0.080652,0.396308,0.055863,-2.391714


In [22]:
# (rows, columns) after averaging: one row per ORF, one column per selected ratio.
original_data.shape

(3417, 5)

# Prepare the final dataset

In [23]:
# Work on an independent copy so the relabelling below leaves `original_data` untouched.
data = original_data.copy(deep=True)

In [24]:
# Dataset ids for this paper. A later cell assigns them positionally to the
# columns of `data`, so their order must match the column order of `data`.
dataset_ids = [4712, 5354, 5356, 5357, 5358]

# reindex() silently creates an all-NaN row for an id it cannot find, which would
# later surface as a NaN dataset name in the exported files. Fail early instead.
missing_dataset_ids = [i for i in dataset_ids if i not in datasets.index]
if missing_dataset_ids:
    raise KeyError('Dataset ids missing from the datasets list: %s' % missing_dataset_ids)

datasets = datasets.reindex(index=dataset_ids)

In [25]:
# Relabel the columns of `data` as (dataset_id, 'value') pairs, in the order of the datasets table.
value_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(
    value_labels,
    names=['dataset_id','data_type'],
)

In [26]:
# Check the relabelled columns: each is now a (dataset_id, data_type) pair.
data.head()

dataset_id,4712,5354,5356,5357,5358
data_type,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
YAL002W,-0.663079,-2.636625,-2.893052,-2.072377,-2.947533
YAL004W,-0.973443,-3.299888,-0.447756,-1.763872,-3.775156
YAL005C,-0.451959,-1.992544,-0.394764,-1.229691,-4.904059
YAL007C,0.218488,0.100929,-0.085496,-0.082342,-2.096215
YAL008W,0.655704,0.080652,0.396308,0.055863,-2.391714


## Subset to the genes currently in SGD

In [27]:
# Look up the gene id of every ORF in `data` via the gene table's systematic names.
# NOTE(review): path_to_genes is not defined in this notebook - presumably provided by yp_utils.py.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# ORFs absent from the gene table come back from reindex() as NaN ids.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 19


In [28]:
# Attach the gene ids, drop the ORFs that have none, and index the rows by (gene_id, orf).
data['gene_id'] = gene_ids
data = data[data['gene_id'].notna()]
# Safe to cast now that the NaN ids are gone.
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index()
data = data.set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,4712,5354,5356,5357,5358
Unnamed: 0_level_1,data_type,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2,YAL002W,-0.663079,-2.636625,-2.893052,-2.072377,-2.947533
1863,YAL004W,-0.973443,-3.299888,-0.447756,-1.763872,-3.775156
4,YAL005C,-0.451959,-1.992544,-0.394764,-1.229691,-4.904059
5,YAL007C,0.218488,0.100929,-0.085496,-0.082342,-2.096215
6,YAL008W,0.655704,0.080652,0.396308,0.055863,-2.391714


# Normalize

In [29]:
# Produce the normalized counterpart of `data` (labelled 'valuez' in the next cell).
# NOTE(review): normalize_phenotypic_scores is not defined in this notebook - presumably
# it comes from yp_utils.py (loaded via %run in the first cell); confirm there what
# has_tested=True means and whether row/column order is preserved.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [30]:
# Assign proper column names: label the normalized columns as (dataset_id, 'valuez') pairs,
# in the order of the datasets table.
valuez_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(
    valuez_labels,
    names=['dataset_id','data_type'],
)

In [31]:
# Blank out the normalized score wherever the raw value is missing.
# A boolean DataFrame used as a setitem key is aligned on labels. `data` is
# labelled (dataset_id, 'value') and `data_norm` (dataset_id, 'valuez'), so the
# raw mask shares no column with `data_norm` and the assignment masked nothing.
# Relabelling the mask with data_norm's columns (both follow the datasets-table
# order) makes it line up while still aligning rows by index.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw ('value') and normalized ('valuez') columns side by side.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,4712,5354,5356,5357,5358,4712,5354,5356,5357,5358
Unnamed: 0_level_1,data_type,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2,YAL002W,-0.663079,-2.636625,-2.893052,-2.072377,-2.947533,-0.672408,-1.193695,-3.080222,-1.169173,-0.131407
1863,YAL004W,-0.973443,-3.299888,-0.447756,-1.763872,-3.775156,-0.891656,-1.524357,-0.561812,-0.974653,-0.628103
4,YAL005C,-0.451959,-1.992544,-0.394764,-1.229691,-4.904059,-0.523268,-0.872597,-0.507235,-0.637838,-1.305611
5,YAL007C,0.218488,0.100929,-0.085496,-0.082342,-2.096215,-0.049649,0.17108,-0.18872,0.085596,0.379509
6,YAL008W,0.655704,0.080652,0.396308,0.055863,-2.391714,0.259211,0.160971,0.307489,0.172738,0.202167


# Print out

In [32]:
# Write one tab-separated file per data type, with dataset names as column headers
# and rows keyed by ORF only (the gene_id index level is dropped).
for data_type in ['value','valuez']:
    out_path = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(out_path, sep='\t')

# Save to DB

In [33]:
from IO.save_data_to_db3 import *

In [34]:
# Upload the combined value/valuez table for this paper.
# Per the log recorded below, the call first deletes all datasets stored for this
# PMID and then inserts the new data, so re-running the cell replaces them.
# NOTE(review): save_data_to_db presumably comes from the star import of
# IO.save_data_to_db3 in the previous cell.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/5 [00:00<?, ?it/s]

Deleting all datasets for PMID 20421943...
Inserting the new data...


100%|██████████| 5/5 [00:29<00:00,  5.83s/it]

Updating the data_modified_on field...



