In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the source publication. They are reused further down to
# locate the dataset list, to name the exported files and to key the upload.
paper_pmid = 18437200  # PMID of the paper
paper_name = 'jin_freedman_2008'  # prefix of the exported file names

In [3]:
# Dataset ids and names listed for this paper (tab-separated file without a header row).
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset table by dataset_id so rows can be selected and ordered by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Names of the workbook sheets to load, in the order they are concatenated below.
sheets = ['EC10', 'EC50']

In [10]:
# NOTE(review): data_cols is only assigned inside the sheet-loading loop in a
# later cell, so on a fresh kernel (Restart & Run All) this cell raises
# NameError. The output recorded for this cell also lists 'gene', which the
# current [2:9] slice does not select, so it appears to come from an earlier
# run. Consider removing this cell or moving it after the loading loop.
data_cols

array(['Arsenic (400uM)', 'Cadmium (5uM)', 'Silver (10uM)',
       'Mercury (19uM)', 'Chromium (400uM)', 'Zinc (1000uM)',
       'Copper (5000uM)', 'gene'], dtype=object)

In [12]:
# Read every sheet of the supplementary workbook, translate gene names to
# ORFs and keep one row of numeric measurements per ORF.
original_data_list = []
for sheet in sheets:
    sheet_data = pd.read_excel('raw_data/journal.pgen.1000053.s005.xlsx', sheet_name=sheet, skiprows=3)
    print('Original data dimensions: %d x %d' % sheet_data.shape)

    # The gene-name column is labelled 'NAME' in one sheet and 'Gene' in the other.
    name_col = 'NAME' if 'NAME' in sheet_data.columns else 'Gene'
    sheet_data['gene'] = sheet_data[name_col].astype(str)
    sheet_data['gene'] = clean_genename(sheet_data['gene'])
    sheet_data['orf'] = translate_sc(sheet_data['gene'], to='orf')

    # Show any rows whose translated value does not look like an ORF
    # (an empty frame means every name translated).
    is_orf = looks_like_orf(sheet_data['orf'])
    print(sheet_data.loc[~is_orf])

    sheet_data = sheet_data.set_index('orf')

    # Columns 2..8 are the measurement columns; the two leading columns and the
    # trailing helper column 'gene' are left out.
    data_cols = sheet_data.columns.values[2:9]
    sheet_data = sheet_data[data_cols].astype(float)

    # Collapse repeated ORFs into a single row by averaging their values.
    sheet_data = sheet_data.groupby(sheet_data.index).mean()
    print(sheet_data.shape)

    original_data_list.append(sheet_data)

Original data dimensions: 90 x 9
Empty DataFrame
Columns: [Clustera, NAME, Arsenic (400uM), Cadmium (5uM), Silver (10uM), Mercury (19uM), Chromium (400uM), Zinc (1000uM), Copper (5000uM), gene, orf]
Index: []
(90, 7)
Original data dimensions: 540 x 9
Empty DataFrame
Columns: [Clustera, Gene, Arsenic (1250uM), Cadmium (25uM), Chromium (900uM), Copper (7000uM), Silver (20uM), Mercury (49uM), Zinc (2000uM), gene, orf]
Index: []
(539, 7)


In [13]:
# Put the per-sheet tables side by side, aligned on ORF.
original_data = pd.concat(original_data_list, axis='columns')

In [16]:
# Label the row index as 'orf'.
original_data = original_data.rename_axis('orf')

In [17]:
# Map every value x to 1/x - 1.
# NOTE(review): this reassigns original_data from itself, so re-running the
# cell applies the transform a second time.
original_data = original_data.rdiv(1).sub(1)

In [30]:
# Replace missing entries with 0.
original_data = original_data.fillna(0)

# Prepare the final dataset

In [31]:
# Work on an independent copy so the processed source table stays untouched.
data = original_data.copy(deep=True)

In [32]:
# Dataset ids, one per column of `data`; the columns are relabelled with these
# ids purely by position in the next cell.
# NOTE(review): the order presumably follows the first sheet's columns and
# then the second sheet's columns - confirm against the datasets list.
dataset_ids = [
    11772, 11773, 11771, 11774, 11775, 11776, 11777,
    1311, 1312, 1313, 1314, 1310, 1315, 1316,
]
datasets = datasets.reindex(dataset_ids)

In [33]:
# Label the raw-data columns as (dataset_id, 'value') pairs.
data.columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)

In [34]:
# Preview the first rows of the relabelled table.
data.head(n=5)

dataset_id,11772,11773,11771,11774,11775,11776,11777,1311,1312,1313,1314,1310,1315,1316
data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
YAL012W,0.030568,0.128912,-0.052759,-0.061948,-0.954403,-0.018358,-0.239381,-0.503904,-0.815733,-0.868211,-0.924607,-0.047864,-0.000338,-0.042699
YAL016W,0.014505,-0.17719,-0.121006,-0.024299,-0.91298,-0.106769,-0.063839,-0.602243,-0.936471,-0.913125,-0.098362,-0.169925,-0.051531,-0.490825
YAL021C,0.008575,-0.039011,-0.07297,-0.011142,-0.56438,-0.063385,0.011301,0.247533,-0.907793,-0.629889,-0.715012,-0.071074,0.043905,-0.905748
YBR011C,0.05565,-0.104099,0.001907,0.010928,-0.935668,-0.986409,-0.088074,0.238407,-0.548456,-0.947749,0.204142,-0.077641,0.100046,-0.995591
YBR112C,-0.477994,-0.814469,0.023667,-0.206407,-0.598995,-0.1287,-0.148014,-0.981128,-0.95182,-0.672829,-0.600542,-0.163635,-0.247555,-0.133055


## Subset to the genes currently in SGD

In [35]:
# Look up the gene id of every ORF in the data via its systematic name and
# report how many ORFs have no match.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [36]:
# Attach the gene ids looked up in the previous cell as a 'gene_id' column,
# keep only the rows that have one, then cast the ids to int.
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()]
data['gene_id'] = data['gene_id'].astype(int)
# Move gene_id and orf into a two-level row index.
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,11772,11773,11771,11774,11775,11776,11777,1311,1312,1313,1314,1310,1315,1316
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
10,YAL012W,0.030568,0.128912,-0.052759,-0.061948,-0.954403,-0.018358,-0.239381,-0.503904,-0.815733,-0.868211,-0.924607,-0.047864,-0.000338,-0.042699
14,YAL016W,0.014505,-0.17719,-0.121006,-0.024299,-0.91298,-0.106769,-0.063839,-0.602243,-0.936471,-0.913125,-0.098362,-0.169925,-0.051531,-0.490825
19,YAL021C,0.008575,-0.039011,-0.07297,-0.011142,-0.56438,-0.063385,0.011301,0.247533,-0.907793,-0.629889,-0.715012,-0.071074,0.043905,-0.905748
207,YBR011C,0.05565,-0.104099,0.001907,0.010928,-0.935668,-0.986409,-0.088074,0.238407,-0.548456,-0.947749,0.204142,-0.077641,0.100046,-0.995591
306,YBR112C,-0.477994,-0.814469,0.023667,-0.206407,-0.598995,-0.1287,-0.148014,-0.981128,-0.95182,-0.672829,-0.600542,-0.163635,-0.247555,-0.133055


# Normalize

In [37]:
# Compute the normalized scores from the raw values; the result is relabelled
# as 'valuez' in the next cell.
# NOTE(review): normalize_phenotypic_scores is not defined in this notebook -
# presumably it comes from yp_utils.py loaded via %run at the top; confirm the
# meaning of has_tested=False there.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [38]:
# Label the normalized columns as (dataset_id, 'valuez') pairs.
data_norm.columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)

In [39]:
# Blank out normalized scores wherever the raw score is missing, then combine
# raw and normalized values into one table.
#
# Assigning through a boolean DataFrame aligns the mask on labels. `data` is
# labelled (dataset_id, 'value') and `data_norm` (dataset_id, 'valuez'), so
# data.isnull() used directly matches no column of data_norm and the
# assignment silently changes nothing. Relabel the mask with data_norm's
# columns first; this assumes data_norm keeps data's column order, which the
# positional column renaming of data_norm already relies on.
missing_raw = data.isnull()
missing_raw.columns = data_norm.columns
data_norm[missing_raw] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,11772,11773,11771,11774,11775,11776,11777,1311,1312,1313,...,11775,11776,11777,1311,1312,1313,1314,1310,1315,1316
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
10,YAL012W,0.030568,0.128912,-0.052759,-0.061948,-0.954403,-0.018358,-0.239381,-0.503904,-0.815733,-0.868211,...,-10.019988,-0.32439,-7.810466,-4.597506,-4.606698,-5.123882,-7.489111,-1.514897,-0.164315,-0.386936
14,YAL016W,0.014505,-0.17719,-0.121006,-0.024299,-0.91298,-0.106769,-0.063839,-0.602243,-0.936471,-0.913125,...,-9.584011,-1.644351,-2.200182,-5.485623,-5.29295,-5.38732,-0.818929,-5.138687,-1.432835,-4.121144
19,YAL021C,0.008575,-0.039011,-0.07297,-0.011142,-0.56438,-0.063385,0.011301,0.247533,-0.907793,-0.629889,...,-5.915032,-0.996632,0.201269,2.188873,-5.129948,-3.726034,-5.797078,-2.203968,0.931968,-7.578681
207,YBR011C,0.05565,-0.104099,0.001907,0.010928,-0.935668,-0.986409,-0.088074,0.238407,-0.548456,-0.947749,...,-9.822806,-14.777194,-2.974729,2.106454,-3.087543,-5.590402,1.623145,-2.398918,2.323094,-8.327337
306,YBR112C,-0.477994,-0.814469,0.023667,-0.206407,-0.598995,-0.1287,-0.148014,-0.981128,-0.95182,-0.672829,...,-6.279353,-1.971775,-4.890394,-8.90741,-5.38019,-3.977894,-4.872976,-4.951951,-6.29011,-1.139867


# Print out

In [40]:
# Write one tab-separated file per data type, with dataset names as column
# headers and only the ORF as row label.
for data_type in ['value', 'valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [41]:
from IO.save_data_to_db3 import *

In [42]:
# Upload the combined raw + normalized table for this paper to the database.
# Per the log this call prints, it deletes all existing datasets for the PMID
# before inserting the new data, so it is destructive - run it only once the
# tables above look right.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/14 [00:00<?, ?it/s]

Deleting all datasets for PMID 18437200...
Inserting the new data...


100%|██████████| 14/14 [00:11<00:00,  1.19it/s]

Updating the data_modified_on field...



