In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers for the publication processed by this notebook.
paper_name = 'bleackley_macgillivray_2011'  # prefix of the exported files
paper_pmid = 21212869                       # PubMed ID

In [3]:
# Tab-separated, header-less list of this paper's datasets: one (dataset_id, name) pair per row.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by dataset_id so that it can be reindexed by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Read the raw measurements from the 'rawdata' sheet of the workbook.
raw_workbook = 'raw_data/metallomicsbleackley raw data.xls'
original_data = pd.read_excel(raw_workbook, sheet_name='rawdata')

In [6]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 24577 x 14


In [7]:
# Preview the first rows of the raw table.
original_data.head()

Unnamed: 0,Orf,Gene,Fe,p-value,Cu,p-value.1,Mn,p-value.2,Ni,p-value.3,Zn,p-value.4,Co,p-value.5
0,undefined,undefined,,,,,,,,,,,,
1,YMR252C,YMR252C,0.866133,0.973444,1.02104,1.0,0.97292,1.0,1.03667,1.0,1.03562,1.0,1.02811,1.0
2,YLL040C,VPS13,0.711493,0.04727,1.13738,1.0,1.04188,1.0,1.39008,1.0,1.01973,1.0,0.891561,0.981706
3,YNL276C,YNL276C,0.963582,0.904782,1.00037,1.0,0.947105,0.881325,0.993056,1.0,0.814685,0.911056,1.14256,1.0
4,YAL068C,YAL068C,0.875378,0.617833,1.02895,1.0,0.938812,0.984705,0.969529,0.983776,1.00913,1.0,1.00164,1.0


In [8]:
# Work on a string version of the 'Orf' column; the source column itself is left untouched.
orf_text = original_data['Orf'].astype(str)
original_data['orf'] = orf_text

In [9]:
# Normalize the ORF strings with the shared clean_orf helper
# (per the original note: eliminates white spaces & capitalizes).
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [10]:
# Translate the identifiers to ORFs
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [11]:
# Sanity check: list every row whose identifier still does not look like an ORF
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t]
print(not_orf_rows)

                   Orf       Gene Fe  p-value Cu  p-value.1 Mn  p-value.2 Ni  \
index_input                                                                    
0            undefined  undefined         NaN           NaN           NaN      
6            undefined  undefined         NaN           NaN           NaN      
17           undefined  undefined         NaN           NaN           NaN      
23           undefined  undefined         NaN           NaN           NaN      
80           undefined  undefined         NaN           NaN           NaN      
...                ...        ... ..      ... ..        ... ..        ... ..   
24572                                     NaN           NaN           NaN      
24573                                     NaN           NaN           NaN      
24574                                     NaN           NaN           NaN      
24575                                     NaN           NaN           NaN      
24576                                   

In [14]:
# Keep only the rows whose identifier passed the ORF check.
original_data = original_data.loc[t]

In [15]:
# Use the cleaned ORF as the row index.
original_data = original_data.set_index('orf')

In [16]:
# Retain only the six per-metal value columns (the p-value columns are dropped).
metal_columns = ['Fe', 'Cu', 'Mn', 'Ni', 'Zn', 'Co']
original_data = original_data.loc[:, metal_columns].copy()

In [17]:
# Coerce every column to numeric; anything that cannot be parsed becomes NaN.
# pd.to_numeric is applied once per column (the default axis) rather than once per row:
# each cell is coerced the same way, without building a Series for every row.
original_data = original_data.apply(pd.to_numeric, errors='coerce')

In [18]:
# Collapse ORFs that occur in more than one row by averaging their values.
original_data = original_data.groupby(level=0).mean()

In [19]:
# Dimensions after averaging: one row per unique ORF, one column per metal.
original_data.shape

(4765, 6)

# Prepare the final dataset

In [20]:
# Work on an independent (deep) copy so that original_data stays unchanged.
data = original_data.copy(deep=True)

In [21]:
# Restrict the dataset list to ids 20..25, in that order.
# NOTE(review): the columns are relabeled positionally below, so this assumes ids 20-25
# correspond to Fe, Cu, Mn, Ni, Zn, Co in that order — confirm against the datasets list.
dataset_ids = np.arange(20, 26)
datasets = datasets.reindex(dataset_ids)

In [22]:
# Label each column with a (dataset_id, 'value') pair.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id', 'data_type'])
data.columns = idx

In [23]:
# Preview the relabeled table.
data.head()

dataset_id,20,21,22,23,24,25
data_type,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
YAL002W,0.921386,0.839092,0.943714,1.166588,0.869552,0.461767
YAL004W,1.17934,1.031561,1.022911,0.919253,0.96303,1.142426
YAL005C,0.807115,0.954682,0.94625,1.038537,1.067878,0.784079
YAL007C,0.911574,0.929408,1.088321,0.980989,0.99668,0.921879
YAL008W,0.784962,1.026864,1.070975,0.898266,1.096596,0.980908


## Subset to the genes currently in SGD

In [24]:
# Map every ORF to its gene id via the reference gene table, keyed by systematic name.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# reindex yields NaN for the ORFs that are absent from the gene table.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [25]:
# Attach the gene ids and drop the ORFs that could not be matched to a gene.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the column assignment below writes
# to a frame of its own rather than to a slice (avoids pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,20,21,22,23,24,25
Unnamed: 0_level_1,data_type,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2,YAL002W,0.921386,0.839092,0.943714,1.166588,0.869552,0.461767
1863,YAL004W,1.17934,1.031561,1.022911,0.919253,0.96303,1.142426
4,YAL005C,0.807115,0.954682,0.94625,1.038537,1.067878,0.784079
5,YAL007C,0.911574,0.929408,1.088321,0.980989,0.99668,0.921879
6,YAL008W,0.784962,1.026864,1.070975,0.898266,1.096596,0.980908


# Normalize

In [26]:
# Normalize the raw scores with the shared normalize_phenotypic_scores helper.
# NOTE(review): the effect of has_tested=True is defined inside the helper and is not
# visible in this notebook — confirm it matches this screen's design.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [27]:
# Assign proper column names: each normalized column becomes (dataset_id, 'valuez')
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id', 'data_type'])
data_norm.columns = idx

In [28]:
# Blank out the normalized score wherever the raw value is missing.
# A DataFrame mask is aligned on labels. data's columns are tagged 'value' while
# data_norm's are tagged 'valuez', so data.isnull() used directly matches no column and
# masks nothing. Relabel the mask with data_norm's columns first; rows are still
# aligned on the (gene_id, orf) index.
# NOTE(review): assumes data_norm keeps the datasets in the same column order as data,
# which the positional relabeling of data_norm's columns already relies on.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan
# Raw and normalized values side by side.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,20,21,22,23,24,25,20,21,22,23,24,25
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2,YAL002W,0.921386,0.839092,0.943714,1.166588,0.869552,0.461767,0.050301,-0.777008,-0.477528,0.993947,-1.088117,-2.058424
1863,YAL004W,1.17934,1.031561,1.022911,0.919253,0.96303,1.142426,0.45501,0.214307,-0.089968,-0.362082,-0.457769,0.172551
4,YAL005C,0.807115,0.954682,0.94625,1.038537,1.067878,0.784079,-0.128981,-0.18166,-0.46512,0.291897,0.249261,-1.001992
5,YAL007C,0.911574,0.929408,1.088321,0.980989,0.99668,0.921879,0.034906,-0.311833,0.230126,-0.023613,-0.230852,-0.550329
6,YAL008W,0.784962,1.026864,1.070975,0.898266,1.096596,0.980908,-0.163737,0.190115,0.145242,-0.477144,0.442915,-0.356853


# Print out

In [29]:
# Export one tab-separated file per data type, named paper_name + '_' + data type + '.txt'.
for data_type in ('value', 'valuez'):
    # Select this data type's columns and label them with the dataset names.
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    # Rows are keyed by ORF only in the exported file.
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [30]:
from IO.save_data_to_db3 import *

In [31]:
# Store the combined table in the database under this paper's PMID.
# Per the run log captured with this cell, the existing datasets for the PMID are
# deleted before the new data is inserted, so re-running replaces the stored data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/6 [00:00<?, ?it/s]

Deleting all datasets for PMID 21212869...
Inserting the new data...


100%|██████████| 6/6 [00:40<00:00,  6.70s/it]

Updating the data_modified_on field...



