In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PMID of the source paper; used below to locate the datasets list file and when saving to the DB
paper_pmid = 22244311
# Prefix of the tab-delimited output files written in the "Print out" section
paper_name = 'pir_oliver_2012' 

In [3]:
# Tab-delimited, header-less list of this paper's datasets: one (dataset_id, name) pair per row
datasets_list_path = f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the datasets table by dataset_id so rows can later be selected/reordered by id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Load the 'data' sheet of the raw Excel file into a DataFrame
original_data = pd.read_excel('raw_data/12918_2012_852_MOESM3_ESM.xlsx', sheet_name='data')

In [6]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5603 x 22


In [7]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,ORF,C01.FCC',C01.P,C01.FDR,N01.FCC',N01.P,N01.FDR,P01.FCC',P01.P,P01.FDR,...,N02.FDR,N03.FCC',N03.P,N03.FDR,FPM03.FCC',FPM03.P,FPM03.FDR,FPMTURB.FCC',FPMTURB.P,FPMTURB.FDR
0,YAL001C,9.6e-05,0.91526,0.944105,-0.001095,0.065967,0.109748,0.00017,0.835859,0.864944,...,0.141287,-0.000196,0.734214,0.839357,-0.000378,0.650849,0.801506,-0.006045,0.01021981,0.02371797
1,YAL002W,-0.001993,0.026023,0.068446,-0.003648,0.000111,0.00061,-0.001709,0.019267,0.03664,...,0.382266,0.001871,0.0035,0.027094,0.000887,0.037925,0.117223,-0.010963,1.965168e-05,0.0001158353
2,YAL003W,-0.000943,0.22925,0.352757,-0.003003,0.003925,0.010802,0.001522,0.318443,0.384209,...,0.746753,-0.000987,0.153352,0.311051,-0.000441,0.562403,0.736396,-0.007982,0.0009259972,0.003151266
3,YAL004W,-0.002382,0.007998,0.02776,-0.003644,2.9e-05,0.000213,-0.004601,3e-06,2.2e-05,...,1e-05,0.000551,0.124345,0.268653,5e-06,0.989529,0.994539,-0.002847,0.09238832,0.1501247
4,YAL005C,-0.013759,0.003949,0.016252,-0.013101,2.7e-05,0.0002,-0.006521,0.013591,0.027155,...,0.004946,0.000161,0.901788,0.948002,-0.003349,0.096879,0.23028,-0.045376,4.379848e-12,6.803134e-11


In [8]:
# Copy the 'ORF' column, cast to str, into a new 'orf' column that the following cells clean up
original_data['orf'] = original_data['ORF'].astype(str)

In [9]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably provided by the
# %run of yp_utils.py) — confirm it does what the comment above says.
original_data['orf'] = clean_orf(original_data['orf'])

In [10]:
# Translate to ORFs 
# NOTE(review): translate_sc is not defined in this notebook (presumably provided by the
# %run of yp_utils.py); with to='orf' it is expected to map identifiers to ORF names — confirm.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [11]:
# Sanity check on the translation: show every row whose 'orf' value is not
# flagged by looks_like_orf. An empty frame means nothing needs attention.
t = looks_like_orf(original_data['orf'])
rows_failing_check = original_data.loc[~t, :]
print(rows_failing_check)

Empty DataFrame
Columns: [ORF, C01.FCC', C01.P, C01.FDR, N01.FCC', N01.P, N01.FDR, P01.FCC', P01.P, P01.FDR, N02.FCC', N02.P, N02.FDR, N03.FCC', N03.P, N03.FDR, FPM03.FCC', FPM03.P, FPM03.FDR, FPMTURB.FCC', FPMTURB.P, FPMTURB.FDR, orf]
Index: []

[0 rows x 23 columns]


In [12]:
# Use the cleaned 'orf' values as the row index
original_data = original_data.set_index('orf')

In [13]:
# Keep only the columns whose name contains '.FCC' (drops the '.P' and '.FDR' columns)
data_cols = list(filter(lambda name: '.FCC' in name, original_data.columns.values))

In [15]:
# Restrict the table to the selected data columns (explicit copy, not a view)
original_data = original_data.loc[:, data_cols].copy()

In [16]:
# Collapse rows sharing the same ORF (the row index) into one row by averaging their values
original_data = original_data.groupby(level=0).mean()

In [17]:
# Size of the table after subsetting to the data columns and averaging duplicate ORFs
original_data.shape

(5580, 7)

# Prepare the final dataset

In [18]:
# Work on a copy so the processed original_data stays untouched by the steps below
data = original_data.copy()

In [19]:
# Dataset ids listed in the same order as the data columns: the ids are attached to the
# columns purely by position in the next cell, so this order must match the column order.
dataset_ids = [11859, 11860, 11861, 15991, 15992, 15995, 15996]
# Select/reorder the datasets table to follow dataset_ids (ids absent from the table become NaN rows)
datasets = datasets.reindex(index=dataset_ids)

In [20]:
# Relabel the data columns with a two-level header: (dataset_id, 'value').
# Labels are assigned by position, one per row of the datasets table.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [21]:
# Preview the table with its new (dataset_id, data_type) column header
data.head()

dataset_id,11859,11860,11861,15991,15992,15995,15996
data_type,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
YAL001C,9.6e-05,-0.001095,0.00017,0.003133,-0.000196,-0.000378,-0.006045
YAL002W,-0.001993,-0.003648,-0.001709,0.002064,0.001871,0.000887,-0.010963
YAL003W,-0.000943,-0.003003,0.001522,0.000586,-0.000987,-0.000441,-0.007982
YAL004W,-0.002382,-0.003644,-0.004601,-0.007209,0.000551,5e-06,-0.002847
YAL005C,-0.013759,-0.013101,-0.006521,-0.019683,0.000161,-0.003349,-0.045376


## Subset to the genes currently in SGD

In [22]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs absent from the gene table yield NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [23]:
# Attach the looked-up gene ids as a column (NaN where the ORF was not found)
data['gene_id'] = gene_ids
# Drop ORFs without a gene id. The explicit .copy() makes the filtered frame an
# independent object, so the column assignment on the next line does not write
# onto a slice of the previous frame (avoids pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,11859,11860,11861,15991,15992,15995,15996
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,YAL001C,9.6e-05,-0.001095,0.00017,0.003133,-0.000196,-0.000378,-0.006045
2,YAL002W,-0.001993,-0.003648,-0.001709,0.002064,0.001871,0.000887,-0.010963
3,YAL003W,-0.000943,-0.003003,0.001522,0.000586,-0.000987,-0.000441,-0.007982
1863,YAL004W,-0.002382,-0.003644,-0.004601,-0.007209,0.000551,5e-06,-0.002847
4,YAL005C,-0.013759,-0.013101,-0.006521,-0.019683,0.000161,-0.003349,-0.045376


# Normalize

In [24]:
# Compute normalized scores from the raw values.
# NOTE(review): normalize_phenotypic_scores is not defined in this notebook (presumably
# provided by the %run of yp_utils.py); the meaning of has_tested=True should be confirmed there.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [25]:
# Assign proper column names: label the normalized columns as (dataset_id, 'valuez'),
# by position, one per row of the datasets table.
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [26]:
# Propagate missing raw values into the normalized table.
# Boolean-DataFrame assignment aligns the mask to the target by row AND column labels.
# The raw columns are labelled (dataset_id, 'value') while the normalized ones are
# (dataset_id, 'valuez'), so an unrelabelled mask matches no column and the assignment
# silently does nothing. Relabel the mask (by position) to data_norm's columns first.
# NOTE(review): assumes data_norm's columns are in the same dataset order as data's — confirm.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw ('value') and normalized ('valuez') columns side by side, matched on the row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,11859,11860,11861,15991,15992,15995,15996,11859,11860,11861,15991,15992,15995,15996
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1,YAL001C,9.6e-05,-0.001095,0.00017,0.003133,-0.000196,-0.000378,-0.006045,-0.038939,-0.780892,0.333107,0.487558,0.009411,-0.246361,-0.745007
2,YAL002W,-0.001993,-0.003648,-0.001709,0.002064,0.001871,0.000887,-0.010963,-0.907388,-1.905312,-0.299871,0.205069,1.399358,0.273883,-1.563454
3,YAL003W,-0.000943,-0.003003,0.001522,0.000586,-0.000987,-0.000441,-0.007982,-0.470698,-1.621269,0.788807,-0.185024,-0.522747,-0.272237,-1.067468
1863,YAL004W,-0.002382,-0.003644,-0.004601,-0.007209,0.000551,5e-06,-0.002847,-1.068837,-1.903842,-1.274368,-2.243035,0.511675,-0.088775,-0.212773
4,YAL005C,-0.013759,-0.013101,-0.006521,-0.019683,0.000161,-0.003349,-0.045376,-5.7991,-6.069234,-1.921552,-5.536533,0.249319,-1.468565,-7.290446


# Print out

In [27]:
# Write one tab-delimited file per data type: raw values and normalized values
for data_type in ['value', 'valuez']:
    out_path = paper_name + '_' + data_type + '.txt'
    # Select the columns of this data type, then label them with the dataset names
    df = data_all.xs(data_type, level='data_type', axis=1).copy()
    df.columns = datasets['name'].values
    # Rows are keyed by ORF only in the output file
    df = df.droplevel('gene_id', axis=0)
    df.to_csv(out_path, sep='\t')

# Save to DB

In [28]:
from IO.save_data_to_db3 import *

In [29]:
# Store the combined raw + normalized table in the database under this paper's PMID.
# Per the logged output below, this first deletes all existing datasets for the PMID
# and then inserts the new data, so re-running the cell replaces what was stored.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 22244311...


  0%|          | 0/7 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 7/7 [01:00<00:00,  8.59s/it]

Updating the data_modified_on field...



