In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to locate the datasets list and to key the DB save
paper_pmid = 16121259
# Short paper label; used below as the prefix of the output file names
paper_name = 'lee_giaever_2005'

In [46]:
# Read the tab-separated, header-less list of datasets for this paper (columns: dataset_id, name)
datasets_path = 'extras/YeastPhenome_{}_datasets_list.txt'.format(paper_pmid)
datasets = pd.read_csv(datasets_path, sep='\t', header=None, names=['dataset_id', 'name'])

In [47]:
# Key the datasets table by dataset_id so it can be reindexed by id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [18]:
# Load the 'OrfGeneData' sheet of the raw Excel file
original_data = pd.read_excel(
    'raw_data/pgen.0010024.sd001.xlsx',
    sheet_name='OrfGeneData',
)

In [19]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4758 x 115


In [20]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,ORF,Batch,Gene,GO_process,GO_function,GO_component,deletion_phenotype,result_np:04_02_24_05:cisplatin:500:uM::::hom_09_02:5,significant_np:04_02_24_05:cisplatin:500:uM::::hom_09_02:5,rank_np:04_02_24_05:cisplatin:500:uM::::hom_09_02:5,...,rank_np:04_05_12_10:camptothecin:30:ug/ml::::hom_09_02:20,result_np:04_07_16_08:camptothecin:30000:ug/ml::::hom_09_02:20,significant_np:04_07_16_08:camptothecin:30000:ug/ml::::hom_09_02:20,rank_np:04_07_16_08:camptothecin:30000:ug/ml::::hom_09_02:20,result_np:04_08_03_07:4nqo:0.0313:uM::::hom_09_02:5,significant_np:04_08_03_07:4nqo:0.0313:uM::::hom_09_02:5,rank_np:04_08_03_07:4nqo:0.0313:uM::::hom_09_02:5,result_np:04_08_05_01:4nqo:0.0313:uM::::hom_09_02:5,significant_np:04_08_05_01:4nqo:0.0313:uM::::hom_09_02:5,rank_np:04_08_05_01:4nqo:0.0313:uM::::hom_09_02:5
0,YAL002W,chr1_1,VPS8,late endosome to vacuole transport,molecular_function unknown,membrane fraction,viable,0.116074,0,2175,...,376,-0.008221,0,2065,0.25625,0,1242,0.786972,0,294
1,YAL004W,chr1_1,YAL004W,,,,viable,-0.064984,0,3170,...,929,-0.448587,0,4338,0.231566,0,1322,0.475449,0,650
2,YAL005C,chr1_1,SSA1,protein folding*,unfolded protein binding*,cytoplasm*,viable,0.096918,0,2277,...,2984,0.002291,0,1471,1.09786,0,143,0.669276,0,413
3,YAL007C,chr1_1,ERP2,ER to Golgi transport,molecular_function unknown,COPII-coated vesicle,viable,0.316698,0,1033,...,1426,-0.138989,0,2785,0.466131,0,730,0.803323,0,283
4,YAL008W,chr1_1,FUN14,biological_process unknown,molecular_function unknown,mitochondrion,viable,0.406762,0,692,...,1161,-0.31752,0,3880,0.400567,0,851,0.778996,0,306


In [21]:
# Store a string-typed copy of the 'ORF' column under the lowercase name 'orf';
# the cleaning/translation steps below operate on this new column
original_data['orf'] = original_data['ORF'].astype(str)

In [22]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is defined outside this notebook (presumably in yp_utils.py);
# the description above is taken from the original comment -- confirm against the helper
original_data['orf'] = clean_orf(original_data['orf'])

In [23]:
# Translate to ORFs
# NOTE(review): translate_sc is defined outside this notebook; presumably it maps
# gene names/aliases to systematic ORF names when called with to='orf' -- confirm
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [24]:
# Make sure everything translated ok: print every row whose identifier
# is not flagged by looks_like_orf (an empty frame means nothing was left behind)
is_orf = looks_like_orf(original_data['orf'])
print(original_data.loc[~is_orf, :])

Empty DataFrame
Columns: [ORF, Batch, Gene, GO_process, GO_function, GO_component, deletion_phenotype, result_np:04_02_24_05:cisplatin:500:uM::::hom_09_02:5, significant_np:04_02_24_05:cisplatin:500:uM::::hom_09_02:5, rank_np:04_02_24_05:cisplatin:500:uM::::hom_09_02:5, result_np:04_03_17_06:cisplatin:500:uM::::hom_09_02:5, significant_np:04_03_17_06:cisplatin:500:uM::::hom_09_02:5, rank_np:04_03_17_06:cisplatin:500:uM::::hom_09_02:5, result_np:03_04_04_03:cisplatin:500:uM::::hom_05_01:20, significant_np:03_04_04_03:cisplatin:500:uM::::hom_05_01:20, rank_np:03_04_04_03:cisplatin:500:uM::::hom_05_01:20, result_np:03_04_04_04:cisplatin:500:uM::::hom_05_01:20, significant_np:03_04_04_04:cisplatin:500:uM::::hom_05_01:20, rank_np:03_04_04_04:cisplatin:500:uM::::hom_05_01:20, result_np:04_03_09_01:carboplatin:15000:uM::::hom_09_02:5, significant_np:04_03_09_01:carboplatin:15000:uM::::hom_09_02:5, rank_np:04_03_09_01:carboplatin:15000:uM::::hom_09_02:5, result_np:04_03_30_02:carboplatin:15:mM

In [25]:
# Keep only the score columns, i.e. those whose name starts with 'result_np'
# (the 'significant_np' and 'rank_np' companions are not selected)
data_cols = [
    col_name
    for col_name in original_data.columns.values
    if col_name.startswith('result_np')
]
data_cols

['result_np:04_02_24_05:cisplatin:500:uM::::hom_09_02:5',
 'result_np:04_03_17_06:cisplatin:500:uM::::hom_09_02:5',
 'result_np:03_04_04_03:cisplatin:500:uM::::hom_05_01:20',
 'result_np:03_04_04_04:cisplatin:500:uM::::hom_05_01:20',
 'result_np:04_03_09_01:carboplatin:15000:uM::::hom_09_02:5',
 'result_np:04_03_30_02:carboplatin:15:mM::::hom_09_02:5',
 'result_np:04_02_24_07:oxaliplatin:4000:uM::::hom_09_02:5',
 'result_np:04_03_17_07:oxaliplatin:4000:uM::::hom_09_02:5',
 'result_np:04_03_17_10:psoralen irradiated:0.5:uM::::hom_09_02:5',
 'result_np:04_03_25_05:psoralen irradiated:0.5:uM::::hom_09_02:5',
 'result_np:04_03_17_12:angelicin irradiated:62.5:uM::::hom_09_02:5',
 'result_np:04_03_25_06:angelicin irradiated:62.5:uM::::hom_09_02:5',
 'result_np:03_12_09_16:mechlorethamine:62.5:uM::::hom_09_02:5',
 'result_np:03_12_19_01:mechlorethamine:62.5:uM::::hom_09_02:5',
 'result_np:04_01_21_09:mechlorethamine:62.5:uM::::hom_09_02:5',
 'result_np:02_12_18_13:mechlorethamine:62.5:uM::::h

In [26]:
# Use the cleaned ORF identifiers as the row index
original_data = original_data.set_index('orf')

In [27]:
# Keep only the score columns, coerce them to numeric (entries that cannot be
# parsed become NaN via errors='coerce') and flip the sign of every score.
# NOTE(review): the reason for the sign flip is not stated in this notebook -- confirm.
# to_numeric is applied column by column: the row-wise form (axis=1) produces the
# same values but makes one call per row instead of one per column.
original_data = -original_data[data_cols].apply(pd.to_numeric, errors='coerce')

In [28]:
# Collapse rows that share the same index label into one row holding their mean
original_data = original_data.groupby(level=0).mean()

In [29]:
# Size of the table after averaging rows with the same index label (rows, score columns)
original_data.shape

(4717, 36)

# Load dataset_ids

In [32]:
# Read the tab-separated, header-less mapping file; as previewed below,
# column 0 holds the raw score column name and column 1 the dataset id
dt = pd.read_csv('extras/screen_datasetids.txt', sep='\t', header=None)

In [33]:
# Preview the first rows of the column-name -> dataset-id mapping
dt.head()

Unnamed: 0,0,1
0,result_np:04_02_24_05:cisplatin:500:uM::::hom_...,494
1,result_np:04_03_17_06:cisplatin:500:uM::::hom_...,494
2,result_np:03_04_04_03:cisplatin:500:uM::::hom_...,4964
3,result_np:03_04_04_04:cisplatin:500:uM::::hom_...,4964
4,result_np:04_03_09_01:carboplatin:15000:uM::::...,4965


In [34]:
# Key the mapping by raw column name (column 0) and put its rows in the same
# order as the data columns; names absent from the mapping yield NaN rows
dt = dt.set_index(0).reindex(index=original_data.columns.values)

In [36]:
# Dataset id for each data column, in column order (column 1 of the reindexed mapping)
dataset_ids = dt[1].values

In [41]:
# Replace the raw column names with their dataset ids (positional assignment;
# several columns may now share the same id)
original_data.columns = dataset_ids

In [42]:
# Average the columns that share a dataset id:
# transpose, group the rows by label, take the mean, and transpose back
original_data = original_data.T.groupby(level=0).mean().T

# Prepare the final dataset

In [48]:
# Work on a copy so that the steps below do not modify original_data
data = original_data.copy()

In [49]:
# Order the datasets table like the data columns (one row per dataset id);
# ids absent from the datasets table yield NaN rows
dataset_ids = original_data.columns.values
datasets = datasets.reindex(index=dataset_ids)

In [50]:
# Label every data column with a (dataset_id, 'value') pair (positional assignment)
column_pairs = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(column_pairs, names=['dataset_id','data_type'])

In [51]:
# Preview the table with its two-level (dataset_id, data_type) column labels
data.head()

dataset_id,494,4964,4965,4966,4968,4969,4970,4971,4972,4973,4974,4975,4976,4977,4978,4979
data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
YAL002W,-0.338666,0.6947,-0.824313,-1.200845,-0.747015,-0.496401,-0.359019,0.736572,-0.969445,-0.488517,-0.028722,-0.201861,-0.51262,-0.195746,0.090528,-0.521611
YAL004W,0.032969,0.43037,-0.2826,-0.110506,-0.195279,-0.290994,-0.104367,0.215729,-0.209674,-0.333938,0.238522,-0.359463,-0.243652,0.150169,0.141267,-0.353507
YAL005C,-0.046967,0.510715,-0.210452,-0.163509,0.214631,-0.28561,-0.102062,0.504963,-0.374931,0.245477,0.190685,-0.293942,-0.055244,0.044331,0.381204,-0.883568
YAL007C,-0.219635,0.473252,-0.085367,-0.2167,-0.133014,-0.315444,-0.107625,0.606121,-0.376927,-0.088237,0.449832,-0.465453,-0.162218,0.043169,0.120192,-0.634727
YAL008W,-0.360553,0.232798,-0.409533,-0.268421,-0.556293,-0.306052,0.035683,-0.109149,-0.290878,-0.608148,0.466965,-0.741321,-0.435253,0.109985,-0.442898,-0.589781


## Subset to the genes currently in SGD

In [52]:
# Load the gene table and key it by systematic name, keeping 'id' as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs absent from the gene table get NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [53]:
# Attach the gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: the next line assigns into the
# result, and assigning into a frame obtained by filtering can otherwise trigger
# pandas' chained-assignment (SettingWithCopy) warning.
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were float because of the NaNs; with those rows removed they can be integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,494,4964,4965,4966,4968,4969,4970,4971,4972,4973,4974,4975,4976,4977,4978,4979
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
2,YAL002W,-0.338666,0.6947,-0.824313,-1.200845,-0.747015,-0.496401,-0.359019,0.736572,-0.969445,-0.488517,-0.028722,-0.201861,-0.51262,-0.195746,0.090528,-0.521611
1863,YAL004W,0.032969,0.43037,-0.2826,-0.110506,-0.195279,-0.290994,-0.104367,0.215729,-0.209674,-0.333938,0.238522,-0.359463,-0.243652,0.150169,0.141267,-0.353507
4,YAL005C,-0.046967,0.510715,-0.210452,-0.163509,0.214631,-0.28561,-0.102062,0.504963,-0.374931,0.245477,0.190685,-0.293942,-0.055244,0.044331,0.381204,-0.883568
5,YAL007C,-0.219635,0.473252,-0.085367,-0.2167,-0.133014,-0.315444,-0.107625,0.606121,-0.376927,-0.088237,0.449832,-0.465453,-0.162218,0.043169,0.120192,-0.634727
6,YAL008W,-0.360553,0.232798,-0.409533,-0.268421,-0.556293,-0.306052,0.035683,-0.109149,-0.290878,-0.608148,0.466965,-0.741321,-0.435253,0.109985,-0.442898,-0.589781


# Normalize

In [54]:
# Compute the normalized scores with the shared helper.
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook; the
# meaning of has_tested=True is not visible here -- confirm against the helper
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [55]:
# Assign proper column names: label every normalized column with a
# (dataset_id, 'valuez') pair (positional assignment)
column_pairs = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(column_pairs, names=['dataset_id','data_type'])

In [56]:
# Blank out the normalized score wherever the raw value is missing.
# pandas aligns a boolean DataFrame mask with the target on both index and column
# labels before applying it. `data` is labelled (dataset_id, 'value') while
# `data_norm` is labelled (dataset_id, 'valuez'), so a mask carrying `data`'s
# labels matches no column of `data_norm` and the assignment silently does nothing.
# Relabel the mask with data_norm's columns so that it actually applies.
# (Assumes both frames list the datasets in the same order, which the positional
# renaming of data_norm's columns already relies on.)
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan
# Put raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,494,4964,4965,4966,4968,4969,4970,4971,4972,4973,...,4970,4971,4972,4973,4974,4975,4976,4977,4978,4979
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,-0.338666,0.6947,-0.824313,-1.200845,-0.747015,-0.496401,-0.359019,0.736572,-0.969445,-0.488517,...,-0.704055,1.521954,-1.090261,-1.43705,-0.223454,0.049393,-1.065767,-0.609945,-0.006,-0.898849
1863,YAL004W,0.032969,0.43037,-0.2826,-0.110506,-0.195279,-0.290994,-0.104367,0.215729,-0.209674,-0.333938,...,-0.19798,0.379559,0.04246,-1.017946,0.441389,-0.284242,-0.362688,0.359577,0.097012,-0.601291
4,YAL005C,-0.046967,0.510715,-0.210452,-0.163509,0.214631,-0.28561,-0.102062,0.504963,-0.374931,0.245477,...,-0.1934,1.013951,-0.203917,0.552999,0.32238,-0.145538,0.129806,0.062936,0.584149,-1.539545
5,YAL007C,-0.219635,0.473252,-0.085367,-0.2167,-0.133014,-0.315444,-0.107625,0.606121,-0.376927,-0.088237,...,-0.204455,1.235827,-0.206893,-0.351786,0.967079,-0.508616,-0.149823,0.059681,0.054226,-1.099075
6,YAL008W,-0.360553,0.232798,-0.409533,-0.268421,-0.556293,-0.306052,0.035683,-0.109149,-0.290878,-0.608148,...,0.080343,-0.333013,-0.078605,-1.761401,1.009702,-1.092615,-0.86353,0.24695,-1.088998,-1.019517


# Print out

In [57]:
# Write one tab-separated file per data type: <paper_name>_value.txt and <paper_name>_valuez.txt
for data_type in ['value', 'valuez']:
    # Select the columns of this data type (drops the 'data_type' column level)
    out = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Replace the dataset ids with the dataset names (positional assignment)
    out.columns = datasets['name'].values
    # Keep only the ORF as row label in the output file
    out = out.droplevel('gene_id', axis=0)
    out.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [58]:
# Import only the function used below rather than everything the module exports,
# so the import cannot silently overwrite names already defined in this notebook
from IO.save_data_to_db3 import save_data_to_db

In [59]:
# Save the raw and normalized values to the database under this paper's PMID.
# As the logged output shows, this first deletes all existing datasets for the
# PMID and then inserts the new data.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 16121259...


  0%|          | 0/16 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 16/16 [01:57<00:00,  7.37s/it]

Updating the data_modified_on field...



