In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed in this notebook
paper_pmid: int = 18688276  # PubMed ID: locates the dataset list file and keys the DB save
paper_name: str = 'ericson_nislow_2008'  # prefix of the exported value/valuez text files

In [3]:
# Dataset list for this paper: tab-separated, no header row, columns dataset_id and name
datasets_path = f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt'
datasets = pd.read_csv(datasets_path, sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
# Key the dataset list by dataset_id so names can be looked up by id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [109]:
# Raw supplementary table; the first two spreadsheet rows are skipped before the header
raw_xlsx_path = 'raw_data/pgen.1000151.s003.xlsx'
original_data = pd.read_excel(raw_xlsx_path, sheet_name='Sheet1', skiprows=2)

In [110]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print(f'Original data dimensions: {n_rows:d} x {n_cols:d}')

Original data dimensions: 5226 x 95


In [111]:
# Preview the first rows of the raw table (identifier, compound and annotation columns)
original_data.head()

Unnamed: 0,ORF,Gene,6-nitroquipazine maleate,Fluvoxamine maleate,Zardaverine,Imetit dihydrobromide,Iodophenpropit dihydrobromide,Salmeterol,SR 59230A hydrochloride,LY 367385,...,Clomipramine hydrochloride.1,SKF 83822 hydrobromide,SKF 83822 hydrobromide.1,Description,Feature_qualifier,GO_process,GO_function,GO_component,Essential_gene,Zygosity
0,YAL001C,TFC3,-0.808898,-1.32501,-0.670831,-0.325576,-0.252302,-0.172921,0.010748,-0.58001,...,-0.759544,-0.098509,-0.396676,Largest of six subunits of the RNA polymerase ...,Verified,transcription initiation from RNA polymerase I...,RNA polymerase III transcription factor activity,mitochondrion*,yes,het
1,YAL002W,VPS8,-1.09763,-0.770822,-0.991552,-1.19722,-0.793416,-0.761811,-0.301568,-0.779557,...,0.489359,0.066836,0.101663,Membrane-associated hydrophilic protein that i...,Verified,late endosome to vacuole transport,molecular function unknown,membrane fraction,no,hom
2,YAL003W,EFB1,-0.483805,-0.335716,-0.450622,-0.182541,-0.434479,-0.781484,-0.781484,-0.536665,...,0.606146,0.58229,0.838908,Translation elongation factor 1 beta; stimulat...,Verified,translational elongation,translation elongation factor activity,ribosome*,yes,het
3,YAL005C,SSA1,-0.047399,-0.094895,0.004581,-0.028918,-0.035101,-0.108385,-0.159151,-0.112678,...,0.0075,0.081721,0.089917,ATPase involved in protein folding and nuclear...,Verified,translation*,ATPase activity*,cytoplasm*,no,hom
4,YAL007C,ERP2,0.115854,-0.014314,-0.075712,0.204233,0.017717,0.219656,-0.094251,0.104052,...,-0.358478,-0.077984,-0.193587,Protein that forms a heterotrimeric complex wi...,Verified,ER to Golgi vesicle-mediated transport,molecular function unknown,ER to Golgi transport vesicle,no,hom


In [112]:
# Working copy of the identifier column, as strings, under the lowercase name 'orf'
original_data = original_data.assign(orf=original_data['ORF'].astype(str))

In [113]:
# Normalize the identifiers: strip all whitespace and convert to upper case
original_data['orf'] = original_data['orf'].pipe(clean_orf)

In [114]:
# Convert every identifier to its ORF (systematic) form
original_data['orf'] = original_data['orf'].pipe(translate_sc, to='orf')

In [115]:
# Sanity check: list any rows whose identifier still does not look like an ORF.
# An empty frame means every identifier translated cleanly.
is_orf = looks_like_orf(original_data['orf'])
print(original_data.loc[~is_orf])

Empty DataFrame
Columns: [ORF, Gene, 6-nitroquipazine maleate, Fluvoxamine maleate, Zardaverine, Imetit dihydrobromide, Iodophenpropit dihydrobromide, Salmeterol, SR 59230A hydrochloride, LY 367385, MPEP hydrochloride, CGP 78608 hydrochloride, (S)-MCPG, 7-chlorokynurenic acid, GBR 13069 dihydrochloride, GBR 12909 dihydrochloride, GBR 12783 dihydrochloride, GBR 12935 dihydrochloride, (RS)-(+/-)-Sulpiride, 3'-Fluorobenzylspiperone maleate, A 77636 hydrochloride, 3a-Bis-(4-fluorophenyl) methoxytropane hydrochloride, Haloperidol hydrochloride, Bromocriptine mesylate, AMI-193, (+)-AJ 76 hydrochloride, BTCP, 7-Hydroxy-PIPAT maleate, (S)-(-)-sulpiride, Metergoline, Parthenolide, 5-Nonyloxytryptamine oxalate, Cyproheptadine hydrochloride, GR 127935 hydrochloride, NAN 190 hydrobromide, CGS 12066B dimaleate, Paroxetine maleate, WAY 629, Altanserin hydrochloride, Sertraline hydrochloride, Ritanserin, Carmoxirole hydrochloride, SCH 39166 hydrobromide, L-741,741 hydrochloride, CY 208-243, L741,742 

In [116]:
# Index the table by the cleaned ORF identifier
original_data = original_data.set_index('orf')

In [117]:
# Measurement columns sit at positions 2..87 (86 columns); per the preview above these
# are the compound columns between 'Gene' and the trailing annotation columns.
data_cols = original_data.columns[2:88]

In [118]:
# Split the measurements by zygosity: original_data1 = 'hom' rows, original_data2 = 'het' rows
zygosity = original_data['Zygosity']
original_data1 = original_data.loc[zygosity == 'hom', data_cols].copy()
original_data2 = original_data.loc[zygosity == 'het', data_cols].copy()

In [119]:
# Coerce every measurement to a number; anything that cannot be parsed becomes NaN.
# The conversion runs column by column (apply's default axis): pd.to_numeric is
# element-wise, so the values are the same as a row-wise pass, but it takes one call
# per compound column instead of one per ORF row.
original_data1 = original_data1.apply(pd.to_numeric, errors='coerce')
original_data2 = original_data2.apply(pd.to_numeric, errors='coerce')

In [120]:
# Collapse ORFs that appear on more than one row by averaging their measurements
original_data1 = original_data1.groupby(level=0).mean()
original_data2 = original_data2.groupby(level=0).mean()

In [121]:
# Size of the 'hom' table after averaging duplicate ORFs (ORFs x measurement columns)
original_data1.shape

(4201, 86)

In [122]:
# Size of the 'het' table after averaging duplicate ORFs (ORFs x measurement columns)
original_data2.shape

(1000, 86)

# Load dataset info

In [123]:
# Keep only the part of each column label before the first '.', so repeated columns
# such as 'X' and 'X.1' end up with the same label.
# NOTE(review): a label that itself contains a '.' would also be truncated here —
# confirm no compound name in the sheet does.
original_data1.columns = original_data1.columns.map(lambda label: label.split('.')[0])
original_data2.columns = original_data2.columns.map(lambda label: label.split('.')[0])

In [124]:
# Lookup table from column label to dataset ids, indexed by its first (unnamed) column
df = (
    pd.read_excel('extras/drug_dataset.xlsx', sheet_name='Sheet1')
    .set_index('Unnamed: 0')
)

In [125]:
# Keep one row per (Hom dataset, Het dataset) pair, then show the resulting size
df = df.drop_duplicates(subset=['Hom dataset','Het dataset'])
df.shape

(81, 4)

In [126]:
# Translate each table's column labels into dataset ids via the lookup table.
# Each list is built from the columns of the table it will relabel, so the het ids
# stay correct even if the two tables ever stop sharing the same columns.
hom_dataset_id = [df.loc[c,'Hom dataset'] for c in original_data1.columns.values]
het_dataset_id = [df.loc[c,'Het dataset'] for c in original_data2.columns.values]

In [127]:
# Relabel the columns of each table with the dataset ids looked up above
# (hom ids for original_data1, het ids for original_data2)
original_data1.columns = hom_dataset_id
original_data2.columns = het_dataset_id

In [128]:
# Columns sharing a dataset id are averaged: transpose so datasets are rows, take the
# mean per dataset id, then transpose back to ORFs x datasets
hom_by_dataset = original_data1.T
original_data1 = hom_by_dataset.groupby(hom_by_dataset.index).mean().T
original_data1.shape

(4201, 81)

In [129]:
# Columns sharing a dataset id are averaged: transpose so datasets are rows, take the
# mean per dataset id, then transpose back to ORFs x datasets
het_by_dataset = original_data2.T
original_data2 = het_by_dataset.groupby(het_by_dataset.index).mean().T
original_data2.shape

(1000, 81)

In [130]:
# Combine the hom and het tables side by side on the ORF index; the outer join keeps
# ORFs present in either table, leaving NaN where a table has no row for that ORF.
# Note: this rebinds `original_data`, replacing the raw table loaded earlier.
original_data = original_data1.join(original_data2, how='outer')

In [131]:
# Preview the combined table (ORFs x dataset ids)
original_data.head()

Unnamed: 0_level_0,509,5009,5010,5011,5012,5013,5014,5015,5016,5017,...,5160,5161,5162,5163,5164,5165,5166,5167,5168,5169
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YAL001C,,,,,,,,,,,...,0.382196,0.540964,-0.247592,0.270842,0.010748,0.886951,0.025027,-1.95364,-0.56697,-0.670831
YAL002W,1.084844,0.770241,0.714035,0.307677,0.60043,-0.728671,1.333645,0.977858,1.31404,1.89311,...,,,,,,,,,,
YAL003W,,,,,,,,,,,...,0.30576,-0.017437,0.710599,0.589144,-0.781484,0.096067,-1.63228,-2.38105,0.223168,-0.450622
YAL004W,0.585719,0.861604,0.545551,-0.031649,0.128335,-0.071324,0.668568,0.277145,0.034361,0.899422,...,,,,,,,,,,
YAL005C,0.052817,0.000286,0.035713,0.011217,0.106191,-0.035101,-0.021304,0.081318,-0.012155,0.016566,...,,,,,,,,,,


In [132]:
# The source values are log2(ctrl/treatment), so flip the sign of every value.
# Running this cell twice would flip the sign back.
original_data = original_data.mul(-1)

# Prepare the final dataset

In [133]:
# Work on an independent copy so the steps below do not modify original_data
data = original_data.copy(deep=True)

In [134]:
# Reorder the dataset list to follow the column order of the data table
dataset_ids = original_data.columns.to_numpy()
datasets = datasets.reindex(dataset_ids)

In [135]:
# Two-level column header: (dataset_id, data_type), with data_type 'value' for raw scores
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id','data_type'])

In [136]:
# Preview the table with its (dataset_id, data_type) column header
data.head()

dataset_id,509,5009,5010,5011,5012,5013,5014,5015,5016,5017,...,5160,5161,5162,5163,5164,5165,5166,5167,5168,5169
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL001C,,,,,,,,,,,...,-0.382196,-0.540964,0.247592,-0.270842,-0.010748,-0.886951,-0.025027,1.95364,0.56697,0.670831
YAL002W,-1.084844,-0.770241,-0.714035,-0.307677,-0.60043,0.728671,-1.333645,-0.977858,-1.31404,-1.89311,...,,,,,,,,,,
YAL003W,,,,,,,,,,,...,-0.30576,0.017437,-0.710599,-0.589144,0.781484,-0.096067,1.63228,2.38105,-0.223168,0.450622
YAL004W,-0.585719,-0.861604,-0.545551,0.031649,-0.128335,0.071324,-0.668568,-0.277145,-0.034361,-0.899422,...,,,,,,,,,,
YAL005C,-0.052817,-0.000286,-0.035713,-0.011217,-0.106191,0.035101,0.021304,-0.081318,0.012155,-0.016566,...,,,,,,,,,,


## Subset to the genes currently in SGD

In [137]:
# Gene table re-keyed by systematic name, keeping the original 'id' as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Gene id for every ORF in the data; ORFs absent from the gene table come back as NaN
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: {:d}'.format(int(num_missing)))

ORFs missing from SGD: 3


In [138]:
# Attach the gene ids and drop ORFs that have none
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows so the dtype cast below writes to an
# independent frame rather than to a slice of the unfiltered one (this avoids the
# chained-assignment / SettingWithCopyWarning pattern).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,509,5009,5010,5011,5012,5013,5014,5015,5016,5017,...,5160,5161,5162,5163,5164,5165,5166,5167,5168,5169
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,,,,,,,,,,,...,-0.382196,-0.540964,0.247592,-0.270842,-0.010748,-0.886951,-0.025027,1.95364,0.56697,0.670831
2,YAL002W,-1.084844,-0.770241,-0.714035,-0.307677,-0.60043,0.728671,-1.333645,-0.977858,-1.31404,-1.89311,...,,,,,,,,,,
3,YAL003W,,,,,,,,,,,...,-0.30576,0.017437,-0.710599,-0.589144,0.781484,-0.096067,1.63228,2.38105,-0.223168,0.450622
1863,YAL004W,-0.585719,-0.861604,-0.545551,0.031649,-0.128335,0.071324,-0.668568,-0.277145,-0.034361,-0.899422,...,,,,,,,,,,
4,YAL005C,-0.052817,-0.000286,-0.035713,-0.011217,-0.106191,0.035101,0.021304,-0.081318,0.012155,-0.016566,...,,,,,,,,,,


# Normalize

In [139]:
# Normalized counterpart of the raw scores, computed by the shared helper
data_norm = data.pipe(normalize_phenotypic_scores, has_tested=True)

In [140]:
# Label the normalized columns as (dataset_id, 'valuez'), in the same dataset order
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id','data_type'])

In [141]:
# Wherever the raw value is missing, the normalized value must be missing too.
# Assignment through a boolean DataFrame aligns the mask to the target on labels.
# The raw table's columns are labeled (dataset_id, 'value') while data_norm's are
# (dataset_id, 'valuez'), so a mask carrying the raw labels matches nothing and the
# assignment silently does nothing. Relabel the mask with data_norm's columns first;
# both tables list the datasets in the same order, so this pairs each raw column
# with its normalized one.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan
# Raw and normalized scores side by side, sharing the (gene_id, orf) row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,509,5009,5010,5011,5012,5013,5014,5015,5016,5017,...,5160,5161,5162,5163,5164,5165,5166,5167,5168,5169
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,,,,,,,,,,,...,-1.050786,-1.516537,0.658254,-0.471059,0.004237,-2.487809,0.032597,1.232247,1.775887,1.244189
2,YAL002W,-1.084844,-0.770241,-0.714035,-0.307677,-0.60043,0.728671,-1.333645,-0.977858,-1.31404,-1.89311,...,,,,,,,,,,
3,YAL003W,,,,,,,,,,,...,-0.813571,0.149839,-1.381854,-1.209329,0.954479,-0.350222,2.461696,1.476978,-0.650938,0.800455
1863,YAL004W,-0.585719,-0.861604,-0.545551,0.031649,-0.128335,0.071324,-0.668568,-0.277145,-0.034361,-0.899422,...,,,,,,,,,,
4,YAL005C,-0.052817,-0.000286,-0.035713,-0.011217,-0.106191,0.035101,0.021304,-0.081318,0.012155,-0.016566,...,,,,,,,,,,


# Print out

In [142]:
# Write one tab-separated file per data type ('value' = raw, 'valuez' = normalized),
# with dataset names as column headers and ORFs as the row index.
# The loop uses its own variable names so that it does not overwrite `df`, the
# compound-to-dataset lookup table that an earlier cell still needs if it is re-run.
for data_type in ['value','valuez']:
    export_table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Columns follow the order of `datasets`, so names can be assigned positionally
    export_table.columns = datasets['name'].values
    export_table = export_table.droplevel('gene_id', axis=0)
    export_table.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [143]:
from IO.save_data_to_db3 import *

In [144]:
# Persist the combined value/valuez table to the database under this paper's PMID.
# Per the logged output below, the helper first deletes all existing datasets for the
# PMID, then inserts the new data and updates the data_modified_on field; the recorded
# run took about 20 minutes.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 18688276...


  0%|          | 0/162 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 162/162 [20:30<00:00,  7.59s/it]

Updating the data_modified_on field...



