In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to locate the dataset list file and passed to save_data_to_db
paper_pmid = 19043571
# Short label of the paper; used below as the prefix of the exported .txt file names
paper_name = 'yu_bellaoui_2008'

In [3]:
# Tab-separated, header-less list of this paper's datasets: one (dataset_id, name) pair per row
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by its ID so it can be reordered by ID later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [6]:
# Diploid sheet: one row per strain, with z-score columns for compounds 13 and 15
diploid_workbook = 'raw_data/13_15_data.xlsx'
diploid_sheet = '13&15 diploid (Figure 2)'
original_data = pd.read_excel(diploid_workbook, sheet_name=diploid_sheet)

In [7]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5968 x 11


In [8]:
# Preview the first rows of the diploid sheet
original_data.head()

Unnamed: 0,strain,Gene,z_result_nq:07_04_18_t01:cpmd13:0.98:ug/ml::::5_20:heho_05_06:YPD,z_result_nq:07_04_18_t02:cmpd15:1:ug/ml::::5_20:heho_05_06:YPD,Description,feature_qualifier,GO_process,GO_function,GO_component,essential_gene,zygosity
0,YBL074C::chr2_1,AAR2,0.75536,-0.222357,"Component of the U5 snRNP, required for splici...",Verified,assembly of spliceosomal tri-snRNP,molecular function unknown,snRNP U5,yes,het
1,YBR236C::chr2_4,ABD1,-0.331585,-0.787958,"Methyltransferase, catalyzes the transfer of a...",Verified,mRNA capping,mRNA (guanine-N7-)-methyltransferase activity,nucleus*,yes,het
2,YKL112W::chr11_2,ABF1,4.56314,-0.503318,DNA binding protein with possible chromatin-re...,Verified,positive regulation of transcription from RNA ...,DNA binding*,nucleus*,yes,het
3,YNR016C::chr14_4,ACC1,2.05882,2.09788,"Acetyl-CoA carboxylase, biotin containing enzy...",Verified,protein import into nucleus*,acetyl-CoA carboxylase activity*,mitochondrion*,yes,het
4,YKL192C::chr11_3,ACP1,0.507806,0.100593,"Mitochondrial matrix acyl carrier protein, inv...",Verified,fatty acid biosynthetic process,acyl carrier activity,mitochondrion,yes,het


In [9]:
# Strain IDs look like 'YBL074C::chr2_1'; the ORF is the part before the first ':'.
# Cast to str first (as is done for the haploid sheet) so that a blank or non-text
# cell becomes a string instead of raising AttributeError on .split
original_data['orf'] = original_data['strain'].astype(str).str.split(':').str[0]

In [10]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably loaded from yp_utils.py);
# the description above is the original author's
original_data['orf'] = clean_orf(original_data['orf'])

In [11]:
# Translate to ORFs
# NOTE(review): translate_sc is not defined in this notebook (presumably loaded from yp_utils.py);
# confirm there how unrecognized names are handled
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [12]:
# Sanity check: list every row whose translated name does not look like an ORF
# (an empty frame means everything translated fine)
is_valid_orf = looks_like_orf(original_data['orf'])
print(original_data.loc[~is_valid_orf, :])

Empty DataFrame
Columns: [strain, Gene, z_result_nq:07_04_18_t01:cpmd13:0.98:ug/ml::::5_20:heho_05_06:YPD, z_result_nq:07_04_18_t02:cmpd15:1:ug/ml::::5_20:heho_05_06:YPD, Description, feature_qualifier, GO_process, GO_function, GO_component, essential_gene, zygosity, orf]
Index: []


In [13]:
# Separate the homozygous ('hom') strains from the heterozygous ('het') ones
zygosity = original_data['zygosity']
original_data1 = original_data[zygosity == 'hom'].copy()
original_data2 = original_data[zygosity == 'het'].copy()

In [14]:
# Index both diploid tables by ORF
original_data1 = original_data1.set_index('orf')
original_data2 = original_data2.set_index('orf')

In [15]:
# Keep only the two score columns (compound 13 first, then compound 15)
data_cols = [
    'z_result_nq:07_04_18_t01:cpmd13:0.98:ug/ml::::5_20:heho_05_06:YPD',
    'z_result_nq:07_04_18_t02:cmpd15:1:ug/ml::::5_20:heho_05_06:YPD',
]
original_data1 = original_data1.loc[:, data_cols].copy()
original_data2 = original_data2.loc[:, data_cols].copy()

In [16]:
# Force the score columns to numeric; anything unparseable becomes NaN.
# Applied column-wise (the default axis): two calls per table instead of one
# Python-level call per row, with the same coerced values
original_data1 = original_data1.apply(pd.to_numeric, errors='coerce')
original_data2 = original_data2.apply(pd.to_numeric, errors='coerce')

In [17]:
# Collapse ORFs that appear in several rows into a single row holding their mean
original_data1 = original_data1.groupby(level=0).mean()
original_data2 = original_data2.groupby(level=0).mean()

In [18]:
# Sizes of the hom and het tables after averaging
for diploid_table in (original_data1, original_data2):
    print(diploid_table.shape)

(4716, 2)
(1142, 2)


# Load (haploid)

In [19]:
# Haploid compendium: one row per strain, one column per compound
haploid_workbook = 'raw_data/HAP compendium May 12 19 2006 including compounds 13 and 15-without badtags.xlsx'
original_data3 = pd.read_excel(haploid_workbook, sheet_name='Sheet1')
n_rows, n_cols = original_data3.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 3425 x 98


In [21]:
# Preview the first rows of the haploid compendium
original_data3.head()

Unnamed: 0,strain,15Cmpd,13Cmpd,Lichen 3exp2,Lichen 18exp2,Fungal 1822,Fungal 1868,Fungal 1840exp2,Fungal 1807,Fungal 1799,...,Cytochalasin A,CG4-Theopalauamide,Caspofungin,Camptothecin,Basiliskamide,192A4-Stichloroside,Agelasine E,Fluconazole,Geldanamycin,Mibefradil
0,YER014C-A,0.0,2.88563,0.896489,0.0,0.0,1.761845,2.328814,3.096375,0.589179,...,0.969175,0.0,1.172075,1.30135,1.294775,0.0,1.1574,0.0,0.89695,0.0
1,YDR484W,0.0,0.0,1.228097,0.924564,0.0,1.424993,1.057255,1.36146,0.656819,...,1.8792,0.891675,1.090725,0.0,0.563225,1.069125,0.0,1.17515,1.262675,0.0
2,YIR033W,0.0,2.793995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.8393,0.0,0.7337,0.0,0.758825,0.0,1.2472,0.0,0.0,0.0
3,YLR262C,0.0,0.0,1.279586,0.569186,0.0,1.454935,1.295387,1.526788,0.687485,...,0.509425,0.90565,1.175,0.0,0.97955,0.0,0.0,0.0,0.861975,0.0
4,YGL071W,1.176245,0.0,0.0,0.0,0.802201,0.0,0.0,1.063624,0.0,...,0.5173,0.6806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# In the haploid sheet the strain column already holds the ORF name; store it as text
original_data3['orf'] = original_data3['strain'].astype(str)

In [23]:
# Same clean-up as for the diploid data (described there as: eliminate all white spaces & capitalize)
original_data3['orf'] = clean_orf(original_data3['orf'])

In [24]:
# Translate to ORFs, same call as for the diploid data
original_data3['orf'] = translate_sc(original_data3['orf'], to='orf')

In [25]:
# Sanity check: list every haploid row whose name does not look like an ORF
is_valid_orf = looks_like_orf(original_data3['orf'])
print(original_data3.loc[~is_valid_orf, :])

Empty DataFrame
Columns: [strain, 15Cmpd, 13Cmpd, Lichen 3exp2, Lichen 18exp2, Fungal 1822, Fungal 1868, Fungal 1840exp2, Fungal 1807, Fungal 1799, Haloperidol , Dyclonine , Lichen 11, Lichen 5, Myriocin, Lichen 1, Concanamycin, Vulpinic acid, Valproic acid 2, Metavanadate, Propranolol 373803, Pap B old p 44K, Flufenamic acid, Fendiline, Ro peptide B, Sulfometuron methyl , MMS , Clotrimazole , Benomyl , Plumbagin , Hydroxyurea , Artemisinin , Amantadine hydrochloride , 4-Hydroxytamoxifen , Usnic acid , Sodium Azide , Nystatin , Neomycin sulfate , Caffeine , Menthol , Verrucarin , Valinomycin , Trifluoroperazine , Tamoxifen , Raloxifene , Pentamidine , Nigericin , LY-294,002 , Latrunculin B , Hydroxyethilhidrazine , Hydrogen peroxide , Hoechst , Harmine , Fenpropimorph , Emetine , Doxycycline , Cyclopiazonic acid , Clomiphene , Cisplatin , Chlorpromazine , Cerulenin , Calcium ionophore , Anisomycin , Amphotericin , Amiodarone , Alamethicin , Actinomycin , Abietic acid , Wortmannin , Sta

In [26]:
# Index by ORF and keep only the two compounds of interest (13 first, then 15) as floats
original_data3 = original_data3.set_index('orf')
compound_cols = ['13Cmpd', '15Cmpd']
original_data3 = original_data3[compound_cols].astype(float)

In [27]:
# Collapse ORFs that appear in several rows into a single row holding their mean
original_data3 = original_data3.groupby(level=0).mean()

In [28]:
# Size of the haploid table after averaging
print(original_data3.shape)

(3425, 2)


# Merge

In [34]:
# Place the hom, het and haploid tables side by side, aligned on ORF;
# ORFs absent from a table get NaN in that table's columns
original_data = pd.concat(
    [original_data1, original_data2, original_data3],
    axis='columns',
)

In [36]:
# The source reports fitness scores on the UNT/TRT scale, so every value is negated
original_data = original_data.mul(-1)

In [38]:
# Label the row index of the merged table
original_data = original_data.rename_axis('orf', axis=0)

# Prepare the final dataset

In [40]:
# Work on a copy so the merged table stays untouched by the steps below
data = original_data.copy()

In [41]:
# Dataset IDs, listed in the order of the columns of `data`
# (the column labels are assigned positionally from this list further down)
dataset_ids = [508, 4982, 5007, 5008, 511, 4983]
datasets = datasets.reindex(dataset_ids)

In [42]:
# Label every column as (dataset_id, 'value')
value_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_labels, names=['dataset_id','data_type'])

In [43]:
# Preview the relabelled table
data.head()

dataset_id,508,4982,5007,5008,511,4983
data_type,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
YAL002W,-0.841143,-0.74691,,,,
YAL004W,-1.35205,-0.824693,,,,
YAL005C,-1.02873,-0.114911,,,-0.0,-0.0
YAL007C,-0.990538,0.467629,,,-0.0,-0.0
YAL008W,-1.12459,0.213372,,,-0.0,-0.0


## Subset to the genes currently in SGD

In [44]:
# Look up the gene ID of every ORF in `data`; ORFs absent from the gene table come back as NaN
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [45]:
# Attach the gene IDs and keep only the ORFs that have one
data['gene_id'] = gene_ids
# .copy() makes the filtered table an independent frame, so the column assignment
# on the next line does not write into a slice of the unfiltered table
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,508,4982,5007,5008,511,4983
Unnamed: 0_level_1,data_type,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2,YAL002W,-0.841143,-0.74691,,,,
1863,YAL004W,-1.35205,-0.824693,,,,
4,YAL005C,-1.02873,-0.114911,,,-0.0,-0.0
5,YAL007C,-0.990538,0.467629,,,-0.0,-0.0
6,YAL008W,-1.12459,0.213372,,,-0.0,-0.0


# Normalize

In [46]:
# Compute the normalized scores for every dataset (labelled 'valuez' below).
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook; confirm there
# the exact normalization and the meaning of has_tested=True
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [47]:
# Label every normalized column as (dataset_id, 'valuez')
valuez_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_labels, names=['dataset_id','data_type'])

In [48]:
# Blank out the normalized score wherever the raw value is missing.
# Indexing with a boolean frame aligns on column labels, and `data` (..., 'value') and
# `data_norm` (..., 'valuez') share none, so the mask must carry data_norm's labels to
# have any effect. Both tables list the datasets in the same column order.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Raw and normalized values side by side, matched on the (gene_id, orf) row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,508,4982,5007,5008,511,4983,508,4982,5007,5008,511,4983
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2,YAL002W,-0.841143,-0.74691,,,,,-0.704143,-0.766043,,,,
1863,YAL004W,-1.35205,-0.824693,,,,,-1.021994,-0.824643,,,,
4,YAL005C,-1.02873,-0.114911,,,-0.0,-0.0,-0.820846,-0.289905,,,0.0,0.0
5,YAL007C,-0.990538,0.467629,,,-0.0,-0.0,-0.797086,0.148972,,,0.0,0.0
6,YAL008W,-1.12459,0.213372,,,-0.0,-0.0,-0.880484,-0.042581,,,0.0,0.0


# Print out

In [49]:
# Write one tab-separated file per data type, with datasets named in the header
# and ORFs as the only row label
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    output_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(output_path, sep='\t')

# Save to DB

In [50]:
from IO.save_data_to_db3 import *

In [51]:
# Store the combined value/valuez table in the database under this paper's PMID.
# NOTE: per the log printed by this call, all existing datasets for the PMID are deleted
# before the new data are inserted
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/6 [00:00<?, ?it/s]

Deleting all datasets for PMID 19043571...
Inserting the new data...


100%|██████████| 6/6 [00:45<00:00,  7.61s/it]

Updating the data_modified_on field...



