In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the publication processed by this notebook; it is used to build the
# datasets-list file name and is passed to the database upload.
paper_pmid = 14718172
# Short label of the publication; it prefixes the exported .txt files.
paper_name = 'lum_shoemaker_2004' 

In [3]:
# Tab-separated, header-less list of datasets for this PMID: one (dataset_id, name) pair per row.
datasets = pd.read_csv(
    f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset table by dataset_id so it can be reindexed by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [25]:
# Read the 'P values' sheet of the raw Excel workbook.
original_data = pd.read_excel(
    'raw_data/mmc2.xlsx',
    sheet_name='P values',
)

In [26]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 3503 x 80


In [27]:
# Preview the first rows of the raw table.
original_data.head()

Unnamed: 0,Systematic Name,Gene Symbol,Actinomycin D,Ara-CMP,Imipramine,Trifluoperazine,Caffeine,Doxorubicin,Flurbiprofen,Tunicamycin,...,Kanamycin,Clomipramine,Nalidixic acid,Valproic Acid,Phenylbutazone,Pramoxine,Fenpropimorph,Dyclonine,Lovastatin,Paromomycin
0,YGR105W,VMA21,-138.658,-5.30215,-3.27329,-2.33208,-1.62716,-1.62401,-1.52268,-1.0933,...,,,,,,,,,,
1,YOR114W,,-115.568,,,,,,-0.225293,,...,,,-1.00966,,,,,,,
2,YDR243C,PRP28,-95.2332,-15.9049,-2.96666,-2.23916,-0.519921,-2.5017,-1.39465,-0.76432,...,,-1.25009,,,,,,,,
3,YIR026C,YVH1,-86.2737,,,,,-8.39742,-0.770625,,...,,-0.01007,,-0.142065,,,,,,
4,YHR120W,MSH1,-82.8106,-7.0973,-4.40422,-1.94021,-1.3915,,-0.131297,,...,-0.374533,-1.48067,,-0.407923,,-0.823996,-0.223001,-0.064498,,


In [28]:
# Start the working 'orf' column from the systematic names, forced to strings.
systematic_names = original_data['Systematic Name']
original_data['orf'] = systematic_names.astype(str)

In [29]:
# Normalize the identifiers with the shared helper
# (per the original note: removes all white space and capitalizes).
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [30]:
# Convert whatever identifiers are present into ORF names via the shared translator.
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [31]:
# Sanity check: list the rows whose identifier does not look like an ORF.
# An empty frame means every name translated cleanly.
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t]
print(not_orf_rows)

Empty DataFrame
Columns: [Systematic Name, Gene Symbol, Actinomycin D, Ara-CMP, Imipramine, Trifluoperazine, Caffeine, Doxorubicin, Flurbiprofen, Tunicamycin, Ethanol, Erythromycin Ethylsuccinate, Sulfinpyrazone, Pentamidine, 5-FC, Cycloheximide, Promethazine, Histamine, Procaine, Tetracycline, Staurosporine, Hydroxyurea, Warfarin, 5-FU, Gliotoxin, Asulam, Carbendazim, Atenolol, Cymoxanil, Isoniazid, Nicardipine, Camptothecin, Sodium Chloride, Tamoxifen, Ibuprofen, Nitrofurantoin, Daunorubicin, FUDR, Sodium m-Arsenite, AZT, Cisplatin, Amphotericin B, Chlorpromazine, Desipramine, Molsidomine, Gemfibrozil, Clotrimazole, DMSO, Metoclopramide, MMS, Sulfometuron Methyl, Amitriptyline, Naproxen, Benoxinate, Troglitazone, Indomethacin, Efavirenz, Thalidomide, Sulfanilamide, Nevirapine, Terbinafine, Sodium Nitroprusside, Methotrexate, Doxycycline, Diltiazem, Haloperidol, Nifedipine, Omeprazole, Menadione, Sulfamethoxazole, Kanamycin, Clomipramine, Nalidixic acid, Valproic Acid, Phenylbutazone,

In [32]:
# Use the ORF as the row index.
original_data = original_data.set_index('orf')

In [33]:
# Only the per-condition score columns are needed from here on; drop the gene label columns.
label_columns = ['Gene Symbol', 'Systematic Name']
original_data = original_data.drop(columns=label_columns)

In [34]:
# Coerce every condition column to a numeric dtype; entries that cannot be parsed become NaN.
# The conversion is applied column-wise (the DataFrame.apply default): one call per column
# instead of one call per row, and each column keeps its own dtype.
original_data = original_data.apply(pd.to_numeric, errors='coerce')

In [35]:
# Collapse rows that share the same ORF by averaging them (the row index is the ORF).
original_data = original_data.groupby(level=0).mean()

In [36]:
# Table size after averaging rows that share an ORF.
original_data.shape

(3495, 78)

In [50]:
# In the source data, 'NaN' means either that no significant growth defect was observed
# or that no data is available for that strain in that condition.
# Given how many NaNs there are, most of them are assumed to mean "no significant growth
# defect observed", so they are recoded as 0.
original_data = original_data.fillna(0)

# Load dataset_ids

In [37]:
# Header-less, tab-separated lookup table: column 0 holds the dataset id, column 1 the condition name.
dt = pd.read_csv(
    'extras/dataset_ids.txt',
    sep='\t',
    header=None,
)

In [38]:
# Preview the lookup table as loaded.
dt.head()

Unnamed: 0,0,1
0,556,FUDR
1,4845,5-FU
2,4846,Actinomycin D
3,4847,Amitriptyline
4,4848,Amphotericin B


In [39]:
# Key the lookup table by column 1 so it can be aligned to the data's column labels.
dt = dt.set_index(1)

In [40]:
# Reorder the lookup table so its rows follow the column order of the data matrix.
dt = dt.reindex(index=original_data.columns.values)

# reindex() fills labels it cannot find with NaN instead of raising. Fail loudly here so a
# data column without a dataset id cannot silently propagate into the later steps.
unmatched = dt.index[dt[0].isnull()].tolist()
if unmatched:
    raise ValueError('No dataset_id found for data column(s): %s' % unmatched)

In [41]:
# Preview the lookup table after it has been aligned to the data columns.
dt.head()

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
Actinomycin D,4846
Ara-CMP,4849
Imipramine,4881
Trifluoperazine,4916
Caffeine,4854


In [42]:
# Dataset ids as a plain array, in the same order as the data columns.
dataset_ids = dt.loc[:, 0].values

# Prepare the final dataset

In [51]:
# Work on an independent copy so the processed source table stays untouched.
data = original_data.copy(deep=True)

In [52]:
# Put the dataset table in the same order as the data columns.
datasets = datasets.reindex(dataset_ids, axis='index')

In [53]:
# Label every data column with a (dataset_id, data_type) pair; these columns hold raw values.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id', 'data_type'])
data.columns = idx

In [54]:
# Preview the table with its new (dataset_id, data_type) column labels.
data.head()

dataset_id,4846,4849,4881,4916,4854,4867,4874,4918,4872,4871,...,4884,4859,4891,4919,4900,4901,4873,4869,4885,4898
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL001C,-0.526761,0.0,-0.107738,0.0,0.0,-0.692119,-1.5813,0.0,-0.137755,-0.106028,...,0.0,0.0,-0.1289,0.0,0.0,0.0,0.0,0.0,-0.034398,0.0
YAL002W,0.0,0.0,-0.47894,-0.202366,-1.00237,-0.145069,-0.080305,0.0,0.0,0.0,...,0.0,0.0,-0.772808,-0.317837,-0.334747,0.0,0.0,0.0,-5.2855,0.0
YAL003W,0.0,0.0,0.0,-0.112383,-0.130944,-0.232036,-1.09916,-0.837704,0.0,0.0,...,-0.171804,-0.152761,0.0,-0.245453,-0.208113,0.0,0.0,0.0,0.0,0.0
YAL004W,0.0,-2.30015,-0.871762,-0.193739,-0.934458,-1.16136,-0.059543,-0.486263,0.0,-1.78434,...,0.0,0.0,0.0,0.0,0.0,-1.1281,-1.4663,-0.696199,-0.230903,0.0
YAL005C,-0.608465,0.0,0.0,0.0,-1.57608,0.0,0.0,-0.175497,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.253,0.0


## Subset to the genes currently in SGD

In [55]:
# Load the gene table, re-key it by systematic name, and look up the id of every ORF in the data.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values

# ORFs absent from the gene table come back as NaN; count them.
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 8


In [56]:
# Attach the looked-up gene ids and keep only the ORFs that have one.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: the next line assigns into the result, and
# writing into a filtered slice risks SettingWithCopyWarning and ambiguous write targets.
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were float only because of the NaNs just removed; restore integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by (gene_id, orf) so only score columns remain as columns.
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,4846,4849,4881,4916,4854,4867,4874,4918,4872,4871,...,4884,4859,4891,4919,4900,4901,4873,4869,4885,4898
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,-0.526761,0.0,-0.107738,0.0,0.0,-0.692119,-1.5813,0.0,-0.137755,-0.106028,...,0.0,0.0,-0.1289,0.0,0.0,0.0,0.0,0.0,-0.034398,0.0
2,YAL002W,0.0,0.0,-0.47894,-0.202366,-1.00237,-0.145069,-0.080305,0.0,0.0,0.0,...,0.0,0.0,-0.772808,-0.317837,-0.334747,0.0,0.0,0.0,-5.2855,0.0
3,YAL003W,0.0,0.0,0.0,-0.112383,-0.130944,-0.232036,-1.09916,-0.837704,0.0,0.0,...,-0.171804,-0.152761,0.0,-0.245453,-0.208113,0.0,0.0,0.0,0.0,0.0
1863,YAL004W,0.0,-2.30015,-0.871762,-0.193739,-0.934458,-1.16136,-0.059543,-0.486263,0.0,-1.78434,...,0.0,0.0,0.0,0.0,0.0,-1.1281,-1.4663,-0.696199,-0.230903,0.0
4,YAL005C,-0.608465,0.0,0.0,0.0,-1.57608,0.0,0.0,-0.175497,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.253,0.0


# Normalize

In [57]:
# Normalize the raw scores with the shared helper. Its exact behavior and the meaning of
# has_tested=True are defined outside this notebook.
# NOTE(review): confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [58]:
# Label the normalized columns with (dataset_id, 'valuez') pairs, in dataset order.
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id', 'data_type'])
data_norm.columns = idx

In [59]:
# Propagate missing raw values into the normalized scores.
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'). A boolean
# DataFrame mask is aligned by label, so the mask taken straight from `data` would match no
# column of `data_norm` and the assignment would silently do nothing. The two frames list
# the datasets in the same column order, so relabel the mask positionally first.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw and normalized columns side by side, matched on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,4846,4849,4881,4916,4854,4867,4874,4918,4872,4871,...,4884,4859,4891,4919,4900,4901,4873,4869,4885,4898
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,-0.526761,0.0,-0.107738,0.0,0.0,-0.692119,-1.5813,0.0,-0.137755,-0.106028,...,0.0,0.0,-0.155262,0.0,0.0,0.0,0.0,0.0,-0.146361,0.0
2,YAL002W,0.0,0.0,-0.47894,-0.202366,-1.00237,-0.145069,-0.080305,0.0,0.0,0.0,...,0.0,0.0,-0.93086,-0.925707,-0.878512,0.0,0.0,0.0,-22.489154,0.0
3,YAL003W,0.0,0.0,0.0,-0.112383,-0.130944,-0.232036,-1.09916,-0.837704,0.0,0.0,...,-0.53793,-0.126842,0.0,-0.714887,-0.546173,0.0,0.0,0.0,0.0,0.0
1863,YAL004W,0.0,-2.30015,-0.871762,-0.193739,-0.934458,-1.16136,-0.059543,-0.486263,0.0,-1.78434,...,0.0,0.0,0.0,0.0,0.0,-2.97039,-2.934221,-1.989875,-0.982464,0.0
4,YAL005C,-0.608465,0.0,0.0,0.0,-1.57608,0.0,0.0,-0.175497,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.076484,0.0


# Print out

In [60]:
# Write one tab-separated file per data type. Columns are relabeled with the dataset
# names and the gene_id index level is dropped before writing.
for data_type in ('value', 'valuez'):
    out_path = f'{paper_name}_{data_type}.txt'
    table = data_all.xs(data_type, axis=1, level='data_type').copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [61]:
from IO.save_data_to_db3 import *

In [62]:
# Upload the combined raw + normalized table for this PMID. Per the logged output of this
# cell, the helper deletes all existing datasets for the PMID before inserting the new data.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 14718172...


  0%|          | 0/78 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 78/78 [06:16<00:00,  4.83s/it]

Updating the data_modified_on field...



