In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Publication identifiers: the PubMed ID selects the datasets list file and keys the DB save;
# the short name is the prefix of the exported text files.
paper_pmid = 16901791
paper_name = 'parsons_boone_2006'

In [3]:
# Dataset ids and names for this paper (tab-separated file without a header row)
datasets_path = 'extras/YeastPhenome_%s_datasets_list.txt' % paper_pmid
datasets = pd.read_csv(datasets_path, sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
# Key the datasets table by dataset id so it can later be reordered by id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Load the 'Supplementary Table 7' sheet, skipping the first two rows of the sheet
original_data = pd.read_excel(
    'raw_data/mmc8.xlsx',
    sheet_name='Supplementary Table 7',
    skiprows=2,
)

In [9]:
# Report the size of the table as loaded (rows x columns)
print('Original data dimensions: %d x %d' % (original_data.shape))

Original data dimensions: 4111 x 83


In [10]:
# Preview the first rows of the table as loaded
original_data.head()

Unnamed: 0,ORF,Sulfometuron methyl,MMS,Clotrimazole,Benomyl,Plumbagin,Hydroxyurea,Artemisinin,Amantadine hydrochloride,4-Hydroxytamoxifen,...,Cytochalasin A,CG4-Theopalauamide,Caspofungin,Camptothecin,Basiliskamide,192A4-Stichloroside,Papuamide B,Agelasine E,Fluconazole,Geldanamycin
0,YOL023W,0.0334,0.561025,-0.209075,0.64975,0.26125,0.252775,0.5236,-0.04285,0.1566,...,-0.02595,0.226875,-0.25875,-0.4545,0.3404,-0.79425,-0.0792,0.193275,0.250175,-0.301775
1,YGR068C,0.0886,-0.250225,-0.049075,-0.1688,-0.3209,0.16505,0.104325,-0.49075,0.0036,...,-0.16555,0.102475,-0.491225,0.15085,-0.165825,-0.43495,-0.382275,0.01865,0.1371,-0.13895
2,YOR324C,-0.031225,0.382675,-0.1845,0.023675,0.11255,-0.129925,0.2686,-0.146675,-0.1821,...,-0.213175,-0.643625,-0.037625,0.1427,0.403825,-0.831175,-0.139625,0.08035,-0.330575,-0.099125
3,YEL038W,0.234975,-0.190575,-0.206775,0.437125,-0.010175,0.00145,-0.02055,0.379,-0.045725,...,-0.19855,0.3535,0.092125,0.024125,0.331275,-0.223125,-0.2153,-0.16785,0.005375,0.166925
4,YBR053C,0.20735,-0.3209,-0.126925,0.014375,0.01585,-0.276225,-0.307075,-0.230925,0.1408,...,0.259825,0.020025,-0.3327,0.086075,0.3147,-0.2733,-0.0665,0.068525,0.168425,-0.126825


In [11]:
# Copy the identifiers into a new 'orf' column as strings; the source 'ORF' column is left as loaded
original_data['orf'] = original_data['ORF'].astype(str)

In [12]:
# Eliminate all white spaces & capitalize
# (clean_orf is not defined in this notebook; presumably provided by ../yp_utils.py — confirm)
original_data['orf'] = clean_orf(original_data['orf'])

In [13]:
# Translate to ORFs
# (translate_sc is an external helper; assumed to map other gene identifiers to systematic ORF names — TODO confirm)
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [14]:
# Sanity check: print every row whose identifier is not recognised as an ORF.
# An empty frame means everything translated fine.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t])

Empty DataFrame
Columns: [ORF, Sulfometuron methyl , MMS , Clotrimazole , Benomyl , Plumbagin , Hydroxyurea , Artemisinin , Amantadine hydrochloride , 4-Hydroxytamoxifen , Usnic acid , Sodium Azide , Nystatin , Neomycin sulfate , Caffeine , Menthol , Verrucarin , Valinomycin , Trifluoroperazine , Tamoxifen , Raloxifene , Pentamidine , Nigericin , LY-294,002 , Latrunculin B , Hydroxyethilhidrazine , Hydrogen peroxide , Hoechst , Harmine , Haloperidol , Fenpropimorph , Emetine , Dyclonine , Doxycycline , Cyclopiazonic acid , Clomiphene , Cisplatin , Chlorpromazine , Cerulenin , Calcium ionophore , Anisomycin , Amphotericin , Amiodarone , Alamethicin , Actinomycin , Abietic acid , Wortmannin , Staurosporine , Conine , Parthenolide , Radicicol , Mitomycin C , Trichostatin A , FK506 , Brefeldin A , U73122 , Tunicamycin , Thialysine , Rapamycin , Phenylarsine oxide , Phenantroline , Oligomycin , Nocodazole , Hygromycin B , Extract 95-57 , Extract 6592 , Extract 00-89 , Extract 00-303C , Extr

In [15]:
# Use the cleaned ORF as the row index
original_data = original_data.set_index('orf')

In [16]:
# The raw identifier column is no longer needed once 'orf' is the index
original_data = original_data.drop(columns=['ORF'])

In [17]:
# Coerce every column to numeric (entries that cannot be parsed become NaN), then flip the sign.
# The conversion runs column by column: it gives the same values as the row-wise form
# but needs one to_numeric call per column instead of one per row.
# NOTE(review): the sign flip presumably reorients the scores to the project's convention — confirm against the paper.
original_data = -original_data.apply(pd.to_numeric, errors='coerce')

In [18]:
# Collapse rows that share the same ORF into one row holding their column-wise mean
original_data = original_data.groupby(level=0).mean()

In [19]:
# Size of the table after averaging rows with the same ORF
original_data.shape

(4110, 82)

# Load dataset ids

In [23]:
# Tab-separated mapping file without a header row; per the preview in the next cell,
# column 0 holds the condition name and column 1 the dataset id
mp = pd.read_csv('extras/phenotype_dataset.txt', sep='\t', header=None)

In [24]:
# Preview the first rows of the name -> dataset id mapping
mp.head()

Unnamed: 0,0,1
0,192A4-Stichloroside,676
1,4-Hydroxytamoxifen,613
2,Abietic acid,604
3,Actinomycin,614
4,Agelasine E,615


In [25]:
# Key the mapping by condition name (column 0) so it can be looked up by data column name
mp = mp.set_index(0)

In [26]:
# Put the mapping in the same order as the data columns
mp = mp.reindex(original_data.columns.values)

# reindex() silently yields NaN for any column name that is absent from the mapping file
# (names must match exactly, including whitespace). Fail here instead of carrying NaN
# dataset ids into the column index and the database.
unmapped = mp.index[mp[1].isnull()].tolist()
if unmapped:
    raise ValueError('Data columns without a dataset id: %s' % unmapped)

In [28]:
# Dataset ids (column 1 of the mapping), in the order of the data columns
dataset_ids = mp[1].values

# Prepare the final dataset

In [29]:
# Work on a copy so that original_data stays available as processed above
data = original_data.copy()

In [30]:
# Reorder the datasets table by dataset_ids, i.e. to follow the order of the data columns
datasets = datasets.reindex(index=dataset_ids)

In [31]:
# Label every data column as (dataset_id, 'value')
value_labels = ['value'] * datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, value_labels],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [32]:
# Preview the data with its (dataset_id, data_type) column labels
data.head()

dataset_id,677,659,632,623,670,655,621,617,613,608,...,634,679,627,610,622,676,666,615,649,650
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL005C,0.1564,-0.4518,0.3801,-1.01585,0.28765,-0.2637,-0.383,-0.0321,0.15145,-0.023,...,0.3088,-0.1133,-0.0719,0.024,0.1602,0.45695,-0.2764,-0.3569,0.0957,0.16845
YAL007C,0.0367,0.10465,0.2298,0.09095,-0.00615,-0.0884,-0.3905,-0.2489,0.15705,0.0109,...,-0.1925,0.3001,-0.42645,-0.2363,0.04315,-0.6407,0.25805,0.05405,-0.73465,0.133
YAL008W,-0.0371,-0.28735,-0.09735,-0.2175,0.08915,-0.0202,0.00185,-0.39615,0.1693,0.15375,...,-0.25515,0.30105,-0.21525,-0.0306,-0.7516,-0.0101,0.28185,0.1306,-0.33465,0.1864
YAL010C,0.4116,1.02625,0.2787,-1.1072,0.4507,0.2544,-0.5184,-2.0687,-0.5205,0.6882,...,-0.13005,-1.3765,0.6097,0.66415,-0.4363,-1.4434,-1.0806,-0.21725,0.40295,0.5399
YAL012W,-0.1209,0.164675,0.04165,0.2185,-0.0466,0.01295,-0.405075,0.64905,0.311125,-0.00325,...,-1.112025,-0.78875,-0.0044,0.342,0.068675,-0.799625,0.755075,-0.14715,0.1046,-0.382225


## Subset to the genes currently in SGD

In [33]:
# Gene table re-keyed by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the id of every ORF in the data; ORFs absent from the gene table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 14


In [34]:
# Attach the gene ids, drop the ORFs that have none, and index the rows by (gene_id, orf)
data['gene_id'] = gene_ids
# Explicit copy: the filtered frame is written to on the next line, and assigning into a
# bare .loc selection can raise SettingWithCopyWarning
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,677,659,632,623,670,655,621,617,613,608,...,634,679,627,610,622,676,666,615,649,650
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
4,YAL005C,0.1564,-0.4518,0.3801,-1.01585,0.28765,-0.2637,-0.383,-0.0321,0.15145,-0.023,...,0.3088,-0.1133,-0.0719,0.024,0.1602,0.45695,-0.2764,-0.3569,0.0957,0.16845
5,YAL007C,0.0367,0.10465,0.2298,0.09095,-0.00615,-0.0884,-0.3905,-0.2489,0.15705,0.0109,...,-0.1925,0.3001,-0.42645,-0.2363,0.04315,-0.6407,0.25805,0.05405,-0.73465,0.133
6,YAL008W,-0.0371,-0.28735,-0.09735,-0.2175,0.08915,-0.0202,0.00185,-0.39615,0.1693,0.15375,...,-0.25515,0.30105,-0.21525,-0.0306,-0.7516,-0.0101,0.28185,0.1306,-0.33465,0.1864
8,YAL010C,0.4116,1.02625,0.2787,-1.1072,0.4507,0.2544,-0.5184,-2.0687,-0.5205,0.6882,...,-0.13005,-1.3765,0.6097,0.66415,-0.4363,-1.4434,-1.0806,-0.21725,0.40295,0.5399
10,YAL012W,-0.1209,0.164675,0.04165,0.2185,-0.0466,0.01295,-0.405075,0.64905,0.311125,-0.00325,...,-1.112025,-0.78875,-0.0044,0.342,0.068675,-0.799625,0.755075,-0.14715,0.1046,-0.382225


# Normalize

In [35]:
# Compute the normalized scores with the external normalize_phenotypic_scores helper
# NOTE(review): the helper and the meaning of has_tested are not visible in this notebook — confirm in yp_utils
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [36]:
# Assign proper column names: every normalized column becomes (dataset_id, 'valuez')
valuez_labels = ['valuez'] * datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, valuez_labels],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [37]:
# Blank out every normalized score whose raw value is missing.
# A boolean DataFrame used as a key is aligned on labels. The raw columns are labelled
# (dataset_id, 'value') and the normalized ones (dataset_id, 'valuez'), so the raw mask
# matches no column of data_norm and assigning through it changes nothing.
# Relabel the mask with data_norm's columns first (assumes both frames list the datasets
# in the same order, as the column renaming of data_norm already does).
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Raw and normalized values side by side, joined on the row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,677,659,632,623,670,655,621,617,613,608,...,634,679,627,610,622,676,666,615,649,650
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
4,YAL005C,0.1564,-0.4518,0.3801,-1.01585,0.28765,-0.2637,-0.383,-0.0321,0.15145,-0.023,...,0.866856,-0.335959,-0.149372,-0.000288,0.425894,0.700529,-0.551552,-1.355191,0.154105,0.243814
5,YAL007C,0.0367,0.10465,0.2298,0.09095,-0.00615,-0.0884,-0.3905,-0.2489,0.15705,0.0109,...,-0.571408,0.411684,-0.671606,-0.54627,0.135151,-0.959085,0.237283,0.185113,-1.562215,0.172548
6,YAL008W,-0.0371,-0.28735,-0.09735,-0.2175,0.08915,-0.0202,0.00185,-0.39615,0.1693,0.15375,...,-0.751155,0.413402,-0.360519,-0.114812,-1.838948,-0.005636,0.272411,0.472035,-0.735422,0.279898
8,YAL010C,0.4116,1.02625,0.2787,-1.1072,0.4507,0.2544,-0.5184,-2.0687,-0.5205,0.6882,...,-0.392235,-2.620486,0.854591,1.342435,-1.055767,-2.172743,-1.73853,-0.831761,0.789186,0.99054
10,YAL012W,-0.1209,0.164675,0.04165,0.2185,-0.0466,0.01295,-0.405075,0.64905,0.311125,-0.00325,...,-3.209588,-1.557526,-0.049948,0.666721,0.198553,-1.199374,0.970879,-0.569015,0.172501,-0.863209


# Print out

In [38]:
# Write one tab-separated file per data type, with dataset names as column headers
# and the ORF as the only row label
for data_type in ['value', 'valuez']:
    out = data_all.xs(data_type, level='data_type', axis=1).copy()
    out.columns = datasets['name'].values
    out = out.droplevel('gene_id', axis=0)
    out.to_csv('%s_%s.txt' % (paper_name, data_type), sep='\t')

# Save to DB

In [39]:
from IO.save_data_to_db3 import *

In [40]:
# Store the raw and normalized values in the database under this paper's PMID.
# Per the log printed by this cell, existing datasets for the PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 16901791...


  0%|          | 0/82 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 82/82 [08:03<00:00,  5.90s/it]

Updating the data_modified_on field...



