In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper processed by this notebook: a short name used for
# output file names and the PubMed ID used for input file names and the DB save.
paper_name = 'begley_samson_2002'
paper_pmid = 12496357

In [3]:
# Tab-separated, header-less list of this paper's datasets (id, name).
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by dataset ID so rows can be looked up / reordered by ID later.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [72]:
# Raw spreadsheet with the screen results (first sheet only).
raw_data_path = 'raw_data/ORIG130404_Begley2001raw.xlsx'
original_data = pd.read_excel(raw_data_path, sheet_name='Sheet1')

In [73]:
# Report the size of the raw table as "rows x columns".
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 9647 x 238


In [74]:
# Preview the first rows of the raw sheet; the top rows hold legend text, not measurements.
original_data.head()

Unnamed: 0,position on plate,plate number (Research Genetics designations),Gene Name,ORF,Treatment,exp 1 highest sensitive,exp 2 highest sensitive,exp 3 highest sensitive,exp 1 lowest sensitive,exp 2 lowest sensitive,...,Empty,Unnamed: 229,exp1 sensitive,exp2 sensitive,exp3 sensitive,total sensitive,exp1 resistant,exp2 resistant,exp3 resistant,total resistant
0,,,,,,,,,,,...,,,,,,,,,,
1,Data in red indicates that there,,,,,O = no phenotype,,,,,...,"A ""1"" here indicates that this is",,,,,,,,,
2,was no ORF or control at that postion,,,,,1 = sensitive at that dose,,,,,...,"an empty well, with no yeast spotted",,,,,,,,,
3,"on the plate, or that what was",,,,,2 = failed quality control at that dose,,,,,...,,,,,,,,,,
4,spotted in that position did not,,,,,"3, 5 = manually failed quality control at that...",,,,,...,,,,,,,,,,


In [75]:
# Working copy of the 'ORF' column as strings (missing entries become the text 'nan'),
# kept separate so the original column stays untouched.
original_data = original_data.assign(orf=original_data['ORF'].astype(str))

In [76]:
# Eliminate all white spaces & capitalize.
# NOTE(review): clean_orf is not defined in this notebook (presumably provided by
# yp_utils.py via %run); the description above is the original author's — confirm.
original_data['orf'] = clean_orf(original_data['orf'])

In [77]:
# A few ORF names in the sheet contain the letter "O" where the digit "0" belongs;
# map each known misspelling to its corrected name and leave all other values as-is.
typo_fixes = {
    'YKLO72W': 'YKL072W',
    'YOLO57W': 'YOL057W',
    'YOLO62C': 'YOL062C',
}
original_data['orf'] = original_data['orf'].map(lambda name: typo_fixes.get(name, name))

In [78]:
# Translate to ORFs.
# NOTE(review): translate_sc is defined outside this notebook; presumably it maps
# gene names to systematic ORF names when to='orf' — confirm against yp_utils.py.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [79]:
# Sanity check of the translation: flag which entries look like ORF names
# (mask `t`, reused by the next cell) and print the rows that do not.
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t, :]
print(not_orf_rows)

                                 position on plate  \
index_input                                          
0                                              NaN   
1                Data in red indicates that there    
2            was no ORF or control at that postion   
3                  on the plate, or that what was    
4                 spotted in that position did not   
...                                            ...   
9642                                            8H   
9643                                            9H   
9644                                           10H   
9645                                           11H   
9646                                           12H   

            plate number (Research Genetics  designations) Gene Name  ORF  \
index_input                                                                 
0                                                      NaN       NaN  NaN   
1                                                      NaN       N

In [80]:
# Keep only the rows whose 'orf' value passed the looks_like_orf check above.
original_data = original_data[t]

In [81]:
# List the distinct treatment labels present in the remaining rows.
original_data['Treatment'].unique()

array(['MMS data', 't-BuOOH data', '4NQO data', 'UV data'], dtype=object)

In [82]:
# Measurement columns are those whose header contains 'control/dose'
# (some headers carry stray whitespace/newlines, hence substring matching).
data_cols = list(filter(lambda header: 'control/dose' in header, original_data.columns))
data_cols

['exp 1 highest control/dose 1',
 'exp 1 highest control/dose 2',
 'exp 1 highest control/dose 3',
 'exp 1 highest control/dose 4',
 'exp 1 lowest control/dose 1',
 'exp 1 lowest control/dose 2',
 'exp 1 lowest control/dose 3',
 'exp 1 lowest control/dose 4',
 'exp 2 highest control/dose 1',
 'exp 2 highest control/dose 2',
 'exp 2 highest control/dose 3',
 'exp 2 highest control/dose 4',
 'exp 2 lowest control/dose 1',
 'exp 2 lowest control/dose 2',
 'exp 2 \nlowest control/dose 3',
 'exp 2 lowest control/dose 4',
 'exp 3 highest control/dose 1',
 'exp 3 highest control/dose 2',
 'exp 3 highest control/dose 3',
 'exp 3 highest control/dose 4',
 'exp 3 lowest control/dose 1',
 'exp 3 lowest control/dose 2',
 'exp 3  lowest control/dose 3',
 'exp 3 lowest control/dose 4']

In [83]:
# From here on rows are identified by their ORF name.
original_data = original_data.set_index('orf')

In [84]:
# Drop everything except the treatment label and the measurement columns;
# copy so later column assignments act on an independent frame.
original_data = original_data.loc[:, ['Treatment', *data_cols]].copy()

In [85]:
# Coerce every measurement column to a numeric dtype; entries that cannot be
# parsed become NaN (errors='coerce'). pd.to_numeric is applied column-wise
# (the DataFrame.apply default), which needs one call per column rather than
# one call per row and yields the same values.
original_data[data_cols] = original_data[data_cols].apply(pd.to_numeric, errors='coerce')

In [86]:
# Replace each measurement with its reciprocal (1 / value).
# NOTE(review): the columns are named 'control/dose', so the reciprocal is
# presumably dose/control — confirm against the paper.
original_data[data_cols] = original_data[data_cols].rdiv(1)

In [87]:
# Preview the numeric, reciprocal-transformed measurements.
original_data.head()

Unnamed: 0_level_0,Treatment,exp 1 highest control/dose 1,exp 1 highest control/dose 2,exp 1 highest control/dose 3,exp 1 highest control/dose 4,exp 1 lowest control/dose 1,exp 1 lowest control/dose 2,exp 1 lowest control/dose 3,exp 1 lowest control/dose 4,exp 2 highest control/dose 1,...,exp 2 \nlowest control/dose 3,exp 2 lowest control/dose 4,exp 3 highest control/dose 1,exp 3 highest control/dose 2,exp 3 highest control/dose 3,exp 3 highest control/dose 4,exp 3 lowest control/dose 1,exp 3 lowest control/dose 2,exp 3 lowest control/dose 3,exp 3 lowest control/dose 4
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YLL001W,MMS data,0.96223,0.765388,0.461301,0.501954,0.939675,0.847121,0.534617,0.47614,0.896331,...,0.499864,0.538637,0.893749,0.988535,0.54185,0.689292,0.918893,1.04051,0.491908,0.752138
YLL002W,MMS data,0.258894,0.363781,0.426277,0.217547,0.252826,0.402628,0.494025,0.206359,0.092899,...,0.036274,0.338241,0.114434,0.105153,0.274697,0.367659,0.117653,0.110681,0.249378,0.40118
YLL005C,MMS data,0.951988,0.864422,0.868063,0.586865,0.929673,0.95673,1.006025,0.556684,0.905423,...,0.60562,1.472785,0.965942,1.070133,1.047486,1.461188,0.993117,1.126398,0.95094,1.594411
YLL006W,MMS data,1.415584,1.431507,1.655011,1.391692,1.382402,1.584372,1.918044,1.320121,0.924438,...,0.695063,1.544927,1.159492,1.155337,1.592441,1.763183,1.192112,1.216082,1.445667,1.92394
YLL009C,MMS data,1.238819,0.858577,0.646676,0.631304,1.209781,0.950262,0.749453,0.598838,0.93602,...,0.650659,1.067021,1.076536,1.17288,1.013236,1.198472,1.106822,1.234548,0.919846,1.307742


In [88]:
# For each of the four doses, average all columns whose header contains
# 'dose <n>' (every experiment and both highest/lowest variants) into a
# single 'dose<n>' column.
for dose in range(1, 5):
    dose_label = 'dose ' + str(dose)
    matching_cols = [header for header in original_data.columns if dose_label in header]
    original_data['dose' + str(dose)] = original_data[matching_cols].mean(axis=1)

In [89]:
# Distinct treatment labels, in order of first appearance.
treatments = pd.unique(original_data['Treatment'])

In [90]:
# Build one table per treatment: rows are ORFs (replicate rows for the same ORF
# are averaged), columns are the four per-dose means prefixed with the treatment
# label, e.g. 'MMS data_dose1'.
# The loop variable is deliberately NOT called `t`: that name holds the
# looks_like_orf mask from an earlier cell, and overwriting it would make that
# cell's row filter impossible to re-run correctly.
dose_cols = ['dose1','dose2','dose3','dose4']
original_data_list = []
for treatment in treatments:
    # Boolean .loc selection already returns a new frame, so no explicit copy is needed.
    per_treatment = original_data.loc[original_data['Treatment'] == treatment, dose_cols]
    per_treatment = per_treatment.groupby(per_treatment.index).mean()
    per_treatment.columns = [treatment + '_' + c for c in per_treatment.columns]

    original_data_list.append(per_treatment)

In [91]:
# Place the per-treatment tables side by side, aligned on ORF.
original_data = pd.concat(original_data_list, axis='columns')

In [92]:
# Size of the combined table: (number of ORFs, treatments x doses).
original_data.shape

(1637, 16)

In [93]:
# Preview the combined ORF x (treatment, dose) table.
original_data.head()

Unnamed: 0_level_0,MMS data_dose1,MMS data_dose2,MMS data_dose3,MMS data_dose4,t-BuOOH data_dose1,t-BuOOH data_dose2,t-BuOOH data_dose3,t-BuOOH data_dose4,4NQO data_dose1,4NQO data_dose2,4NQO data_dose3,4NQO data_dose4,UV data_dose1,UV data_dose2,UV data_dose3,UV data_dose4
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
YAL002W,0.838585,0.318535,0.19574,0.531143,0.989822,0.929084,0.933097,0.399308,0.895401,0.874471,0.893056,0.78213,0.879667,0.911013,0.928064,0.865901
YAL004W,0.77697,0.509674,0.431777,0.991911,0.983077,0.930386,1.043149,0.904319,0.926982,0.904787,1.051204,1.012846,0.892266,0.903389,0.908228,0.85774
YAL005C,0.795909,0.406662,0.371982,0.86205,0.991444,0.911259,1.102001,0.974523,0.92114,0.95153,1.05504,1.032956,0.883057,0.899703,0.912981,0.841261
YAL007C,0.795636,0.65367,0.586989,1.019509,1.008561,0.989197,1.094017,0.991351,0.946796,0.916957,1.057571,1.019111,0.930758,0.947015,0.943878,0.890667
YAL008W,0.856244,0.696004,0.58283,0.939777,1.028011,0.986614,1.118636,0.982803,0.966736,0.932029,1.026638,0.9829,0.934417,0.943022,0.959441,0.904148


In [94]:
# Tab-separated, header-less file listing dataset IDs for the data columns
# (columns are therefore labelled 0, 1, ...).
phenotype_ids_path = 'extras/phenotype_datasetids.txt'
dt = pd.read_csv(phenotype_ids_path, sep='\t', header=None)

In [96]:
# The dataset IDs are in the column labelled 1 of the file loaded above.
# NOTE(review): their order is later applied positionally to the data columns — confirm it matches.
dataset_ids = dt.loc[:, 1].values

In [97]:
# Re-check the table size before building the final dataset.
original_data.shape

(1637, 16)

# Prepare the final dataset

In [98]:
# Work on an independent copy so the processed source table stays intact.
data = original_data.copy(deep=True)

In [99]:
# Reorder the dataset list to follow dataset_ids; IDs absent from the list yield NaN rows.
datasets = datasets.reindex(dataset_ids, axis='index')

In [100]:
# Relabel the data columns, by position, with (dataset_id, 'value') pairs.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [101]:
# Preview the table with its (dataset_id, data_type) column labels.
data.head()

dataset_id,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48
data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
YAL002W,0.838585,0.318535,0.19574,0.531143,0.989822,0.929084,0.933097,0.399308,0.895401,0.874471,0.893056,0.78213,0.879667,0.911013,0.928064,0.865901
YAL004W,0.77697,0.509674,0.431777,0.991911,0.983077,0.930386,1.043149,0.904319,0.926982,0.904787,1.051204,1.012846,0.892266,0.903389,0.908228,0.85774
YAL005C,0.795909,0.406662,0.371982,0.86205,0.991444,0.911259,1.102001,0.974523,0.92114,0.95153,1.05504,1.032956,0.883057,0.899703,0.912981,0.841261
YAL007C,0.795636,0.65367,0.586989,1.019509,1.008561,0.989197,1.094017,0.991351,0.946796,0.916957,1.057571,1.019111,0.930758,0.947015,0.943878,0.890667
YAL008W,0.856244,0.696004,0.58283,0.939777,1.028011,0.986614,1.118636,0.982803,0.966736,0.932029,1.026638,0.9829,0.934417,0.943022,0.959441,0.904148


## Subset to the genes currently in SGD

In [102]:
# Load the gene table, key it by systematic name (keeping 'id' as a regular
# column), and look up the gene id of every ORF in the data. ORFs without a
# match come back as NaN and are counted.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 9


In [103]:
# Attach the gene ids, drop ORFs that had no match (NaN id), and index the
# table by (gene_id, orf).
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: assigning a column into the
# result of a boolean .loc selection otherwise raises SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were float only because of the NaNs that have just been removed.
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
2,YAL002W,0.838585,0.318535,0.19574,0.531143,0.989822,0.929084,0.933097,0.399308,0.895401,0.874471,0.893056,0.78213,0.879667,0.911013,0.928064,0.865901
1863,YAL004W,0.77697,0.509674,0.431777,0.991911,0.983077,0.930386,1.043149,0.904319,0.926982,0.904787,1.051204,1.012846,0.892266,0.903389,0.908228,0.85774
4,YAL005C,0.795909,0.406662,0.371982,0.86205,0.991444,0.911259,1.102001,0.974523,0.92114,0.95153,1.05504,1.032956,0.883057,0.899703,0.912981,0.841261
5,YAL007C,0.795636,0.65367,0.586989,1.019509,1.008561,0.989197,1.094017,0.991351,0.946796,0.916957,1.057571,1.019111,0.930758,0.947015,0.943878,0.890667
6,YAL008W,0.856244,0.696004,0.58283,0.939777,1.028011,0.986614,1.118636,0.982803,0.966736,0.932029,1.026638,0.9829,0.934417,0.943022,0.959441,0.904148


# Normalize

In [104]:
# Normalize the values with a helper defined outside this notebook.
# NOTE(review): the exact normalization and the meaning of has_tested=False are
# not visible here — confirm against normalize_phenotypic_scores.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [105]:
# Assign proper column names: relabel the normalized columns, by position,
# with (dataset_id, 'valuez') pairs.
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [106]:
# Blank out normalized values wherever the raw value is missing.
# A boolean DataFrame mask is aligned on labels, and `data` is labelled
# (dataset_id, 'value') while `data_norm` is labelled (dataset_id, 'valuez'),
# so using data.isnull() directly matches no column and changes nothing.
# Relabel the mask with data_norm's columns (same positional order, as both
# were labelled from datasets.index) so that it actually applies.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Raw and normalized values side by side, joined on the (gene_id, orf) index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,33,34,35,36,37,38,39,40,41,42,...,39,40,41,42,43,44,45,46,47,48
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,0.838585,0.318535,0.19574,0.531143,0.989822,0.929084,0.933097,0.399308,0.895401,0.874471,...,1.43572,0.68039,1.32118,1.332061,0.462264,1.066172,1.444735,1.506289,1.574373,1.487055
1863,YAL004W,0.77697,0.509674,0.431777,0.991911,0.983077,0.930386,1.043149,0.904319,0.926982,0.904787,...,1.611861,1.468991,1.36687,1.373678,0.589243,1.353928,1.464359,1.493406,1.539,1.473453
4,YAL005C,0.795909,0.406662,0.371982,0.86205,0.991444,0.911259,1.102001,0.974523,0.92114,0.95153,...,1.706056,1.578618,1.358419,1.437843,0.592324,1.379008,1.450015,1.487176,1.547475,1.445986
5,YAL007C,0.795636,0.65367,0.586989,1.019509,1.008561,0.989197,1.094017,0.991351,0.946796,0.916957,...,1.693277,1.604895,1.395538,1.390383,0.594356,1.361741,1.524313,1.567127,1.602574,1.528334
6,YAL008W,0.856244,0.696004,0.58283,0.939777,1.028011,0.986614,1.118636,0.982803,0.966736,0.932029,...,1.73268,1.591547,1.424388,1.411073,0.569519,1.316578,1.530012,1.560379,1.630328,1.550804


# Print out

In [107]:
# Write one tab-separated file per data type ('<paper_name>_value.txt' and
# '<paper_name>_valuez.txt'), with dataset names as column headers and ORF
# names as the only row label.
for data_type in ('value', 'valuez'):
    out_table = data_all.xs(data_type, level='data_type', axis=1).copy()
    out_table.columns = datasets['name'].values
    out_table = out_table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    out_table.to_csv(out_path, sep='\t')

# Save to DB

In [108]:
from IO.save_data_to_db3 import *

In [109]:
# Store raw and normalized values in the database for this paper.
# Per the logged output below, this deletes all existing datasets for the PMID,
# inserts the new data and updates the data_modified_on field.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/16 [00:00<?, ?it/s]

Deleting all datasets for PMID 12496357...
Inserting the new data...


100%|██████████| 16/16 [00:40<00:00,  2.55s/it]

Updating the data_modified_on field...



