In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used to build the datasets-list path and passed to save_data_to_db.
paper_pmid = 12140549
# Prefix of the exported '<paper_name>_value.txt' / '<paper_name>_valuez.txt' files.
paper_name = 'giaever_johnston_2002' 

In [3]:
# Read the tab-separated, header-less list of (dataset_id, name) pairs for this paper.
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset metadata by dataset id.
datasets = datasets.set_index('dataset_id')

In [5]:
# Sanity check: (number of datasets listed for this paper, number of metadata columns).
datasets.shape

(22, 1)

# Load & process the data

In [6]:
# Collect the per-experiment .txt files. The YPD table ('ypd*') and the cell
# morphology table ('Cell*') are excluded here and loaded separately further down.
# os.listdir returns entries in arbitrary order, so sort to make the file order
# (and the column order derived from it) reproducible between runs and machines.
data_files = sorted(
    f for f in os.listdir('raw_data/')
    if f.endswith('.txt') and not f.startswith(('ypd', 'Cell'))
)
len(data_files)

62

In [7]:
# Parse every per-experiment file into a one-column frame of signed scores indexed by ORF.
original_data_list = []
original_data_experiment_list = []
for file_ix, file_name in enumerate(data_files):
    print(file_ix)
    name_parts = file_name.split('_')
    # The leading token of the file name is the experiment number.
    experiment_id = int(name_parts[0])
    # Files whose last token is 'sen.txt' get their scores negated.
    if name_parts[-1] == 'sen.txt':
        sign = -1
    else:
        sign = 1

    raw = pd.read_csv('raw_data/' + file_name, header=None, sep='\t')

    # Column 0 is treated as the strain identifier and passed through the ORF helpers.
    raw['orf'] = raw[0].astype(str)
    raw['orf'] = clean_orf(raw['orf'])
    raw['orf'] = translate_sc(raw['orf'], to='orf')
    t = looks_like_orf(raw['orf'])  # computed for inspection only; rows are not filtered here

    # Column 1 is the score; entries that cannot be parsed as numbers become NaN.
    raw = raw.set_index('orf')
    raw['data'] = sign * pd.to_numeric(raw[1], errors='coerce')
    scores = raw[['data']].copy()
    # Average rows that share the same ORF.
    scores = scores.groupby(scores.index).mean()

    original_data_list.append(scores)
    original_data_experiment_list.append(experiment_id)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61


In [8]:
# Place the per-file score columns side by side, aligned on the ORF index (one column per input file).
original_data = pd.concat(original_data_list, axis=1)

In [9]:
# Label each column with the experiment number parsed from its file name.
original_data.columns = original_data_experiment_list

In [10]:
# Sanity check: (number of ORFs, number of input files).
original_data.shape

(4715, 62)

In [11]:
# Load the mapping table that links experiment numbers ('Experiment') to dataset ids ('Dataset id').
dt = pd.read_excel('raw_data/phenotype_mapping.xlsx', sheet_name='Sheet1')

In [12]:
# Key the mapping table by experiment number.
dt = dt.set_index('Experiment')

In [13]:
# Put the mapping rows in the same order as the columns of original_data (one row per input file).
dt = dt.reindex(index=original_data_experiment_list)

In [14]:
# Dataset id for each input file, in column order; several files can map to the same dataset id.
dataset_ids = dt['Dataset id'].values

In [15]:
# Display the per-file dataset ids for visual inspection.
dataset_ids

array([720, 714, 723, 710, 710, 715, 719, 712, 711, 720, 718, 472, 722,
       721, 718, 721, 713, 723, 714, 719, 712, 717, 713, 716, 722, 724,
       717, 717, 472, 716, 715, 720, 716, 714, 723, 716, 713, 715, 719,
       710, 710, 712, 718, 472, 722, 720, 718, 721, 721, 711, 714, 711,
       723, 717, 712, 719, 724, 713, 722, 711, 715, 472])

In [16]:
# Relabel columns from experiment numbers to dataset ids (labels are repeated where files share a dataset).
original_data.columns = dataset_ids

In [17]:
# Average the columns that share a dataset id: transpose so that dataset ids become
# the row index, take the per-id mean, then transpose back to ORFs x datasets.
by_dataset = original_data.T
by_dataset = by_dataset.groupby(by_dataset.index).mean()
original_data = by_dataset.T

In [18]:
# Sanity check: (number of ORFs, number of distinct dataset ids).
original_data.shape

(4715, 16)

In [19]:
# Count of non-missing values per dataset.
original_data.notnull().sum(axis=0)

472    4715
710    4715
711    4715
712    4715
713    4715
714    4715
715    4715
716    4715
717    4715
718    4715
719    4715
720    4715
721    4715
722    4715
723    4715
724    4715
dtype: int64

# Load data (2)

In [20]:
# Load the tab-separated YPD table (columns used below: 'ORF' and 'Average Ratio').
ypd = pd.read_csv('raw_data/ypd.txt', sep='\t')

In [21]:
# Preview the raw YPD table.
ypd.head()

Unnamed: 0,ORF,Gene,Average Ratio,Gene Description
0,YNL138W,SRV2,30.6,"70-kDa adenylyl cyclase-associated protein, cy..."
1,YER070W,RNR1,22.3,"ribonucleotide reductase, DNA replication, rib..."
2,YNL054W,VAC7,21.0,"Integral vacuolar membrane protein,molecular_f..."
3,YFL023W,FYV11,20.9,"biological_process unknown, molecular_function..."
4,YBR200W,BEM1,20.3,contains two SH3 domains


In [22]:
# Build a cleaned 'orf' column from the raw 'ORF' identifiers using the shared ORF helpers.
ypd['orf'] = ypd['ORF'].astype(str)    
ypd['orf'] = clean_orf(ypd['orf'])
ypd['orf'] = translate_sc(ypd['orf'], to='orf')
t = looks_like_orf(ypd['orf'])
# Print the rows that fail the ORF check (they are only shown, not removed, in this cell).
print(ypd.loc[~t,])



Empty DataFrame
Columns: [ORF, Gene, Average Ratio, Gene Description, orf]
Index: []


In [23]:
# Reduce the YPD table to a single 'data' column indexed by ORF.
ypd_by_orf = ypd.set_index('orf')
# The score is the negated 'Average Ratio'; entries that cannot be parsed become NaN.
negated_ratio = -pd.to_numeric(ypd_by_orf['Average Ratio'], errors='coerce')
ypd = negated_ratio.to_frame('data')
# Average rows that share the same ORF.
ypd = ypd.groupby(ypd.index).mean()

In [24]:
# Preview the processed YPD scores.
ypd.head()

Unnamed: 0_level_0,data
orf,Unnamed: 1_level_1
YAL010C,-3.8
YAL016W,-9.5
YAL021C,-3.1
YAL023C,-6.3
YAL035W,-5.2


In [25]:
# Label the single score column with its dataset id.
# NOTE(review): 16187 is presumably the id of the YPD dataset in the datasets list — confirm.
ypd.columns = [16187]

In [26]:
# Sanity check: (number of ORFs with a YPD score, 1).
ypd.shape

(618, 1)

In [27]:
# Add the YPD column; the outer join keeps ORFs that appear in either table.
original_data2 = original_data.join(ypd, how='outer')

In [28]:
# Sanity check: (number of ORFs, number of datasets) after adding YPD.
original_data2.shape

(4715, 17)

In [30]:
# ORFs without an entry in the YPD table get a score of 0 instead of NaN.
original_data2[16187] = original_data2[16187].fillna(0)

# Load data (3)

In [31]:
# Load the tab-separated cell morphology screen table.
morph = pd.read_csv('raw_data/Cell_Morph_Screen_Table.txt', sep='\t')

In [32]:
# Preview the raw morphology table (the first row is a dashed separator line from the source file).
morph.head()

Unnamed: 0,ORF,strain background,Gene Names,Cell Shape Morphologies
0,---------,------------------,----------------------------------,------------------------------------------------
1,YAL002W,homozygous diploid,"VPS8, FUN15, VPT8",WT
2,YAL004W,homozygous diploid,,WT
3,YAL005C,homozygous diploid,"SSA1, YG100",WT
4,YAL007C,homozygous diploid,ERP2,WT


In [33]:
# Remove leading/trailing whitespace from the column names.
morph.columns = morph.columns.str.strip()

In [34]:
# Build a cleaned 'orf' column from the raw 'ORF' identifiers using the shared ORF helpers.
morph['orf'] = morph['ORF'].astype(str)
morph['orf'] = clean_orf(morph['orf'])
# Fix a typo in the source table (letter 'O' typed in place of the digit '0').
morph.loc[morph['orf']=='YELOO1C','orf'] = 'YEL001C'
morph['orf'] = translate_sc(morph['orf'], to='orf')
t = looks_like_orf(morph['orf'])
# Show the rows that are about to be dropped because they fail the ORF check.
print(morph.loc[~t,])
# Keep only valid ORFs. Take an explicit copy so that the column assignments in the
# following cells write to an independent frame instead of a slice of the original
# (avoids SettingWithCopyWarning and ambiguous view/copy behaviour).
morph = morph.loc[t,:].copy()


                   ORF   strain background  \
index_input                                  
0            ---------  ------------------   
3631          YMR41W    homozygous diploid   

                                     Gene Names  \
index_input                                       
0            ----------------------------------   
3631                                              

                                      Cell Shape Morphologies        orf  
index_input                                                               
0            ------------------------------------------------  ---------  
3631         WT                                                   YMR41W  


In [35]:
# Load the table mapping morphology terms ('ORIGINAL') to a dataset id and a sign ('COEFFICIENT').
mp = pd.read_excel('raw_data/phenotype_mapping2.xlsx', sheet_name='Sheet1')
mp.head()

Unnamed: 0,ORIGINAL,WHO,WHAT,WHERE,WHEN,HOW,COEFFICIENT,Dataset id
0,Large,cells,size,,in log phase,visual inspection,1,725
1,Small,cells,size,,in log phase,visual inspection,-1,725
2,Elongate,cells,shape (elongate),,in log phase,visual inspection,1,726
3,Round,cells,shape (round),,in log phase,visual inspection,1,727
4,Football,cells,shape (football),,in log phase,visual inspection,1,729


In [36]:
# Key the mapping by the original morphology term so it can be looked up by name.
mp = mp.set_index('ORIGINAL')

In [37]:
# Create one score column per morphology dataset, initialised to 0.
for dataset_id in mp['Dataset id'].unique():
    morph[dataset_id] = 0

In [38]:
# Translate each strain's free-text morphology annotation into numeric scores.
# The annotation is a ';'-separated list of entries of the form '<term> <integer score>'.
for ixr, row in morph.iterrows():
    ps = [x.strip() for x in row['Cell Shape Morphologies'].split(';')]

    for p in ps:
        parts = p.split(' ')
        # Entries without a second token (e.g. 'WT') carry no score.
        if len(parts) < 2:
            continue

        ph = parts[0]
        try:
            score = int(parts[1])
        except ValueError:
            # The second token is not an integer: skip this entry. (The original code
            # used the bare name `next` here, which is a no-op expression, so execution
            # fell through and reused the score left over from a previous entry.)
            continue

        # Only terms present in the mapping table are recorded.
        if ph in mp.index:
            # Signed score: the mapping's COEFFICIENT sets the direction of the phenotype.
            morph.loc[ixr, mp.loc[ph, 'Dataset id']] = score * mp.loc[ph, 'COEFFICIENT']

In [39]:
# Index by ORF and keep only the five morphology dataset columns.
morph_by_orf = morph.set_index('orf')
morph = morph_by_orf[[725, 726, 727, 729, 728]].copy()

In [40]:
# Average rows that share the same ORF.
morph = morph.groupby(morph.index).mean()

In [41]:
# Quick check: total score per morphology dataset.
morph.sum(axis=0)

725     -4.5
726    377.0
727    547.0
729    286.0
728     81.0
dtype: float64

In [42]:
# Add the morphology columns; the outer join keeps ORFs that appear in either table.
original_data2 = original_data2.join(morph, how='outer')

In [43]:
# Sanity check: (number of ORFs, number of datasets) after adding the morphology data.
original_data2.shape

(4759, 22)

In [44]:
# Name the row index 'orf'; later cells refer to it by this name (set_index(['gene_id','orf'])).
original_data2.index.name='orf'

In [45]:
# Display the dataset ids currently used as column labels.
original_data2.columns

Int64Index([  472,   710,   711,   712,   713,   714,   715,   716,   717,
              718,   719,   720,   721,   722,   723,   724, 16187,   725,
              726,   727,   729,   728],
           dtype='int64')

In [46]:
# Count of non-missing values per dataset in the combined table.
original_data2.notnull().sum(axis=0)

472      4715
710      4715
711      4715
712      4715
713      4715
714      4715
715      4715
716      4715
717      4715
718      4715
719      4715
720      4715
721      4715
722      4715
723      4715
724      4715
16187    4715
725      4726
726      4726
727      4726
729      4726
728      4726
dtype: int64

# Prepare the final dataset

In [47]:
# Work on a copy so that original_data2 stays unchanged by the steps below.
data = original_data2.copy()

In [48]:
# Reorder the dataset metadata so its rows follow the column order of the data table.
dataset_ids = original_data2.columns.values
datasets = datasets.reindex(index=dataset_ids)

In [49]:
# Give the columns a two-level label: (dataset_id, 'value').
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * n_datasets],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [50]:
# Preview the table with its two-level column labels.
data.head()

dataset_id,472,710,711,712,713,714,715,716,717,718,...,721,722,723,724,16187,725,726,727,729,728
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL002W,-32.692653,5.029054,-16.206715,-7.335546,32.626678,0.699607,5.608615,-1.852476,-1.461604,-3.85965,...,0.13452,0.848066,4.292242,-1.906391,0.0,0.0,0.0,0.0,0.0,0.0
YAL004W,-2.725167,4.471172,-3.277725,-46.122972,-23.488141,-3.093093,-1.130027,2.401893,0.16842,2.882906,...,3.581689,3.421117,5.179082,-1.715363,0.0,0.0,0.0,0.0,0.0,0.0
YAL005C,-1.686238,-1.919843,-2.19142,-0.105042,-2.285902,-0.131624,0.337556,-2.694712,-0.52013,-0.733481,...,0.480521,0.69241,0.073457,1.107479,0.0,0.0,0.0,0.0,0.0,0.0
YAL007C,0.07749,-2.840667,-1.996056,-4.411874,-1.079484,0.187968,0.681174,-2.742676,2.415878,0.053325,...,0.771273,1.262348,-2.487375,2.720904,0.0,0.0,0.0,0.0,0.0,0.0
YAL008W,0.870972,0.990254,-2.301183,-14.160435,-9.156816,-1.37651,0.458605,-14.712105,-5.78151,-8.503072,...,0.551308,0.500345,-1.120764,2.390691,0.0,0.0,0.0,0.0,0.0,0.0


## Subset to the genes currently in SGD

In [51]:
# Look up the gene id of every ORF in the data table.
# NOTE(review): path_to_genes is not defined in this notebook — presumably provided by yp_utils.py; confirm.
genes = pd.read_csv(path_to_genes, sep='\t', index_col='id')
# Re-key the gene table by systematic name so it can be reindexed with the ORF labels.
genes = genes.reset_index().set_index('systematic_name')
# ORFs that are not in the gene table come back as NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.sum(np.isnan(gene_ids))
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 27


In [52]:
# Attach the gene ids and keep only the ORFs that were found in the gene table.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the dtype cast on the next line
# writes to it directly rather than to a slice of the unfiltered frame
# (avoids SettingWithCopyWarning and ambiguous view/copy behaviour).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaN rows removed, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,472,710,711,712,713,714,715,716,717,718,...,721,722,723,724,16187,725,726,727,729,728
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,-32.692653,5.029054,-16.206715,-7.335546,32.626678,0.699607,5.608615,-1.852476,-1.461604,-3.85965,...,0.13452,0.848066,4.292242,-1.906391,0.0,0.0,0.0,0.0,0.0,0.0
1863,YAL004W,-2.725167,4.471172,-3.277725,-46.122972,-23.488141,-3.093093,-1.130027,2.401893,0.16842,2.882906,...,3.581689,3.421117,5.179082,-1.715363,0.0,0.0,0.0,0.0,0.0,0.0
4,YAL005C,-1.686238,-1.919843,-2.19142,-0.105042,-2.285902,-0.131624,0.337556,-2.694712,-0.52013,-0.733481,...,0.480521,0.69241,0.073457,1.107479,0.0,0.0,0.0,0.0,0.0,0.0
5,YAL007C,0.07749,-2.840667,-1.996056,-4.411874,-1.079484,0.187968,0.681174,-2.742676,2.415878,0.053325,...,0.771273,1.262348,-2.487375,2.720904,0.0,0.0,0.0,0.0,0.0,0.0
6,YAL008W,0.870972,0.990254,-2.301183,-14.160435,-9.156816,-1.37651,0.458605,-14.712105,-5.78151,-8.503072,...,0.551308,0.500345,-1.120764,2.390691,0.0,0.0,0.0,0.0,0.0,0.0


# Normalize

In [53]:
# Compute normalized scores with the shared helper (defined outside this notebook).
# NOTE(review): the meaning of has_tested=True is not visible here — confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [54]:
# Label the normalized columns as (dataset_id, 'valuez'), in the same dataset order as the raw values.
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * n_datasets],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [55]:
# Blank out normalized values wherever the raw value is missing.
# Boolean-frame assignment aligns the mask on row AND column labels. The raw columns
# are labelled (dataset_id, 'value') while the normalized ones are (dataset_id, 'valuez'),
# so an unrelabelled mask matches no column and the assignment silently does nothing.
# Relabel the mask with data_norm's columns first; both frames list the datasets in the
# same order, which the column assignment on data_norm above already relies on.
null_mask = data.isnull()
null_mask.columns = data_norm.columns
data_norm[null_mask] = np.nan

# Combine raw ('value') and normalized ('valuez') columns into one table.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,472,710,711,712,713,714,715,716,717,718,...,721,722,723,724,16187,725,726,727,729,728
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,-32.692653,5.029054,-16.206715,-7.335546,32.626678,0.699607,5.608615,-1.852476,-1.461604,-3.85965,...,-0.044276,-0.041177,0.320699,0.128168,0.0,-0.059673,0.0,0.0,0.0,0.0
1863,YAL004W,-2.725167,4.471172,-3.277725,-46.122972,-23.488141,-3.093093,-1.130027,2.401893,0.16842,2.882906,...,0.271315,0.163301,0.382314,0.143625,0.0,-0.059673,0.0,0.0,0.0,0.0
4,YAL005C,-1.686238,-1.919843,-2.19142,-0.105042,-2.285902,-0.131624,0.337556,-2.694712,-0.52013,-0.733481,...,-0.0126,-0.053547,0.027588,0.372038,0.0,-0.059673,0.0,0.0,0.0,0.0
5,YAL007C,0.07749,-2.840667,-1.996056,-4.411874,-1.079484,0.187968,0.681174,-2.742676,2.415878,0.053325,...,0.014019,-0.008254,-0.150332,0.502589,0.0,-0.059673,0.0,0.0,0.0,0.0
6,YAL008W,0.870972,0.990254,-2.301183,-14.160435,-9.156816,-1.37651,0.458605,-14.712105,-5.78151,-8.503072,...,-0.006119,-0.06881,-0.055383,0.47587,0.0,-0.059673,0.0,0.0,0.0,0.0


# Print out

In [56]:
# Write the raw ('value') and normalized ('valuez') matrices to tab-separated text files.
for data_type in ['value', 'valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Rows are identified by ORF only in the exported files.
    table = table.droplevel('gene_id', axis=0)
    # Columns are labelled with the dataset names instead of the dataset ids.
    table.columns = datasets['name'].values
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [57]:
from IO.save_data_to_db3 import *

In [58]:
# Store the combined table in the database for this paper.
# Per the logged output, the existing datasets for this PMID are deleted before the new data are inserted.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 12140549...


  0%|          | 0/22 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 22/22 [02:43<00:00,  7.43s/it]

Updating the data_modified_on field...



