In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID and short name of the paper; both are used below to build file names.
paper_pmid, paper_name = 22102822, 'berry_gasch_2011'

In [3]:
# Read the tab-separated, header-less list of (dataset_id, name) pairs for this paper.
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by its id so it can be reindexed by dataset id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Load the 'Hom-Het COMPILATION' sheet of the raw workbook.
original_data = pd.read_excel(
    'raw_data/pgen.1002353.s009.xlsx',
    sheet_name='Hom-Het COMPILATION',
)

In [6]:
# Report how many rows and columns were loaded.
print('Original data dimensions: %d x %d' % tuple(original_data.shape))

Original data dimensions: 6128 x 94


In [7]:
# Preview the first five rows of the raw table.
original_data.head(n=5)

Unnamed: 0,YORF,DTT1 Sample1 vs Sample0 Array,DTT1 Sample2 vs Sample1 UP,DTT1 Sample2 vs Sample1 DN,DTT1 Sample2 vs Sample1 Array,DTT1 Sample3 vs Sample1 UP,DTT1 Sample3 vs Sample1 DN,DTT1 Sample3 vs Sample1 Array,DTT1 Sample4 vs Sample3 UP,DTT1 Sample4 vs Sample3 DN,...,TM2 Sample1 vs Sample0 UP,TM2 Sample1 vs Sample0 DN,TM2 Sample2 vs Sample1 UP,TM2 Sample2 vs Sample1 DN,TM2 Sample3 vs Sample1 UP,TM2 Sample3 vs Sample1 DN,TM2 Sample4A vs Sample3 UP,TM2 Sample4A vs Sample3 DN,TM2 Sample4B vs Sample3 UP,TM2 Sample4B vs Sample3 DN
0,YAL001C,-0.05,0.239259,,0.034,-0.013278,,-0.13,0.139765,,...,0.200691,,-0.314959,,-0.237385,,0.097175,,0.195584,
1,YAL002W,,-0.87271,,,0.231187,,,-1.882254,,...,0.516988,,-0.454378,,-1.691624,,-1.820814,,-2.378251,
2,YAL003W,0.592,0.253843,,0.007,0.147177,,-0.154,0.008348,,...,0.178665,,0.652619,,0.479385,,0.108625,,0.54197,
3,YAL004W,0.209,0.741564,,0.007,0.098807,,-0.231,0.261632,,...,0.090227,,0.629178,,0.369172,,0.081651,,0.119594,
4,YAL005C,-0.065,0.728445,,0.045,0.376067,,-0.077,0.265845,,...,0.132495,,0.617387,,0.557833,,0.329289,,0.592264,


In [8]:
# Add an 'orf' column holding the string form of the 'YORF' identifiers.
original_data = original_data.assign(orf=original_data['YORF'].astype(str))

In [9]:
# Normalize the identifiers with clean_orf (per the original note: removes white space and capitalizes).
original_data['orf'] = original_data['orf'].pipe(clean_orf)

In [10]:
# Convert whatever names are present into ORF identifiers.
original_data['orf'] = original_data['orf'].pipe(translate_sc, to='orf')

In [11]:
# Sanity check: list every row whose identifier still does not look like an ORF
# (an empty frame means the translation succeeded everywhere).
t = original_data['orf'].pipe(looks_like_orf)
print(original_data.loc[~t])

Empty DataFrame
Columns: [YORF, DTT1 Sample1 vs Sample0 Array, DTT1 Sample2 vs Sample1 UP, DTT1 Sample2 vs Sample1 DN, DTT1 Sample2 vs Sample1 Array, DTT1 Sample3 vs Sample1 UP, DTT1 Sample3 vs Sample1 DN, DTT1 Sample3 vs Sample1 Array, DTT1 Sample4 vs Sample3 UP, DTT1 Sample4 vs Sample3 DN, DTT1 Sample4A vs Sample3 Array, DTT1 Sample4B vs Sample3 Array, DTT2 Sample1 vs Sample0 Array, DTT2 Sample2 vs Sample1 UP, DTT2 Sample2 vs Sample1 DN, DTT2 Sample2 vs Sample1 Array, DTT2 Sample3 vs Sample1 UP, DTT2 Sample3 vs Sample1 DN, DTT2 Sample3 vs Sample1 Array, DTT2 Sample4 vs Sample3 UP, DTT2 Sample4 vs Sample3 DN, DTT2 Sample4A vs Sample3 Array, DTT2 Sample4B vs Sample3 Array, NaCl1 Sample1 vs Sample0 Array, NaCl1 Sample2 vs Sample1 Array, NaCl1 Sample3 vs Sample1 Array, NaCl1 Sample4A vs Sample3 Array, NaCl2 Sample1 vs Sample0 Array, NaCl2 Sample2 vs Sample1 UP, NaCl2 Sample2 vs Sample1 DN, NaCl2 Sample2 vs Sample1 Array, NaCl2 Sample3 vs Sample1 UP, NaCl2 Sample3 vs Sample1 DN, NaCl2 Sam

In [12]:
# Use the cleaned ORF as the row index.
original_data = original_data.set_index('orf')

In [13]:
# The raw identifier column is no longer needed once 'orf' is the index.
original_data = original_data.drop(columns=['YORF'])

In [14]:
# Force every column to a numeric dtype; entries that cannot be parsed become NaN.
for col_name in original_data.columns:
    numeric_col = pd.to_numeric(original_data[col_name], errors='coerce')
    original_data[col_name] = numeric_col

## Data processing

In [15]:
import re

In [16]:
# One pattern per output dataset. Order matters: the averaged columns built from
# these patterns are later relabeled positionally with the dataset ids.
# Patterns are applied with re.search, so they may match anywhere in a column name.
#
# The UP/DN suffix is written as the non-capturing alternation (?:DN|UP).
# The previous form, [(DN)(UP)], was a character class that matched any single
# character among "(", ")", "D", "N", "U", "P" rather than the words DN / UP.
regex_list = [
    # Array measurements
    'Sample1 vs Sample0 Array',
    'Sample2 vs Sample1 Array',
    'DTT[0-9] Sample3 vs Sample1 Array',
    'NaCl[0-9] Sample3 vs Sample1 Array',
    'HS[0-9] Sample3 vs Sample1 Array',
    'DTT[0-9] Sample4[A-Z]? vs Sample3 Array',
    'NaCl[0-9] Sample4[A-Z]? vs Sample3 Array',
    'HS[0-9] Sample4[A-Z]? vs Sample3 Array',
    # UP / DN measurements
    'Sample1 vs Sample0 (?:DN|UP)',
    'Sample2 vs Sample1 (?:DN|UP)',
    'DTT[0-9] Sample3 vs Sample1 (?:DN|UP)',
    'NaCl[0-9] Sample3 vs Sample1 (?:DN|UP)',
    'HS[0-9] Sample3 vs Sample1 (?:DN|UP)',
    'TM[0-9] Sample3 vs Sample1 (?:DN|UP)',
    'DTT[0-9] Sample4[A-Z]? vs Sample3 (?:DN|UP)',
    'NaCl[0-9] Sample4[A-Z]? vs Sample3 (?:DN|UP)',
    'HS[0-9] Sample4[A-Z]? vs Sample3 (?:DN|UP)',
    'TM[0-9] Sample4[A-Z]? vs Sample3 (?:DN|UP)',
]

In [17]:
# For each pattern, average (row-wise) all columns whose name matches it.
# The result is one Series per pattern, in the same order as regex_list.
original_data_list = [
    original_data[
        [col for col in original_data.columns if re.search(pattern, col) is not None]
    ].mean(axis=1)
    for pattern in regex_list
]

In [18]:
# Put the per-pattern averages side by side, one column per pattern.
original_data = pd.concat(original_data_list, axis='columns')

In [19]:
# Collapse rows that share the same index label by averaging them.
original_data = original_data.groupby(level=0).mean()

In [20]:
### Restrict to Hom strains, using the current Hom collection from Open Biosystems
hom = pd.read_excel(
    'extras/Homozygous_diploid_obs_v7.0.xlsx',
    sheet_name='DATA',
)
hom.head(n=5)

Unnamed: 0,Record number,ORF,Batch,Collection,Plate,Row,Col,Comment
0,30916,YHL047C,chr8_1,Hom Dip,1,A,2,
1,30917,YHL046C,chr8_1,Hom Dip,1,A,3,
2,30918,YHL045W,chr8_1,Hom Dip,1,A,4,
3,30919,YHL044W,chr8_1,Hom Dip,1,A,5,
4,30920,YHL043W,chr8_1,Hom Dip,1,A,6,


In [21]:
# Add an 'orf' column holding the string form of the collection's 'ORF' identifiers.
hom = hom.assign(orf=hom['ORF'].astype(str))

In [22]:
# Apply the same identifier clean-up used for the data table.
hom['orf'] = hom['orf'].pipe(clean_orf)

In [23]:
# Convert the collection's names into ORF identifiers.
hom['orf'] = hom['orf'].pipe(translate_sc, to='orf')

In [24]:
# Sanity check: list every collection row whose identifier does not look like an ORF
# (an empty frame means the translation succeeded everywhere).
t = hom['orf'].pipe(looks_like_orf)
print(hom.loc[~t])

Empty DataFrame
Columns: [Record number, ORF, Batch, Collection, Plate, Row, Col, Comment, orf]
Index: []


In [25]:
# Distinct ORFs in the collection, in order of first appearance.
hom_orfs = pd.unique(hom['orf'])

In [26]:
# Collection ORFs that also have data, kept in collection order.
# Membership is tested against the Index itself (hashed lookup) rather than its
# underlying NumPy array, which would be rescanned from the start for every ORF.
hom_orfs_in_data = [orf for orf in hom_orfs if orf in original_data.index]
len(hom_orfs_in_data)

4881

In [27]:
# Keep only the rows for collection ORFs present in the data, in collection order.
original_data = original_data.reindex(
    index=hom_orfs_in_data,
    fill_value=np.nan,
)

# Prepare the final dataset

In [28]:
# Work on an independent copy so original_data stays untouched from here on.
data = original_data.copy(deep=True)

In [29]:
# Dataset ids in the same order as the columns of `data`.
dataset_ids = [
    758, 759, 761, 760, 762, 590, 588, 589,
    5395, 5396, 5397, 5398, 5399, 5400, 5401, 5402, 5403, 5404,
]
# Reorder the dataset list to follow that column order.
datasets = datasets.reindex(index=dataset_ids)

In [30]:
# Label each data column as (dataset_id, 'value'); labels are assigned by position.
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * datasets.shape[0]],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [31]:
# Preview the relabeled table.
data.head(n=5)

dataset_id,758,759,761,760,762,590,588,589,5395,5396,5397,5398,5399,5400,5401,5402,5403,5404
data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
YHL047C,0.114142,0.030021,0.053,-0.024211,-0.15,-0.00825,0.031456,0.154,0.603629,0.285873,1.872119,0.204267,-0.238393,-2.630192,0.00139,-0.455216,-0.125437,1.52094
YHL046C,0.068473,0.073602,-0.1105,0.057596,0.0825,0.17875,0.039997,-0.3125,0.41497,0.07208,-0.780107,0.047117,0.01937,0.535437,0.616131,0.34026,-0.010614,0.047467
YHL045W,-0.083522,0.005647,0.1135,-0.058596,-0.062667,0.01375,0.116899,0.136,0.523496,-0.063968,0.056197,-0.204267,0.087238,0.150231,0.22346,0.154379,0.063613,0.267529
YHL044W,0.28935,-0.41909,-0.319,-0.267929,-0.26,-0.4345,-0.080897,-0.312,0.371554,-0.355122,-0.769061,-0.620288,-0.189663,1.936132,0.771208,-0.712168,-0.208608,0.57654
YHL043W,-0.033187,-0.075945,0.012,0.024357,-0.153,0.048,-0.048165,-0.04925,0.399952,0.067354,-0.294231,-0.001626,-0.575725,0.125253,0.721306,-0.055458,0.193834,0.274431


## Subset to the genes currently in SGD

In [32]:
# Load the gene table and key it by systematic name so ORFs can be mapped to ids.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the table come back as NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [33]:
# Attach the looked-up gene ids as a column (positionally aligned with the rows of `data`).
data['gene_id'] = gene_ids
# Drop the rows for which no gene id was found.
data = data.loc[data['gene_id'].notnull()]
# With the nulls gone the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Move 'orf' out of the index and rebuild the row index as (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

# Preview the result.
data.head()

Unnamed: 0_level_0,dataset_id,758,759,761,760,762,590,588,589,5395,5396,5397,5398,5399,5400,5401,5402,5403,5404
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
964,YHL047C,0.114142,0.030021,0.053,-0.024211,-0.15,-0.00825,0.031456,0.154,0.603629,0.285873,1.872119,0.204267,-0.238393,-2.630192,0.00139,-0.455216,-0.125437,1.52094
963,YHL046C,0.068473,0.073602,-0.1105,0.057596,0.0825,0.17875,0.039997,-0.3125,0.41497,0.07208,-0.780107,0.047117,0.01937,0.535437,0.616131,0.34026,-0.010614,0.047467
962,YHL045W,-0.083522,0.005647,0.1135,-0.058596,-0.062667,0.01375,0.116899,0.136,0.523496,-0.063968,0.056197,-0.204267,0.087238,0.150231,0.22346,0.154379,0.063613,0.267529
961,YHL044W,0.28935,-0.41909,-0.319,-0.267929,-0.26,-0.4345,-0.080897,-0.312,0.371554,-0.355122,-0.769061,-0.620288,-0.189663,1.936132,0.771208,-0.712168,-0.208608,0.57654
960,YHL043W,-0.033187,-0.075945,0.012,0.024357,-0.153,0.048,-0.048165,-0.04925,0.399952,0.067354,-0.294231,-0.001626,-0.575725,0.125253,0.721306,-0.055458,0.193834,0.274431


# Normalize

In [34]:
# Compute the normalized scores from the raw values.
data_norm = data.pipe(normalize_phenotypic_scores, has_tested=True)

In [35]:
# Label each normalized column as (dataset_id, 'valuez'); labels are assigned by position.
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * datasets.shape[0]],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [36]:
# Blank out normalized scores wherever the raw value is missing.
# pandas aligns a boolean DataFrame mask with the target on row AND column labels.
# `data` columns are labeled (dataset_id, 'value') whereas `data_norm` columns are
# labeled (dataset_id, 'valuez'), so the raw mask would match no column and the
# assignment would silently do nothing. Relabel the mask with data_norm's columns
# (same datasets, same order) so that it actually applies.
missing_raw = data.isnull()
missing_raw.columns = data_norm.columns
data_norm[missing_raw] = np.nan

# Raw and normalized values side by side, joined on the (gene_id, orf) index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,758,759,761,760,762,590,588,589,5395,5396,...,5395,5396,5397,5398,5399,5400,5401,5402,5403,5404
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
964,YHL047C,0.114142,0.030021,0.053,-0.024211,-0.15,-0.00825,0.031456,0.154,0.603629,0.285873,...,0.140718,0.282097,3.958126,0.570707,-0.58274,-2.042343,-0.416363,-1.01774,-0.382026,1.35724
963,YHL046C,0.068473,0.073602,-0.1105,0.057596,0.0825,0.17875,0.039997,-0.3125,0.41497,0.07208,...,-0.025326,-0.098878,-1.391112,0.193088,-0.02287,0.350388,0.594405,0.451304,-0.188633,-0.105572
962,YHL045W,-0.083522,0.005647,0.1135,-0.058596,-0.062667,0.01375,0.116899,0.136,0.523496,-0.063968,...,0.070191,-0.341313,0.295618,-0.410968,0.124543,0.059231,-0.051231,0.108029,-0.063614,0.112897
961,YHL044W,0.28935,-0.41909,-0.319,-0.267929,-0.26,-0.4345,-0.080897,-0.312,0.371554,-0.355122,...,-0.063537,-0.860143,-1.368834,-1.410635,-0.476897,1.409098,0.849385,-1.492265,-0.52211,0.419672
960,YHL043W,-0.033187,-0.075945,0.012,0.024357,-0.153,0.048,-0.048165,-0.04925,0.399952,0.067354,...,-0.038543,-0.107299,-0.411156,0.075963,-1.315437,0.040352,0.767335,-0.279487,0.155717,0.11975


# Print out

In [37]:
# Write one tab-separated file per data type (raw values and normalized values).
for data_type in ('value', 'valuez'):
    # Select the columns of this data type and label them with the dataset names.
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    # Only the ORF is kept as the row label in the exported file.
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [38]:
from IO.save_data_to_db3 import *

In [39]:
# Store the combined table in the database under this paper's PMID.
# NOTE: per the logged output, this first deletes all existing datasets for the
# PMID, then inserts the new data and updates the data_modified_on field.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/18 [00:00<?, ?it/s]

Deleting all datasets for PMID 22102822...
Inserting the new data...


100%|██████████| 18/18 [02:18<00:00,  7.70s/it]

Updating the data_modified_on field...



