In [219]:
%run ../../Utils/yp_utils.py

# Initial setup

In [220]:
paper_pmid = 24360837
paper_name = 'hoepfner_movva_2014' 

In [221]:
datasets = pd.read_csv('extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt', sep='\t', header=None, names=['pmid', 'name'])

In [222]:
datasets.set_index('pmid', inplace=True)

# Load & process the data - Benomyl

In [104]:
original_data1 = pd.read_csv('large_files/raw_data/HOP_scores-benomyl.txt', sep='\t')
original_data2 = pd.read_csv('large_files/raw_data/HIP_scores-benomyl.txt', sep='\t')

In [105]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 195
Original data dimensions: 6681 x 189


In [106]:
# Keep the sensitivity scores, not z-scores (z-score normalize each strain to its phenotype to all other compounds in the dataset)

In [107]:
cols1 = [c for c in original_data1.columns.values if 'z-score' not in c]
cols2 = [c for c in original_data2.columns.values if 'z-score' not in c]

In [108]:
original_data1 = original_data1.loc[:, cols1]
original_data2 = original_data2.loc[:, cols2]

In [109]:
orf_col = 'Systematic Name'

In [110]:
original_data1[orf_col] = original_data1[orf_col].astype(str)
original_data2[orf_col] = original_data2[orf_col].astype(str)

In [111]:
# Eliminate all white spaces & capitalize
original_data1[orf_col] = clean_orf(original_data1[orf_col])
original_data2[orf_col] = clean_orf(original_data2[orf_col])

In [112]:
# Translate to ORFs 
original_data1['orfs'] = translate_sc(original_data1[orf_col], to='orf')
original_data2['orfs'] = translate_sc(original_data2[orf_col], to='orf')

In [113]:
original_data1.loc[original_data1['orfs'] == 'YBR160WAS','orfs'] = 'YBR160W'
original_data2.loc[original_data2['orfs'] == 'YBR160WAS','orfs'] = 'YBR160W'

In [114]:
# Make sure everything translated ok
t = looks_like_orf(original_data1['orfs'])
# print(original_data1.loc[~t,])

In [115]:
# Make sure everything translated ok
t = looks_like_orf(original_data2['orfs'])
# print(original_data2.loc[~t,])

In [116]:
original_data1 = original_data1.loc[t,:]
original_data2 = original_data2.loc[t,:]

In [117]:
original_data1.set_index('orfs', inplace=True)
original_data2.set_index('orfs', inplace=True)
original_data1.index.name='orf'
original_data2.index.name='orf'

In [118]:
original_data1['data'] = original_data1.mean(axis=1)
original_data2['data'] = original_data2.mean(axis=1)

In [119]:
original_data = original_data1[['data']].join(original_data2[['data']], how='outer', lsuffix='_hop', rsuffix='_hip')

In [120]:
original_data = original_data.groupby(original_data.index).mean()

In [121]:
dataset_ids = [1087, 16622]
data_benomyl = original_data[['data_hop','data_hip']].copy()

In [122]:
data_benomyl.columns = dataset_ids

In [123]:
data_benomyl.head()

Unnamed: 0_level_0,1087,16622
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
Q0010,,
Q0017,,
Q0032,,
Q0045,,
Q0050,,


# Load and process data -- all others

In [232]:
original_data1 = pd.read_csv('large_files/raw_data/HOP_scores.txt', sep='\t')
original_data2 = pd.read_csv('large_files/raw_data/HIP_scores.txt', sep='\t')

In [239]:
original_data1.set_index('Systematic Name', inplace=True)

In [240]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 5846
Original data dimensions: 6681 x 5913


In [245]:
random_rows = np.random.choice(original_data1.index, 5)
random_cols = np.random.choice(original_data1.columns, 5)
original_data1.loc[random_rows, random_cols]

Unnamed: 0_level_0,Ad. scores for Exp. 696_120_HOP_0120,Ad. scores for Exp. 5455_35_HOP_0178 z-score,Ad. scores for Exp. 2368_100_HOP_0078 z-score,MADL scores for Exp. 189_75_HOP_0120 z-score,Ad. scores for Exp. 2796_1_HOP_0096 z-score
Systematic Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
YER087C-A,-0.278046,0.243508,-0.219257,-2.143087,-1.294666
YKL103C,0.351753,-0.046032,-0.071848,0.187157,0.019149
YER031C,-0.10143,0.010982,0.022837,-0.753961,-0.045918
YKL105C,0.633896,0.072092,-0.06122,-0.098761,0.28769
YJR154W,-0.039013,0.824515,0.005363,-0.02732,-0.117204


In [227]:
cols1 = [c for c in original_data1.columns.values if 'z-score' not in c]
cols2 = [c for c in original_data2.columns.values if 'z-score' not in c]

In [228]:
original_data1 = original_data1.loc[:, cols1]
original_data2 = original_data2.loc[:, cols2]

In [229]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6681 x 2924
Original data dimensions: 6681 x 2957


In [129]:
orf_col = 'Systematic Name'

In [130]:
original_data1[orf_col] = original_data1[orf_col].astype(str)
original_data2[orf_col] = original_data2[orf_col].astype(str)

In [131]:
# Eliminate all white spaces & capitalize
original_data1[orf_col] = clean_orf(original_data1[orf_col])
original_data2[orf_col] = clean_orf(original_data2[orf_col])

In [132]:
# Translate to ORFs 
original_data1['orfs'] = translate_sc(original_data1[orf_col], to='orf')
original_data2['orfs'] = translate_sc(original_data2[orf_col], to='orf')

In [133]:
original_data1.loc[original_data1['orfs'] == 'YBR160WAS','orfs'] = 'YBR160W'
original_data2.loc[original_data2['orfs'] == 'YBR160WAS','orfs'] = 'YBR160W'

In [134]:
# Make sure everything translated ok
t = looks_like_orf(original_data1['orfs'])
# print(original_data1.loc[~t,])

In [135]:
# Make sure everything translated ok
t = looks_like_orf(original_data2['orfs'])
# print(original_data2.loc[~t,])

In [136]:
original_data1 = original_data1.loc[t,:]
original_data2 = original_data2.loc[t,:]

In [137]:
original_data1.set_index('orfs', inplace=True)
original_data2.set_index('orfs', inplace=True)
original_data1.index.name='orf'
original_data2.index.name='orf'

In [138]:
original_data1.drop(columns=['Systematic Name'], inplace=True)

In [139]:
original_data2.drop(columns=['Systematic Name'], inplace=True)

In [140]:
original_data1 = original_data1.groupby(original_data1.index).mean()

In [141]:
original_data2 = original_data2.groupby(original_data2.index).mean()

### Map data columns to dataset_ids

In [142]:
dt = pd.read_csv('extras/datasets_name_to_id.txt', sep='\t')

In [143]:
dt.head()

Unnamed: 0.1,Unnamed: 0,name,cmb,dose,exp,type,new,conditionset_id,dataset
0,0,Ad. scores for Exp. 2_200_HOP_0020A,2,200.0,HOP,Ad.,True,9164.0,16714
1,1,Ad. scores for Exp. 6_173.545_HOP_0090,6,173.545,HOP,Ad.,True,9165.0,16715
2,2,Ad. scores for Exp. 6_200_HOP_0114,6,200.0,HOP,Ad.,True,9166.0,16716
3,3,Ad. scores for Exp. 13_80_HOP_0088,13,80.0,HOP,Ad.,True,9167.0,16717
4,4,Ad. scores for Exp. 19_2.3_HOP_0088,19,2.3,HOP,Ad.,True,9168.0,16718


In [144]:
dt.set_index('name', inplace=True)

In [145]:
dt1 = dt.reindex(index=original_data1.columns.values)

In [146]:
dt2 = dt.reindex(index=original_data2.columns.values)

In [147]:
dt2.head()

Unnamed: 0_level_0,Unnamed: 0,cmb,dose,exp,type,new,conditionset_id,dataset
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ad. scores for Exp. 2_200_HIP_0020A,0,2,200.0,HIP,Ad.,True,9164.0,19243
Ad. scores for Exp. 3_50_HIP_0077,2924,3,50.0,HIP,Ad.,False,,11874
Ad. scores for Exp. 6_173.545_HIP_0089,1,6,173.545,HIP,Ad.,True,9165.0,19244
Ad. scores for Exp. 6_200_HIP_0113,2,6,200.0,HIP,Ad.,True,9166.0,19245
Ad. scores for Exp. 13_80_HIP_0087,3,13,80.0,HIP,Ad.,True,9167.0,19246


In [148]:
original_data1.columns = dt1['dataset'].values
original_data1 = original_data1.T
original_data1 = original_data1.groupby(original_data1.index).mean()
original_data1 = original_data1.T
original_data1.shape

(6620, 2774)

In [149]:
original_data2.columns = dt2['dataset'].values
original_data2 = original_data2.T
original_data2 = original_data2.groupby(original_data2.index).mean()
original_data2 = original_data2.T
original_data2.shape

(6620, 2814)

### Merge

In [150]:
original_data = original_data1.join(original_data2, how='outer')

In [151]:
original_data.shape

(6620, 5588)

In [152]:
original_data_final = data_benomyl.join(original_data, how='outer', lsuffix='_benomyl', rsuffix='_other')

In [153]:
original_data_final.shape

(6620, 5590)

In [154]:
data_benomyl.shape

(6620, 2)

In [155]:
original_data.shape

(6620, 5588)

In [156]:
# Remove ORFs that are all NaNs
num_vals = original_data_final.notnull().sum(axis=1)

In [157]:
original_data_final = original_data_final.loc[num_vals>0,:]

In [158]:
original_data_final.shape

(5867, 5590)

In [159]:
original_data_final.head()

Unnamed: 0_level_0,1087,16622,456,1052,1053,1054,1055,1056,1057,1058,...,21823,21824,21825,21826,21827,21828,21829,21830,21831,21832
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YAL001C,0.153577,-0.039887,0.065251,-1.965097,-0.004813,-0.032265,-0.94114,-0.076135,0.072909,-1.365976,...,0.778166,0.050034,-1.294978,1.094208,-0.079728,-0.66073,-0.169347,0.391134,0.632678,-0.011431
YAL002W,-0.093247,0.261198,-7.26461,-7.892758,0.107382,-10.748092,-0.228372,0.014193,-7.588114,0.001553,...,0.910103,-0.100594,-0.603035,-0.433792,0.31251,1.488788,-1.736656,2.267683,-0.620552,-0.110128
YAL003W,,0.126743,,,,,,,,,...,-1.002932,-0.154591,-0.645912,2.670172,0.04062,0.940628,0.188706,-0.320287,-0.102011,0.127858
YAL004W,0.195749,0.303275,-0.440025,-0.445849,0.054191,0.328757,-0.044282,-0.161686,-0.006767,-0.019377,...,0.186409,2.190569,-0.051758,-1.646874,0.143214,0.497081,-2.862873,-0.394184,-2.654636,-0.060194
YAL005C,0.105597,-1.476444,0.886466,-0.108302,0.081409,-0.796944,-0.10844,-0.015199,0.045282,0.140557,...,-0.144813,-0.23878,-2.591392,-1.666898,1.27434,-0.454355,-1.53571,0.053646,-0.077133,-9.861199


# Prepare final dataset

In [160]:
data = original_data_final.copy()

In [161]:
dataset_ids = original_data_final.columns.values
datasets = datasets.reindex(index=dataset_ids)

In [162]:
lst = [datasets.index.values, ['value']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [163]:
data.head()

dataset_id,1087,16622,456,1052,1053,1054,1055,1056,1057,1058,...,21823,21824,21825,21826,21827,21828,21829,21830,21831,21832
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL001C,0.153577,-0.039887,0.065251,-1.965097,-0.004813,-0.032265,-0.94114,-0.076135,0.072909,-1.365976,...,0.778166,0.050034,-1.294978,1.094208,-0.079728,-0.66073,-0.169347,0.391134,0.632678,-0.011431
YAL002W,-0.093247,0.261198,-7.26461,-7.892758,0.107382,-10.748092,-0.228372,0.014193,-7.588114,0.001553,...,0.910103,-0.100594,-0.603035,-0.433792,0.31251,1.488788,-1.736656,2.267683,-0.620552,-0.110128
YAL003W,,0.126743,,,,,,,,,...,-1.002932,-0.154591,-0.645912,2.670172,0.04062,0.940628,0.188706,-0.320287,-0.102011,0.127858
YAL004W,0.195749,0.303275,-0.440025,-0.445849,0.054191,0.328757,-0.044282,-0.161686,-0.006767,-0.019377,...,0.186409,2.190569,-0.051758,-1.646874,0.143214,0.497081,-2.862873,-0.394184,-2.654636,-0.060194
YAL005C,0.105597,-1.476444,0.886466,-0.108302,0.081409,-0.796944,-0.10844,-0.015199,0.045282,0.140557,...,-0.144813,-0.23878,-2.591392,-1.666898,1.27434,-0.454355,-1.53571,0.053646,-0.077133,-9.861199


## Subset to the genes currently in SGD

In [164]:
genes = pd.read_csv(path_to_genes, sep='\t', index_col='id')
genes = genes.reset_index().set_index('systematic_name')
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.sum(np.isnan(gene_ids))
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [165]:
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()]
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,1087,16622,456,1052,1053,1054,1055,1056,1057,1058,...,21823,21824,21825,21826,21827,21828,21829,21830,21831,21832
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,0.153577,-0.039887,0.065251,-1.965097,-0.004813,-0.032265,-0.94114,-0.076135,0.072909,-1.365976,...,0.778166,0.050034,-1.294978,1.094208,-0.079728,-0.66073,-0.169347,0.391134,0.632678,-0.011431
2,YAL002W,-0.093247,0.261198,-7.26461,-7.892758,0.107382,-10.748092,-0.228372,0.014193,-7.588114,0.001553,...,0.910103,-0.100594,-0.603035,-0.433792,0.31251,1.488788,-1.736656,2.267683,-0.620552,-0.110128
3,YAL003W,,0.126743,,,,,,,,,...,-1.002932,-0.154591,-0.645912,2.670172,0.04062,0.940628,0.188706,-0.320287,-0.102011,0.127858
1863,YAL004W,0.195749,0.303275,-0.440025,-0.445849,0.054191,0.328757,-0.044282,-0.161686,-0.006767,-0.019377,...,0.186409,2.190569,-0.051758,-1.646874,0.143214,0.497081,-2.862873,-0.394184,-2.654636,-0.060194
4,YAL005C,0.105597,-1.476444,0.886466,-0.108302,0.081409,-0.796944,-0.10844,-0.015199,0.045282,0.140557,...,-0.144813,-0.23878,-2.591392,-1.666898,1.27434,-0.454355,-1.53571,0.053646,-0.077133,-9.861199


# Normalize

In [166]:
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [167]:
# Assign proper column names
lst = [datasets.index.values, ['valuez']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [168]:
data_vals = data.values
data_norm_vals = data_norm.values

data_norm_vals[np.isnan(data_vals)] = np.nan

data_norm = pd.DataFrame(index=data_norm.index, columns=data_norm.columns, data=data_norm_vals)

In [169]:
data_all = data.join(data_norm)
data_all.head()

Unnamed: 0_level_0,dataset_id,1087,16622,456,1052,1053,1054,1055,1056,1057,1058,...,21823,21824,21825,21826,21827,21828,21829,21830,21831,21832
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,0.153577,-0.039887,0.065251,-1.965097,-0.004813,-0.032265,-0.94114,-0.076135,0.072909,-1.365976,...,0.635389,0.094559,-1.426501,0.795991,0.027872,-0.396927,-0.295679,0.439669,0.713175,0.024946
2,YAL002W,-0.093247,0.261198,-7.26461,-7.892758,0.107382,-10.748092,-0.228372,0.014193,-7.588114,0.001553,...,0.734004,-0.061829,-0.746929,-0.258524,0.260448,0.901057,-2.291429,2.570595,-0.704679,-0.064953
3,YAL003W,,0.126743,,,,,,,,,...,-0.695882,-0.117891,-0.789039,1.883608,0.099232,0.570052,0.160251,-0.36819,-0.118022,0.15182
1863,YAL004W,0.195749,0.303275,-0.440025,-0.445849,0.054191,0.328757,-0.044282,-0.161686,-0.006767,-0.019377,...,0.193083,2.316947,-0.205508,-1.095705,0.160065,0.302216,-3.72551,-0.452105,-3.005959,-0.01947
4,YAL005C,0.105597,-1.476444,0.886466,-0.108302,0.081409,-0.796944,-0.10844,-0.015199,0.045282,0.140557,...,-0.054487,-0.205299,-2.699737,-1.109525,0.830762,-0.272307,-2.035553,0.056431,-0.089876,-8.946863


# Print out

In [71]:
for f in ['value','valuez']:
    df = data_all.xs(f, level='data_type', axis=1).copy()
    df.columns = datasets['name'].values
    df = df.droplevel('gene_id', axis=0)
    df.to_csv(paper_name + '_' + f + '.txt', sep='\t')

# Save to DB

In [170]:
# from IO.save_data_to_db3 import *

In [171]:
# save_data_to_db(data_all, paper_pmid, delete=True)