In [1]:
# Execute the shared helper script in this notebook's namespace. No explicit
# imports appear in this notebook, so pd, np and the helpers used below
# (clean_genename, translate_sc, looks_like_orf, normalize_phenotypic_scores,
# path_to_genes) presumably come from it — TODO confirm.
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the source publication; both are reused below to build file names.
paper_pmid = 24766815
paper_name = 'kemmeren_holstege_2014' 

In [125]:
# Load the tab-separated, header-less list of datasets (id + name) for this paper.
datasets = pd.read_csv(
    f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [126]:
# Index the dataset table by its dataset_id.
datasets = datasets.set_index('dataset_id')

In [128]:
# Extract the ORF from the first parenthesized token of each dataset name.
# A regex is used instead of str.find() slicing: find() returns -1 when a
# parenthesis is missing, which would silently produce a truncated name,
# whereas str.extract() yields NaN for names that do not match.
datasets['orf'] = datasets['name'].str.extract(r'\(([^)]*)\)', expand=False)

In [131]:
# Sanity check: number of datasets (rows) and of metadata columns.
datasets.shape

(6112, 2)

In [129]:
# Preview the dataset table, including the extracted 'orf' column.
datasets.head()

Unnamed: 0_level_0,name,orf
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5658,hap alpha | expression level of a particular g...,Q0010
5659,hap alpha | expression level of a particular g...,Q0017
5660,hap alpha | expression level of a particular g...,Q0032
5661,hap alpha | expression level of a particular g...,Q0045
5662,hap alpha | expression level of a particular g...,Q0060


# Load & process the data

In [97]:
# Read the raw tab-separated table in one pass (low_memory=False).
original_data = pd.read_csv(
    'raw_data/deleteome_all_mutants_ex_wt_var_controls.txt',
    sep='\t',
    low_memory=False,
)

In [98]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 6124 x 4464


In [99]:
# Flip the table so that the original columns become rows.
original_data = original_data.transpose()

In [100]:
# Use the 'systematicName' row (cast to str) as the column labels.
original_data.columns = original_data.loc['systematicName'].astype(str)

In [101]:
# Remove the three annotation rows now that the column labels are set.
original_data = original_data.drop(index=['reporterId', 'systematicName', 'geneSymbol'])

In [102]:
# Select the rows whose first column holds the marker 'M'
# (presumably the M-value columns of the raw file — TODO confirm).
is_m_row = original_data.iloc[:, 0] == 'M'
data_rows = original_data.index[is_m_row]

In [103]:
# Keep only the rows selected as data rows.
original_data = original_data.loc[data_rows]

In [104]:
# Drop the column(s) labelled 'nan' (presumably a missing systematicName
# that became the string 'nan' through astype(str) — TODO confirm).
original_data = original_data.drop(columns=['nan'])

In [105]:
# Preview the reshaped table: rows are strain comparisons, columns are systematic names.
original_data.head()

systematicName,Q0010,Q0017,Q0032,Q0045,Q0060,Q0085,Q0092,Q0105,Q0110,Q0115,...,YKL138C-A,YLR154W-C,YML009W-B,YML099W-A,YNL067W-A,YNL162W-A,YNR001W-A,YNR034W-A,YPL096C-A,YBR191W-A
swd1-del-matA vs. wt-matA,0.0076834948,-0.093275612,-0.020262902,-0.80578684,0.02238912,-0.12459737,0.019516896,-0.1414858,-0.2639511,0.03122579,...,-0.66908599,-0.42403134,0.088521561,-0.16640066,0.090528279,0.21356503,-0.027404387,-0.12522821,-0.011412011,-0.38907132
ptc1-del vs. wt,-0.0023376161,-0.00078359617,0.020754644,-0.29111708,-2.0106226,-0.40110698,-0.11458026,-0.057631883,-0.42901061,-0.29239393,...,-0.21893074,-0.21768538,-0.450388,-0.088182655,-0.1022221,-0.20338397,-0.044086137,1.1668568,-0.17993625,-0.41584563
tlc1-del vs. wt,-0.054759384,-0.084442387,-0.011870142,0.082048528,0.20960309,0.025128043,0.0089887832,-0.017701824,-0.18066862,-0.17931534,...,-0.12956662,0.20709321,-0.032282461,0.23421925,-0.10988955,-0.35720418,-0.03353114,2.4556073,0.19387265,0.097920875
rad16-del vs. wt,-0.013982332,0.0072385663,-0.19054769,-0.39000114,-0.70753627,-0.4414992,-0.048980742,-0.046229673,-0.34653819,0.10791547,...,0.12640465,-0.38975351,-0.14707749,0.022019749,0.037241636,0.24044867,-0.049422595,-0.13252257,0.16191347,0.060711493
msn2-del vs. wt,-0.0069850997,-0.035595453,-0.039861986,0.091205595,0.28766141,0.020527092,0.0084438613,-0.012791207,0.16923234,0.12353377,...,-0.049671943,0.29364122,0.064841474,-0.14792603,0.0039331911,-0.028744508,0.037283813,-0.50302642,0.0090891411,-0.0023722973


In [106]:
# Extract deletion strain names: everything before the first '-del' in the row label
# (labels without '-del' are kept whole).
original_data['genes'] = [label.partition('-del')[0] for label in original_data.index]

In [107]:
# Normalize the gene names with the clean_genename helper (defined outside this
# notebook); per the original note it eliminates all white spaces & capitalizes.
original_data['genes'] = clean_genename(original_data['genes'])

In [108]:
# Hand-curated overrides: gene names that are replaced directly by an ORF.
manual_fixes = {
    'LUG1': 'YLR352W',
    'CYCC': 'YNL025C',
}
# Names without an override pass through unchanged.
original_data['genes'] = original_data['genes'].map(lambda gene: manual_fixes.get(gene, gene))

In [109]:
# Translate gene names to ORFs using the translate_sc helper (defined outside this notebook).
original_data['orf'] = translate_sc(original_data['genes'], to='orf')

In [110]:
# Make sure everything translated ok: flag the entries that look like ORFs
# and show the rows that do not.
t = looks_like_orf(original_data['orf'])
not_translated = original_data.loc[~t]
print(not_translated)

systematicName             Q0010         Q0017          Q0032         Q0045  \
index_input                                                                   
tlc1-del vs. wt     -0.054759384  -0.084442387   -0.011870142   0.082048528   
sdc25-del vs. wt    -0.058805026  -0.041727464   -0.041660639     0.2051047   
hsn1-del vs. wt      0.021024816  -0.015581268  -0.0072895682   -0.12675376   
wt-matA vs wt.1     -0.017202417  -0.011236069   -0.012401174  -0.010878436   
wt-by4743 vs. wt.1  0.0083864892   0.010049357    0.013855263   -0.21042431   
wt-ypd vs. wt.1      -0.02910071   -0.01279994   -0.007499834   -0.16079984   

systematicName             Q0060         Q0085         Q0092         Q0105  \
index_input                                                                  
tlc1-del vs. wt       0.20960309   0.025128043  0.0089887832  -0.017701824   
sdc25-del vs. wt      0.12562465    0.10719238     0.0497144  -0.043498709   
hsn1-del vs. wt       -0.5416348   -0.10568278   -0.028

In [111]:
# Keep only the rows whose 'orf' entry passed the looks_like_orf check.
original_data = original_data.loc[t]

In [112]:
# Index the rows by ORF and discard the helper 'genes' column.
original_data = original_data.set_index('orf').drop(columns=['genes'])

In [113]:
# Cast every column to float so the averaging steps that follow operate on numbers.
original_data = original_data.astype(float)

In [114]:
# Average rows that share the same ORF label.
original_data = original_data.groupby(level=0).mean()

In [115]:
# Average columns that share the same label: transpose, group the (former)
# columns by label, take the mean, and transpose back.
original_data = original_data.T.groupby(level=0).mean().T

In [116]:
# Check the dimensions after averaging duplicate rows and columns.
original_data.shape

(1480, 6112)

In [117]:
# Preview the processed table: rows indexed by ORF, columns by systematic name.
original_data.head()

systematicName,Q0010,Q0017,Q0032,Q0045,Q0060,Q0085,Q0092,Q0105,Q0110,Q0115,...,YPR193C,YPR194C,YPR195C,YPR196W,YPR197C,YPR198W,YPR199C,YPR200C,YPR201W,snR10
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YAL002W,0.007138,0.018591,0.058884,-0.410887,-2.123057,-0.458983,0.209915,-0.061517,-0.358149,-0.129807,...,0.106281,-0.362751,-0.019175,0.07567,-0.102656,0.239044,-0.051489,0.146251,0.091097,-0.376283
YAL009W,0.010493,0.026106,0.000793,-0.0379,-0.563301,-0.0383,-0.0545,0.094695,-0.1019,-0.0097,...,-0.1335,-0.126399,0.030195,-0.039399,0.026899,0.029107,0.014598,0.008501,-0.0918,0.460596
YAL011W,0.002983,-0.013205,-0.032533,0.150976,0.058067,0.131287,0.00389,0.024107,-0.00662,0.100009,...,-0.103237,0.43973,0.084214,0.172475,0.021039,-0.10219,0.013112,-0.171702,-0.112531,0.428689
YAL013W,0.040416,-0.000688,0.022872,0.132445,-0.159837,0.072435,0.006535,0.074437,0.04938,0.000635,...,0.259049,-0.198424,-0.012381,0.031423,-0.037473,-0.163698,-0.357487,-0.013937,0.038956,-0.088604
YAL015C,-0.004236,-0.06613,-0.000753,0.022545,0.111606,-0.043354,-0.039522,-0.034593,0.012554,0.249906,...,-0.005632,-0.021317,-0.049088,0.052764,0.048864,0.128399,-0.043583,0.026163,0.061638,-0.00532


# Prepare the final dataset

In [141]:
# Work on a copy so that original_data itself is left unchanged by the steps below.
data = original_data.copy()

In [142]:
# Look up the dataset_id of every ORF column of the data matrix, in column order.
datasets = datasets.reset_index().set_index('orf')
dataset_ids = datasets.reindex(index=original_data.columns.values)['dataset_id'].values

# reindex() silently yields NaN for ORFs that are absent from the datasets list;
# fail loudly here instead of letting NaN dataset ids become column labels.
missing_orfs = original_data.columns.values[pd.isnull(dataset_ids)]
if len(missing_orfs) > 0:
    raise ValueError('No dataset_id found for %d ORF column(s): %s'
                     % (len(missing_orfs), list(missing_orfs)))

In [143]:
# Restore dataset_id as the index of the dataset table (it was indexed by ORF for the lookup).
datasets = datasets.reset_index().set_index('dataset_id')

In [144]:
# Reorder the dataset table so its rows follow the column order of the data matrix.
datasets = datasets.reindex(index=dataset_ids)

In [145]:
# Label every data column with a (dataset_id, 'value') pair.
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id', 'data_type'])

In [146]:
# Preview the data with its (dataset_id, data_type) column labels.
data.head()

dataset_id,5658,5659,5660,5661,5662,5663,5664,5665,5666,5667,...,11750,11751,11752,11753,11754,11755,11756,11757,11758,7662
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL002W,0.007138,0.018591,0.058884,-0.410887,-2.123057,-0.458983,0.209915,-0.061517,-0.358149,-0.129807,...,0.106281,-0.362751,-0.019175,0.07567,-0.102656,0.239044,-0.051489,0.146251,0.091097,-0.376283
YAL009W,0.010493,0.026106,0.000793,-0.0379,-0.563301,-0.0383,-0.0545,0.094695,-0.1019,-0.0097,...,-0.1335,-0.126399,0.030195,-0.039399,0.026899,0.029107,0.014598,0.008501,-0.0918,0.460596
YAL011W,0.002983,-0.013205,-0.032533,0.150976,0.058067,0.131287,0.00389,0.024107,-0.00662,0.100009,...,-0.103237,0.43973,0.084214,0.172475,0.021039,-0.10219,0.013112,-0.171702,-0.112531,0.428689
YAL013W,0.040416,-0.000688,0.022872,0.132445,-0.159837,0.072435,0.006535,0.074437,0.04938,0.000635,...,0.259049,-0.198424,-0.012381,0.031423,-0.037473,-0.163698,-0.357487,-0.013937,0.038956,-0.088604
YAL015C,-0.004236,-0.06613,-0.000753,0.022545,0.111606,-0.043354,-0.039522,-0.034593,0.012554,0.249906,...,-0.005632,-0.021317,-0.049088,0.052764,0.048864,0.128399,-0.043583,0.026163,0.061638,-0.00532


## Subset to the genes currently in SGD

In [147]:
# Load the gene table and key it by systematic name so ORFs can be mapped to gene ids.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# One id per data row, in row order; ORFs without a match come back as NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [148]:
# Attach the gene ids and keep only the rows for which an id was found.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered frame: assigning into the result of a
# boolean .loc selection can raise SettingWithCopyWarning when rows are dropped.
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by the (gene_id, orf) pair.
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,5658,5659,5660,5661,5662,5663,5664,5665,5666,5667,...,11750,11751,11752,11753,11754,11755,11756,11757,11758,7662
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,0.007138,0.018591,0.058884,-0.410887,-2.123057,-0.458983,0.209915,-0.061517,-0.358149,-0.129807,...,0.106281,-0.362751,-0.019175,0.07567,-0.102656,0.239044,-0.051489,0.146251,0.091097,-0.376283
7,YAL009W,0.010493,0.026106,0.000793,-0.0379,-0.563301,-0.0383,-0.0545,0.094695,-0.1019,-0.0097,...,-0.1335,-0.126399,0.030195,-0.039399,0.026899,0.029107,0.014598,0.008501,-0.0918,0.460596
9,YAL011W,0.002983,-0.013205,-0.032533,0.150976,0.058067,0.131287,0.00389,0.024107,-0.00662,0.100009,...,-0.103237,0.43973,0.084214,0.172475,0.021039,-0.10219,0.013112,-0.171702,-0.112531,0.428689
11,YAL013W,0.040416,-0.000688,0.022872,0.132445,-0.159837,0.072435,0.006535,0.074437,0.04938,0.000635,...,0.259049,-0.198424,-0.012381,0.031423,-0.037473,-0.163698,-0.357487,-0.013937,0.038956,-0.088604
13,YAL015C,-0.004236,-0.06613,-0.000753,0.022545,0.111606,-0.043354,-0.039522,-0.034593,0.012554,0.249906,...,-0.005632,-0.021317,-0.049088,0.052764,0.048864,0.128399,-0.043583,0.026163,0.061638,-0.00532


# Normalize

In [152]:
# Normalize the scores with the normalize_phenotypic_scores helper (defined outside this notebook).
# NOTE(review): the meaning of has_tested=True is not visible here — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [153]:
# Assign proper column names: one (dataset_id, 'valuez') pair per column.
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id', 'data_type'])

In [154]:
# Blank out normalized entries wherever the raw value is missing.
data_vals = data.values
data_norm_vals = data_norm.values
missing_mask = np.isnan(data_vals)
data_norm_vals[missing_mask] = np.nan

In [155]:
# Rebuild the normalized frame from the masked array, keeping its row and column labels.
data_norm = pd.DataFrame(
    data_norm_vals,
    index=data_norm.index,
    columns=data_norm.columns,
)

In [156]:
# Put the raw ('value') and normalized ('valuez') columns side by side, joined on the
# shared row index, then preview the result.
data_all = data.join(data_norm)
data_all.head()

Unnamed: 0_level_0,dataset_id,5658,5659,5660,5661,5662,5663,5664,5665,5666,5667,...,11750,11751,11752,11753,11754,11755,11756,11757,11758,7662
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,0.007138,0.018591,0.058884,-0.410887,-2.123057,-0.458983,0.209915,-0.061517,-0.358149,-0.129807,...,0.730637,-1.472248,-0.210619,0.576925,-1.103797,1.737064,-0.475347,1.479204,0.96691,-1.06011
7,YAL009W,0.010493,0.026106,0.000793,-0.0379,-0.563301,-0.0383,-0.0545,0.094695,-0.1019,-0.0097,...,-0.920991,-0.559009,0.212193,-0.173279,0.336138,0.145925,0.116502,0.006472,-0.915274,1.368769
9,YAL011W,0.002983,-0.013205,-0.032533,0.150976,0.058067,0.131287,0.00389,0.024107,-0.00662,0.100009,...,-0.712535,1.628451,0.674824,1.208057,0.271005,-0.849185,0.103193,-1.920137,-1.128609,1.276166
11,YAL013W,0.040416,-0.000688,0.022872,0.132445,-0.159837,0.072435,0.006535,0.074437,0.04938,0.000635,...,1.782912,-0.837305,-0.152434,0.288458,-0.379327,-1.315363,-3.215746,-0.233424,0.430328,-0.225179
13,YAL015C,-0.004236,-0.06613,-0.000753,0.022545,0.111606,-0.043354,-0.039522,-0.034593,0.012554,0.249906,...,-0.040226,-0.152981,-0.466804,0.427588,0.580264,0.898475,-0.404549,0.195297,0.663748,0.016537


# Print out (write the `value` and `valuez` tables to file)

In [157]:
# Write one tab-separated table per data type, with dataset names as column
# headers and ORFs as the only row index.
for data_type in ['value', 'valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(f'{paper_name}_{data_type}.txt', sep='\t')

# Save to DB (upload currently commented out) and export the long-format table

In [162]:
# from IO.save_data_to_db3 import *

In [163]:
# save_data_to_db(data_all, paper_pmid)

In [170]:
# Reshape the raw values to long format: one row per (gene_id, dataset_id) pair.
data_by_gene = data.droplevel('orf', axis=0).reset_index()
data_long = pd.melt(data_by_gene, id_vars=['gene_id'], col_level='dataset_id')

In [172]:
# Reshape the normalized values to long format: one row per (gene_id, dataset_id) pair.
data_norm_by_gene = data_norm.droplevel('orf', axis=0).reset_index()
data_norm_long = pd.melt(data_norm_by_gene, id_vars=['gene_id'], col_level='dataset_id')

In [174]:
# Pair each raw value with its normalized counterpart via the shared keys.
data_long = pd.merge(data_long, data_norm_long, on=['gene_id', 'dataset_id'])

In [176]:
# Rename the columns positionally: the two key columns, then the raw value (left side of
# the merge) and the normalized value (right side).
data_long.columns = ['gene_id','dataset_id','value','valuez']

In [177]:
# Write the long-format table without the row index. The file name is built from
# paper_name, like the wide-format exports, instead of repeating the literal.
data_long.to_csv(paper_name + '_long.txt', sep='\t', index=False)