In [1]:
# Execute the shared helper script and load its names into this kernel.
# The cells below use pd, np, clean_orf, translate_sc, looks_like_orf,
# path_to_genes and normalize_phenotypic_scores without defining them, so they
# are expected to come from here (NOTE(review): confirm against yp_utils.py).
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the source publication. paper_pmid is used to build the name
# of the dataset list file, paper_name to build the names of the output files.
paper_pmid = 30016095
paper_name = 'prescott_hoepfner_2018'

In [3]:
# Tab-separated, header-less list of the datasets that belong to this paper;
# the two columns are named explicitly on load.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by dataset ID so it can be reordered by ID further down.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Read the HIP-HOP score sheet from the raw Excel workbook.
original_data = pd.read_excel(
    'raw_data/Gossypol HIP-HOP Scores.xlsx',
    sheet_name='Gossypol HIP-HOP Scores',
)

In [6]:
# Report the size of the raw table as rows x columns.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 30597 x 12


In [7]:
# Preview the first rows to check that the columns were parsed as expected.
original_data.head()

Unnamed: 0,COMPOUND,COMPOUND_CONCENTRATION,CONCENTRATION,EXPERIMENT,EXPERIMENT_NUMBER_HIPHOP,EXPERIMENT_TYPE,GENE_NAME,SCORE,SCORE_TYPE,SYSTEMATIC_NAME,VIABILITY,Z_SCORE
0,1029,1029_200,200,1029_200_0062,0061/62,HIP,TFC3,0.898556,Adj.,YAL001C,inviable,1.122255
1,1029,1029_200,200,1029_200_0062,0061/62,HIP,VPS8,0.494657,Adj.,YAL002W,viable,0.278998
2,1029,1029_200,200,1029_200_0062,0061/62,HIP,EFB1,0.101555,Adj.,YAL003W,inviable,0.100563
3,1029,1029_200,200,1029_200_0062,0061/62,HIP,YAL004W,-1.941799,Adj.,YAL004W,viable,-1.046383
4,1029,1029_200,200,1029_200_0062,0061/62,HIP,SSA1,-0.393975,Adj.,YAL005C,viable,-0.046586


In [8]:
# Start the 'orf' column from the systematic names, forced to strings so the
# string-based cleaning helpers can be applied to every entry.
systematic_names = original_data['SYSTEMATIC_NAME'].astype(str)
original_data['orf'] = systematic_names

In [9]:
# Normalize the ORF strings with the shared clean_orf helper
# (per the original note: remove all white space and capitalize).
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [10]:
# Translate the identifiers to ORF names with the shared translate_sc helper.
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [11]:
# Sanity check: show every row whose identifier does not look like an ORF.
# An empty frame means that all identifiers translated cleanly.
is_valid_orf = looks_like_orf(original_data['orf'])
invalid_rows = original_data.loc[~is_valid_orf]
print(invalid_rows)

Empty DataFrame
Columns: [COMPOUND, COMPOUND_CONCENTRATION, CONCENTRATION, EXPERIMENT, EXPERIMENT_NUMBER_HIPHOP, EXPERIMENT_TYPE, GENE_NAME, SCORE, SCORE_TYPE, SYSTEMATIC_NAME, VIABILITY, Z_SCORE, orf]
Index: []


In [12]:
# Use the Z_SCORE column as the phenotype value carried through the pipeline.
original_data['data'] = original_data.loc[:, 'Z_SCORE']

In [13]:
# Average the values per (EXPERIMENT, orf) pair. Per the original note, grouping
# on EXPERIMENT keeps HIP, HOP and the different concentrations separate.
data = original_data.groupby(['EXPERIMENT', 'orf'])[['data']].mean()

In [14]:
# Turn the (EXPERIMENT, orf) group keys back into ordinary columns so that the
# table can be pivoted on them.
data = data.reset_index()

In [15]:
# Reshape to wide form: one row per ORF, one column per experiment.
data2 = data.pivot_table(index='orf', columns='EXPERIMENT', values='data')

In [16]:
# Preview the wide ORF x experiment table.
data2.head()

EXPERIMENT,1029_200_0061,1029_200_0062,1029_300_0061,1029_300_0062,1029_400_0061,1029_400_0062
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
YAL001C,0.198284,1.122255,0.272643,0.405437,0.164446,0.42558
YAL002W,-0.159038,0.278998,-0.086617,0.610291,0.084285,-0.040042
YAL003W,,0.100563,,1.321273,,0.41278
YAL004W,-0.014224,-1.046383,-0.2815,-0.067075,-0.291025,-0.20004
YAL005C,0.430502,-0.046586,0.276462,-0.139995,0.899274,-0.057484


In [35]:
# Show the ten ORFs with the lowest values in experiment 1029_400_0061.
data2.sort_values('1029_400_0061').head(10)

EXPERIMENT,1029_200_0061,1029_200_0062,1029_300_0061,1029_300_0062,1029_400_0061,1029_400_0062
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
YER145C,-11.570694,-1.706586,-14.414954,-0.126122,-13.699457,-0.02799
YMR058W,-9.714442,-0.064391,-11.285044,-0.020797,-10.422118,-0.788666
YDR269C,-9.764539,-0.038647,-10.599459,-0.038812,-10.181572,-0.006305
YDR271C,-9.007026,0.007797,-9.158221,0.03993,-8.68783,-0.51913
YDR270W,-8.17793,-1.905951,-9.279917,-1.82979,-8.237444,-0.675227
YJR040W,-5.953909,-0.374985,-6.092526,-0.036112,-6.159876,0.006227
YMR057C,-6.197959,-0.464511,-5.633957,-0.604115,-5.928065,-0.659612
YJR079W,-4.908249,-3.263048,-5.667249,-3.129684,-4.9763,-2.787767
YDR419W,-0.822547,-0.652241,-2.395708,-0.091211,-4.782011,-0.150709
YMR123W,-2.399328,0.16577,-1.331341,0.027795,-4.115391,0.18149


# Prepare the final dataset

In [22]:
# Continue on a copy so that data2 keeps its experiment-labelled columns.
data = data2.copy()

In [23]:
# Dataset IDs for the pivoted columns, then the dataset list reordered to match.
# NOTE(review): the column relabelling that follows is positional, so these IDs
# must be listed in the same order as data.columns -- confirm the mapping of
# IDs to experiments.
dataset_ids = [22255, 22256, 22237, 22253, 22254, 22252]
datasets = datasets.reindex(dataset_ids)

In [24]:
# Relabel the columns with a two-level (dataset_id, data_type) index; every
# column is tagged 'value' at this stage.
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * n_datasets],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [25]:
# Preview the table with its new (dataset_id, data_type) column labels.
data.head()

dataset_id,22255,22256,22237,22253,22254,22252
data_type,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
YAL001C,0.198284,1.122255,0.272643,0.405437,0.164446,0.42558
YAL002W,-0.159038,0.278998,-0.086617,0.610291,0.084285,-0.040042
YAL003W,,0.100563,,1.321273,,0.41278
YAL004W,-0.014224,-1.046383,-0.2815,-0.067075,-0.291025,-0.20004
YAL005C,0.430502,-0.046586,0.276462,-0.139995,0.899274,-0.057484


## Subset to the genes currently in SGD

In [26]:
# Load the reference gene table and key it by systematic name, keeping the
# original 'id' index as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the ID of every ORF in the data; ORFs absent from the table get NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 10


In [27]:
# Attach the gene IDs and keep only the ORFs that have one.
data['gene_id'] = gene_ids
# The explicit .copy() makes the filtered frame independent of the unfiltered
# one, so the column assignment below cannot raise SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# With the missing entries removed, the IDs can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id', 'orf'])

data.head()

Unnamed: 0_level_0,dataset_id,22255,22256,22237,22253,22254,22252
Unnamed: 0_level_1,data_type,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,YAL001C,0.198284,1.122255,0.272643,0.405437,0.164446,0.42558
2,YAL002W,-0.159038,0.278998,-0.086617,0.610291,0.084285,-0.040042
3,YAL003W,,0.100563,,1.321273,,0.41278
1863,YAL004W,-0.014224,-1.046383,-0.2815,-0.067075,-0.291025,-0.20004
4,YAL005C,0.430502,-0.046586,0.276462,-0.139995,0.899274,-0.057484


# Normalize

In [28]:
# Compute the normalized scores with the shared helper, which is defined
# outside this notebook (NOTE(review): confirm what has_tested=True controls).
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [29]:
# Label the normalized columns with the same dataset IDs as the raw ones, but
# tagged 'valuez' so that both sets can live side by side in one table.
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * n_datasets],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [30]:
# Propagate missing raw values into the normalized table.
# A DataFrame mask is aligned on row AND column labels before it is applied.
# The raw columns are tagged 'value' and the normalized ones 'valuez', so the
# unmodified mask would match no column and leave data_norm untouched. Giving
# the mask data_norm's column labels (same datasets, same order) makes it apply.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Put raw and normalized values side by side, matched on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,22255,22256,22237,22253,22254,22252,22255,22256,22237,22253,22254,22252
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1,YAL001C,0.198284,1.122255,0.272643,0.405437,0.164446,0.42558,0.18354,1.30724,0.219371,0.445635,0.13935,0.500238
2,YAL002W,-0.159038,0.278998,-0.086617,0.610291,0.084285,-0.040042,-0.229258,0.324556,-0.182665,0.681779,0.053103,-0.057823
3,YAL003W,,0.100563,,1.321273,,0.41278,,0.116618,,1.501362,,0.484896
1863,YAL004W,-0.014224,-1.046383,-0.2815,-0.067075,-0.291025,-0.20004,-0.06196,-1.219968,-0.400752,-0.099052,-0.350703,-0.249584
4,YAL005C,0.430502,-0.046586,0.276462,-0.139995,0.899274,-0.057484,0.451812,-0.054862,0.223644,-0.18311,0.929971,-0.078727


# Print out

In [32]:
# Write one tab-separated file per data type ('value' and 'valuez'), with the
# columns labelled by dataset name and the rows labelled by ORF only.
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    output_path = paper_name + '_' + data_type + '.txt'
    table.droplevel('gene_id', axis=0).to_csv(output_path, sep='\t')