In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used below to build the datasets-list file name.
paper_pmid = 37115757
# Short label for the paper; used as the prefix of the output file names.
paper_name = 'saeki_moriya_2023'

In [3]:
# Read the datasets list for this paper: tab-separated, no header row,
# two columns that are named here as 'dataset_id' and 'name'.
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset ID so it can be reindexed by ID later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [26]:
# Load the raw scores from the named sheet of the supplementary workbook.
original_data = pd.read_excel(
    'raw_data/Large_Data.xlsx',
    sheet_name='Fig.5C-I (Z)',
)

In [27]:
# Report (rows x columns) of the table as loaded; .shape is already a 2-tuple.
print('Original data dimensions: %d x %d' % original_data.shape)

Original data dimensions: 4321 x 12


In [28]:
# Preview the first rows of the raw table to check the column layout.
original_data.head()

Unnamed: 0.1,Unnamed: 0,Z_YPD,Z_Na,Z_NaCa,pvalues (Na vs NaCa),pvalues (YPD vs Na),pvalues (YPD vs NaCa),qvalues (Na vs NaCa),qvalues (YPD vs Na),qvalues (YPD vs NaCa),Fig.5C,Fig.5F
0,YAL001C,-0.737584,-1.594022,-1.650956,0.234157,0.122482,0.008652,0.280664,0.183893,0.022852,False,False
1,YAL004W,0.909522,0.331849,0.905389,0.0423,0.004469,0.096406,0.076285,0.023961,0.13275,False,False
2,YAL005C,0.520592,-0.295495,0.176687,0.635032,0.000547,0.013318,0.672872,0.007504,0.030611,False,False
3,YAL008W,0.23982,-8.1e-05,-0.719041,0.005399,0.138748,0.004811,0.032359,0.201184,0.015871,False,False
4,YAL011W,-0.268009,-1.036829,-0.98785,0.030444,0.001502,0.001955,0.064676,0.012557,0.009621,False,False


In [29]:
# The gene identifiers live in the unnamed first column of the sheet;
# copy them, as strings, into a dedicated 'orf' column.
original_data = original_data.assign(
    orf=original_data['Unnamed: 0'].astype(str)
)

In [30]:
# Normalize the identifiers with the shared clean_orf helper
# (per the original note: strip all white space and capitalize).
original_data = original_data.assign(
    orf=clean_orf(original_data['orf'])
)

In [31]:
# Map every identifier to its ORF form using the shared translate_sc helper.
original_data = original_data.assign(
    orf=translate_sc(original_data['orf'], to='orf')
)

In [32]:
# Sanity check on the translation: flag each identifier with looks_like_orf,
# then print the rows that were NOT flagged. An empty frame means all is well.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t, :])

Empty DataFrame
Columns: [Unnamed: 0, Z_YPD, Z_Na, Z_NaCa, pvalues (Na vs NaCa), pvalues (YPD vs Na), pvalues  (YPD vs NaCa), qvalues (Na vs NaCa), qvalues  (YPD vs Na), qvalues  (YPD vs NaCa), Fig.5C, Fig.5F, orf]
Index: []


In [33]:
# Keep only the identifier and the three Z-score columns; the p-value,
# q-value and figure-flag columns are not used further.
original_data = original_data.loc[:, ['orf', 'Z_YPD', 'Z_Na', 'Z_NaCa']]

In [34]:
# Index the table by ORF.
original_data = original_data.set_index('orf')

In [35]:
# Express each treatment score relative to the YPD score (treatment minus YPD).
original_data['Z_Na_YPD'] = original_data['Z_Na'].sub(original_data['Z_YPD'])
original_data['Z_NaCa_YPD'] = original_data['Z_NaCa'].sub(original_data['Z_YPD'])

In [36]:
# Collapse rows that share the same index value (the ORF) by averaging them.
original_data = original_data.groupby(level=0).mean()

In [37]:
# Dimensions after averaging rows that share an ORF.
original_data.shape

(4200, 5)

In [39]:
# Keep the YPD score and the two YPD-relative scores, in this order.
# The order matters: dataset IDs are attached to these columns by position later.
original_data = original_data.loc[:, ['Z_YPD', 'Z_Na_YPD', 'Z_NaCa_YPD']]

# Prepare the final dataset

In [40]:
# Work on an independent (deep) copy so the processed source table stays untouched.
data = original_data.copy(deep=True)

In [41]:
# Dataset IDs, listed in the same order as the columns of `data`
# (the IDs are assigned to those columns by position further down).
dataset_ids = [22249, 22250, 22251]
# Restrict and reorder the datasets table to exactly these IDs.
datasets = datasets.reindex(dataset_ids)

In [42]:
# Relabel the columns with a two-level header: (dataset_id, 'value'),
# pairing each dataset ID with the columns of `data` by position.
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [43]:
# Preview the relabelled table (columns are now keyed by dataset_id / data_type).
data.head()

dataset_id,22249,22250,22251
data_type,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
YAL001C,-0.737584,-0.856438,-0.913372
YAL004W,0.909522,-0.577673,-0.004133
YAL005C,0.520592,-0.816086,-0.343904
YAL008W,0.23982,-0.239902,-0.958861
YAL011W,-0.268009,-0.76882,-0.719841


## Subset to the genes currently in SGD

In [44]:
# Load the gene table and re-key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene ID of every ORF in `data`; ORFs absent from the table yield NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 20


In [45]:
# Attach the looked-up gene IDs (row order matches: they were computed from data.index).
data['gene_id'] = gene_ids
# Drop the ORFs that have no gene ID.
data = data.loc[data['gene_id'].notna()]
# With the NaNs gone, the IDs can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by the (gene_id, orf) pair.
data = data.reset_index().set_index(['gene_id', 'orf'])

data.head()

Unnamed: 0_level_0,dataset_id,22249,22250,22251
Unnamed: 0_level_1,data_type,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,YAL001C,-0.737584,-0.856438,-0.913372
1863,YAL004W,0.909522,-0.577673,-0.004133
4,YAL005C,0.520592,-0.816086,-0.343904
6,YAL008W,0.23982,-0.239902,-0.958861
9,YAL011W,-0.268009,-0.76882,-0.719841


# Normalize

In [46]:
# Normalize the scores with the shared helper.
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook;
# the exact meaning of has_tested is not visible here — confirm before changing it.
data_norm = normalize_phenotypic_scores(
    data,
    has_tested=True,
)

In [47]:
# Label the normalized columns as (dataset_id, 'valuez'), pairing each
# dataset ID with the columns of `data_norm` by position.
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [48]:
# Restore missing values: wherever the raw score is NaN, the normalized score
# must be NaN as well.
# The mask is relabelled with data_norm's columns before it is applied.
# Assignment through a boolean DataFrame aligns the mask to the target by
# row AND column label; the raw columns are labelled 'value' while the
# normalized ones are labelled 'valuez', so an un-relabelled mask matches no
# cell and the assignment silently does nothing.
# NOTE(review): assumes data_norm keeps the same column order as data —
# confirm against normalize_phenotypic_scores.
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns
data_norm[nan_mask] = np.nan

# Put the raw ('value') and normalized ('valuez') columns side by side,
# joined on the shared row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,22249,22250,22251,22249,22250,22251
Unnamed: 0_level_1,data_type,value,value,value,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,YAL001C,-0.737584,-0.856438,-0.913372,-0.958677,-0.495179,-1.433517
1863,YAL004W,0.909522,-0.577673,-0.004133,0.639026,-0.248104,-0.284936
4,YAL005C,0.520592,-0.816086,-0.343904,0.261761,-0.459415,-0.714146
6,YAL008W,0.23982,-0.239902,-0.958861,-0.010589,0.05127,-1.49098
9,YAL011W,-0.268009,-0.76882,-0.719841,-0.503186,-0.417522,-1.189042


# Print out

In [49]:
# Write one tab-separated file per data type: '<paper_name>_value.txt'
# and '<paper_name>_valuez.txt'.
for data_type in ['value', 'valuez']:
    # Select the columns of this data type (the 'data_type' level is dropped).
    out = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Rows are written keyed by ORF only.
    out = out.droplevel('gene_id', axis=0)
    # Replace the dataset IDs in the header with the dataset names (by position).
    out.columns = datasets['name'].values
    out.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')