In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper processed by this notebook
paper_pmid: int = 32469861  # PMID; used to locate the dataset list and to save to the DB
paper_name: str = 'liu_liu_2020'  # prefix of the exported text files

In [8]:
# Tab-separated, header-less list of (dataset_id, name) pairs for this paper
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [9]:
# Index the dataset list by its id so rows can later be selected by dataset_id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [24]:
# Read the 'Combined data' sheet of the raw spreadsheet, skipping its first two rows
original_data = pd.read_excel(
    'raw_data/journal.pgen.1008798.s007.xlsx',
    sheet_name='Combined data',
    skiprows=2,
)

In [25]:
# The sheet holds three side-by-side (rank, ORF, third column) blocks. Give every
# column a unique label: the previous empty-string labels were duplicated three
# times, which makes label-based selection of those columns ambiguous.
# NOTE(review): the third column of each block appears to hold a gene name
# (e.g. CAC2, FAR10) - confirm against the spreadsheet.
original_data.columns = [
    'rank1', 'orf1', 'name1',
    'rank2', 'orf2', 'name2',
    'rank3', 'orf3', 'name3',
]

In [26]:
# Preview the first rows to check the column labels assigned above
original_data.head()

Unnamed: 0,rank1,orf1,Unnamed: 3,rank2,orf2,Unnamed: 6,rank3,orf3,Unnamed: 9
0,1.0,YLR428C,YLR428C,47.0,YPR039W,YPR039W,93.0,YOR008C,SLG1
1,2.0,YML048W-A,YML048W-A,48.0,YLR238W,FAR10,94.0,YOR034C,AKR2
2,3.0,YML102W,CAC2,49.0,YBR085W,AAC3,95.0,YOR041C,YOR041C
3,4.0,YML102C-A,YML102C-A,50.0,YDR269C,YDR269C,96.0,YOR124C,UBP2
4,5.0,YML108W,YML108W,51.0,YDR493W,MZM1,97.0,YOR121C,YOR121C


In [27]:
# Stack the three rank columns into one long Series with a fresh 0..n-1 index
rank_columns = [original_data[label] for label in ('rank1', 'rank2', 'rank3')]
ranks = pd.concat(rank_columns, axis=0, ignore_index=True)

In [28]:
# Stack the three ORF columns the same way, so positions line up with `ranks`
orf_columns = [original_data[label] for label in ('orf1', 'orf2', 'orf3')]
orfs = pd.concat(orf_columns, axis=0, ignore_index=True)

In [29]:
# Pair each rank with its ORF by position; the suffixes disambiguate the two
# identically named columns (yielding '0_rank' and '0_orf')
rank_frame = ranks.to_frame()
orf_frame = orfs.to_frame()
hit_data = rank_frame.join(orf_frame, how='outer', lsuffix='_rank', rsuffix='_orf')

In [30]:
# Preview the combined rank/ORF table
hit_data.head()

Unnamed: 0,0_rank,0_orf
0,1.0,YLR428C
1,2.0,YML048W-A
2,3.0,YML102W
3,4.0,YML102C-A
4,5.0,YML108W


In [31]:
# Cast the ORF column to str before cleaning (missing cells become the text 'nan')
hit_orfs_as_text = hit_data['0_orf'].astype(str)
hit_data['0_orf'] = hit_orfs_as_text

In [32]:
# Strip all white space and upper-case the identifiers (done by the clean_orf helper)
cleaned_hit_orfs = clean_orf(hit_data['0_orf'])
hit_data['0_orf'] = cleaned_hit_orfs

In [33]:
# Translate the cleaned identifiers to ORFs and keep them in a new 'orfs' column
translated_hit_orfs = translate_sc(hit_data['0_orf'], to='orf')
hit_data['orfs'] = translated_hit_orfs

In [34]:
# Verify the translation: `t` marks entries that look like ORFs; print the rest
t = looks_like_orf(hit_data['orfs'])
rows_not_orf = hit_data.loc[~t,]
print(rows_not_orf)

             0_rank 0_orf orfs
index_input                   
46              NaN   ORF  ORF
98              NaN   ORF  ORF
193             NaN   ORF  ORF
245             NaN   ORF  ORF
340             NaN   ORF  ORF
392             NaN   ORF  ORF
419             NaN   NAN  NAN
420             NaN   NAN  NAN
421             NaN   NAN  NAN
422             NaN   NAN  NAN
423             NaN   NAN  NAN
424             NaN   NAN  NAN
425             NaN   NAN  NAN
426             NaN   NAN  NAN
427             NaN   NAN  NAN
428             NaN   NAN  NAN
429             NaN   NAN  NAN
430             NaN   NAN  NAN
431             NaN   NAN  NAN
432             NaN   NAN  NAN
433             NaN   NAN  NAN
434             NaN   NAN  NAN
435             NaN   NAN  NAN
436             NaN   NAN  NAN
437             NaN   NAN  NAN
438             NaN   NAN  NAN
439             NaN   NAN  NAN
440             NaN   NAN  NAN


In [35]:
# Keep only the rows whose identifier looks like an ORF (drops the repeated 'ORF'
# header rows and the empty trailing cells printed above). Take an explicit copy:
# later cells add a column to this frame, and writing into a .loc slice triggers
# pandas' SettingWithCopyWarning.
hit_data = hit_data.loc[t, :].copy()

In [36]:
# Use the translated ORFs as the row index and call that index 'orf'
hit_data = hit_data.set_index('orfs')
hit_data.index = hit_data.index.rename('orf')

In [37]:
# Every listed ORF is a hit: record it with the value 1
hit_data = hit_data.assign(data=1)

In [38]:
# Collapse ORFs listed more than once by averaging their numeric columns.
# numeric_only=True is required because the frame still carries the string column
# '0_orf': pandas >= 2.0 raises a TypeError when asked for the mean of strings,
# whereas older versions dropped such columns silently (hence the 2 columns in the
# recorded output).
hit_data = hit_data.groupby(hit_data.index).mean(numeric_only=True)

In [39]:
# Number of unique hit ORFs (rows) and remaining columns after collapsing duplicates
hit_data.shape

(413, 2)

# Load & process tested strains

In [40]:
# Read the 'Combined data' sheet of the SGA scoring file listing the screened strains
tested_strains = pd.read_excel(
    'raw_data/Original data after SGA Scoring sorted.xls',
    sheet_name='Combined data',
)

In [41]:
# Cast the array ORF column to str before cleaning
array_orfs_as_text = tested_strains['Array ORF'].astype(str)
tested_strains['Array ORF'] = array_orfs_as_text

In [42]:
# Strip all white space and upper-case the identifiers (done by the clean_orf helper)
cleaned_array_orfs = clean_orf(tested_strains['Array ORF'])
tested_strains['Array ORF'] = cleaned_array_orfs

In [43]:
# Translate the cleaned identifiers to ORFs and keep them in a new 'orfs' column
translated_array_orfs = translate_sc(tested_strains['Array ORF'], to='orf')
tested_strains['orfs'] = translated_array_orfs

In [44]:
# Verify the translation: `t` marks entries that look like ORFs; print the rest
t = looks_like_orf(tested_strains['orfs'])
tested_not_orf = tested_strains.loc[~t,]
print(tested_not_orf)

Empty DataFrame
Columns: [Query ORF, Query Name, Array ORF, Array Name, Array annotation, Normalized colony size (EXPERIMENT), Normalized colony std. dev. (EXPERIMENT), Normalized colony size (CONTROL), Normalized colony std. dev. (CONTROL), Score, Score stdev, p-Value, Additional information, orfs]
Index: []


In [45]:
# Distinct ORFs among the tested strains, in order of first appearance
tested_orf_column = tested_strains['orfs']
tested = tested_orf_column.unique()

In [46]:
# Hits that do not appear among the tested strains. Membership is checked against
# a set: `in` on the numpy array returned by .unique() scans the whole array for
# every hit.
tested_lookup = set(tested)
missing = [orf for orf in hit_data.index.values if orf not in tested_lookup]

In [47]:
# Show the hits absent from the tested-strain list (an empty list means none are absent)
missing

[]

# Prepare the final dataset

In [68]:
# Ids of the datasets produced by this notebook (rows of the dataset list to keep)
dataset_ids: list = [16543]

In [69]:
# Restrict and order the dataset table by dataset_ids
# (an id absent from the table would yield an all-NaN row rather than an error)
datasets = datasets.reindex(dataset_ids, axis=0)

In [70]:
# Start from an all-zero matrix: one row per tested ORF, one column per dataset
dataset_names = datasets['name'].values
data = pd.DataFrame(data=0, index=tested, columns=dataset_names)

In [71]:
# Write the hit values into the first (only) dataset column, aligned on ORF
first_dataset_name = datasets['name'].values[0]
data.loc[hit_data.index, first_dataset_name] = hit_data['data']

In [72]:
# Average rows that share the same ORF label so each ORF appears exactly once
rows_by_orf = data.groupby(data.index)
data = rows_by_orf.mean()

In [73]:
# Label the row index as 'orf'
data.index = data.index.rename('orf')

In [74]:
# Replace the column labels with (dataset_id, 'value') pairs, one per dataset
value_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_labels, names=['dataset_id','data_type'])

In [75]:
# Report the size of the assembled matrix (rows x columns)
n_rows, n_cols = data.shape
print('Final data dimensions: %d x %d' % (n_rows, n_cols))

Final data dimensions: 4268 x 1


## Subset to the genes currently in SGD

In [76]:
# Load the gene table and key it by systematic name, keeping 'id' as a column
genes_by_id = pd.read_csv(path_to_genes, sep='\t', index_col='id')
genes = genes_by_id.reset_index().set_index('systematic_name')

# Gene id for every ORF in the data, in row order; ORFs not in the table give NaN
matched_genes = genes.reindex(index=data.index.values)
gene_ids = matched_genes['id'].values

num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [77]:
# Dimensions of the data matrix, for comparison with the gene id array
data.shape

(4268, 1)

In [78]:
# Length of the gene id array; it must equal the number of rows in data
gene_ids.shape

(4268,)

In [79]:
# Attach the gene ids, drop ORFs without one, and index rows by (gene_id, orf)
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the assignment on the next line
# does not write into a slice of the unfiltered frame (SettingWithCopyWarning)
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were held as floats only because of the NaNs that have just been removed
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

In [80]:
# Preview the data, now indexed by (gene_id, orf)
data.head()

Unnamed: 0_level_0,dataset_id,16543
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,0
1863,YAL004W,0
4,YAL005C,0
5,YAL007C,0
6,YAL008W,0


# Normalize

In [81]:
# Normalize the scores with the shared helper.
# NOTE(review): has_tested=True presumably tells the helper that the matrix covers
# all tested strains, not only the hits - confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [82]:
# Assign proper column names: (dataset_id, 'valuez') pairs, one per dataset
valuez_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_labels, names=['dataset_id','data_type'])

In [83]:
# Blank the normalized score wherever the raw value is missing.
# A boolean-frame mask is aligned on labels, and the columns of `data` are labelled
# (dataset_id, 'value') while those of `data_norm` are (dataset_id, 'valuez'); used
# directly, the mask matches no column and the assignment silently does nothing.
# Relabel the mask with data_norm's columns first (rows still align on the index).
# NOTE(review): assumes both frames list the datasets in the same column order.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

In [84]:
# Put the raw ('value') and normalized ('valuez') columns side by side,
# aligned on the shared row index
data_all = data.join(data_norm)

In [85]:
# Preview the combined raw + normalized table
data_all.head()

Unnamed: 0_level_0,dataset_id,16543,16543
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0,0.0
1863,YAL004W,0,0.0
4,YAL005C,0,0.0
5,YAL007C,0,0.0
6,YAL008W,0,0.0


# Print out

In [86]:
# Write one tab-separated file per data type, with dataset names as column headers
# and ORFs as the only row label
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    output_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(output_path, sep='\t')

# Save to DB

In [87]:
# Import only the function used below instead of `import *`, so the notebook
# namespace is not silently overwritten by whatever else the module defines.
# NOTE(review): assumes save_data_to_db is defined in IO.save_data_to_db3 - confirm.
from IO.save_data_to_db3 import save_data_to_db

In [89]:
# Store the combined table in the database under this paper's PMID.
# Per the recorded log, existing datasets for the PMID are deleted before the insert.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 32469861...
Inserting the new data...


100%|██████████| 1/1 [00:06<00:00,  6.59s/it]
