In [3]:
%run ../../Utils/yp_utils.py

# Initial setup

In [4]:
# PubMed ID of the paper; used below to locate the dataset list and to key the DB save.
paper_pmid = 32548177
# Short paper label; used below as the prefix of the exported text files.
paper_name = 'edouarzin_vediyappan_2020'

In [5]:
# Load the dataset list for this paper: a headerless, tab-separated file whose
# two columns are read as (dataset_id, name).
datasets_file = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_file,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [6]:
# Index the dataset list by dataset ID so it can be reordered by ID later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read the supplementary spreadsheet. The first two rows of the sheet are skipped,
# so the third row supplies the column headers.
original_data = pd.read_excel(
    'raw_data/mic-07-146-s02.xls',
    sheet_name='12_27_11_15_06_18_nQuantile_nq',
    skiprows=2,
)

In [9]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5802 x 17


In [10]:
# Preview the first rows of the freshly loaded table.
original_data.head()

Unnamed: 0,probeid,21_12_2011_T11C4_EP6_2.5,21_12_2011_T11C5_EP9_1.25,21_12_2011_T11C6_EP9_1.875,log2(REF),(2) (Albicanol) EP6_0.05mg/ml,(1) (Drimenol) EP9_0.025mg/ml,(1) ( Drimenol) EP9_0.0375mg/ml,gene,essential_gene,feature_type,GO_process,GO_function,GO_component,notes,Cross_Del_Count,Cross_Dels
0,YBR171W::CHR2_3,1.023,4.589,4.321,10.889,1.023,4.589,4.321,SEC66,no,ORF::Verified,filamentous growth*,protein transporter activity,endoplasmic reticulum membrane*,,0.0,
1,YHR060W::CHR8_2,0.966,3.352,1.055,10.118,0.966,3.352,1.055,VMA22,no,ORF::Verified,protein complex assembly*,unfolded protein binding,endoplasmic reticulum membrane,,0.0,
2,YBR289W::CHR00_16A,1.526,3.234,0.549,9.856,1.526,3.234,0.549,SNF5,no,ORF::Verified,chromatin remodeling*,general RNA polymerase II transcription factor...,SWI/SNF complex*,,0.0,
3,YMR309C::CHR13_5,1.091,2.992,0.354,9.9,1.091,2.992,0.354,NIP1,yes,ORF::Verified,translational initiation,translation initiation factor activity,cytoplasm*,,0.0,
4,YKL119C::CHR11_2,0.375,2.91,2.797,9.622,0.375,2.91,2.797,VPH2,no,ORF::Verified,protein complex assembly*,molecular function unknown,endoplasmic reticulum membrane,,1.0,YKL118W Dubious antisense 92 nt 1-92 of 312 bases


In [11]:
# Seed the 'orf' column from the probe ID, cast to str so that string
# operations can be applied to every row in the following cells.
original_data['orf'] = original_data['probeid'].astype(str)

In [12]:
# Keep only the text before the first ':' (e.g. 'YBR171W::CHR2_3' -> 'YBR171W').
original_data['orf'] = original_data['orf'].str.split(':').str[0]

In [13]:
# Normalize the ORF strings with the shared clean_orf helper (not defined in
# this notebook); per the original note it eliminates all white spaces and capitalizes.
original_data['orf'] = clean_orf(original_data['orf'])

In [14]:
# Translate the identifiers to ORF names with the shared translate_sc helper
# (not defined in this notebook).
# NOTE(review): presumably maps gene names/aliases to systematic ORF names — confirm against the helper.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [15]:
# Make sure everything translated ok: print every row whose 'orf' value is
# rejected by looks_like_orf (an empty frame means nothing was rejected).
orf_ok = looks_like_orf(original_data['orf'])
print(original_data.loc[~orf_ok])

Empty DataFrame
Columns: [probeid, 21_12_2011_T11C4_EP6_2.5, 21_12_2011_T11C5_EP9_1.25, 21_12_2011_T11C6_EP9_1.875, log2(REF), (2) (Albicanol) EP6_0.05mg/ml, (1) (Drimenol) EP9_0.025mg/ml, (1) ( Drimenol) EP9_0.0375mg/ml, gene, essential_gene, feature_type, GO_process, GO_function, GO_component, notes, Cross_Del_Count, Cross_Dels, orf]
Index: []


In [16]:
# Columns of the spreadsheet that hold the phenotype values to extract.
# The labels are copied verbatim from the sheet (including the stray space in
# the last one), since they must match the column headers exactly.
data_cols = [
    '(2) (Albicanol) EP6_0.05mg/ml',
    '(1) (Drimenol) EP9_0.025mg/ml',
    '(1) ( Drimenol) EP9_0.0375mg/ml',
]

In [17]:
# Use the ORF as the row index (it may contain duplicates at this point).
original_data = original_data.set_index('orf')

In [18]:
# Split the value columns by the 'essential_gene' flag:
#   original_data1 <- rows flagged 'no', original_data2 <- rows flagged 'yes'.
# Rows carrying any other value in that column end up in neither frame.
essential_flag = original_data['essential_gene']
original_data1 = original_data.loc[essential_flag.eq('no'), data_cols].copy()
original_data2 = original_data.loc[essential_flag.eq('yes'), data_cols].copy()

In [19]:
# Coerce every value column to numeric; anything unparseable becomes NaN.
# Applied column-wise (the default axis) rather than row-wise: the coercion is
# element-wise either way, but this calls pd.to_numeric once per column
# instead of once per row.
original_data1 = original_data1.apply(pd.to_numeric, errors='coerce')
original_data2 = original_data2.apply(pd.to_numeric, errors='coerce')

In [20]:
# Collapse rows that share the same ORF (the row index) into a single row by
# averaging each value column.
original_data1 = original_data1.groupby(level=0).mean()
original_data2 = original_data2.groupby(level=0).mean()

In [21]:
# (unique ORFs, value columns) among the rows flagged essential_gene == 'no'.
original_data1.shape

(4629, 3)

In [22]:
# (unique ORFs, value columns) among the rows flagged essential_gene == 'yes'.
original_data2.shape

(1098, 3)

In [24]:
# Combine the two frames side by side on ORF, keeping ORFs present in either.
# Both frames have the same column names, so the 'no' columns get the suffix
# '_1' and the 'yes' columns get '_2'.
original_data = original_data1.join(
    original_data2,
    how='outer',
    lsuffix='_1',
    rsuffix='_2',
)

In [25]:
# Preview the merged table.
original_data.head()

Unnamed: 0_level_0,(2) (Albicanol) EP6_0.05mg/ml_1,(1) (Drimenol) EP9_0.025mg/ml_1,(1) ( Drimenol) EP9_0.0375mg/ml_1,(2) (Albicanol) EP6_0.05mg/ml_2,(1) (Drimenol) EP9_0.025mg/ml_2,(1) ( Drimenol) EP9_0.0375mg/ml_2
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
YAL001C,,,,0.085,-0.197,0.122
YAL002W,-0.118,-0.201,-0.36,,,
YAL003W,,,,0.015,-0.182,-0.119
YAL004W,0.003,-0.063,-0.271,,,
YAL005C,-0.002,-0.16,-0.173,,,


# Prepare the final dataset

In [26]:
# Work on a copy so that the merged table in original_data is left untouched.
data = original_data.copy()

In [27]:
# Dataset IDs listed in the same order as the columns of `data`
# (the three '_1' columns first, then the three '_2' columns); the IDs are
# assigned to the columns by position in the next cell, so this order matters.
dataset_ids = [
    21872, 21865, 21870,
    21871, 21866, 21869,
]
datasets = datasets.reindex(dataset_ids)

In [28]:
# Relabel the columns with a two-level index: (dataset_id, data_type='value').
# The labels are assigned by position, in the order of the rows of `datasets`.
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * n_datasets],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [29]:
# Preview the relabelled table.
data.head()

dataset_id,21872,21865,21870,21871,21866,21869
data_type,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
YAL001C,,,,0.085,-0.197,0.122
YAL002W,-0.118,-0.201,-0.36,,,
YAL003W,,,,0.015,-0.182,-0.119
YAL004W,0.003,-0.063,-0.271,,,
YAL005C,-0.002,-0.16,-0.173,,,


## Subset to the genes currently in SGD

In [30]:
# Load the gene table and key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs with no match in the gene
# table come back as NaN and are counted as missing.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [31]:
# Attach the gene ids looked up above and drop the ORFs that had no match.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: the next line assigns a column
# into this frame, and writing into a boolean-filtered selection is the
# chained-assignment pattern pandas warns about (SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were float only because of the NaN placeholders; they can be int now.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,21872,21865,21870,21871,21866,21869
Unnamed: 0_level_1,data_type,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,YAL001C,,,,0.085,-0.197,0.122
2,YAL002W,-0.118,-0.201,-0.36,,,
3,YAL003W,,,,0.015,-0.182,-0.119
1863,YAL004W,0.003,-0.063,-0.271,,,
4,YAL005C,-0.002,-0.16,-0.173,,,


# Normalize

In [32]:
# Normalize the raw values with the shared normalize_phenotypic_scores helper
# (not defined in this notebook).
# NOTE(review): has_tested=True presumably tells the helper that the set of tested strains is known — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [33]:
# Assign proper column names: same dataset IDs as the raw values, but with
# data_type='valuez' so the two sets of columns can live side by side.
n_datasets = datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * n_datasets],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [34]:
# Blank out the normalized score wherever the raw value is missing.
# The mask is passed as a plain array (.values) so it is applied by position:
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'),
# and a DataFrame mask is aligned on labels, so it would match no column of
# data_norm and leave it unchanged.
# This relies on data_norm having the same shape and row/column order as
# `data`; pandas raises ValueError if the shapes differ.
data_norm[data.isnull().values] = np.nan
# Put raw values and normalized values side by side (joined on the row index).
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,21872,21865,21870,21871,21866,21869,21872,21865,21870,21871,21866,21869
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1,YAL001C,,,,0.085,-0.197,0.122,,,,0.382497,-0.478398,0.418646
2,YAL002W,-0.118,-0.201,-0.36,,,,-0.36874,-0.732638,-0.418735,,,
3,YAL003W,,,,0.015,-0.182,-0.119,,,,0.126366,-0.42388,-0.295224
1863,YAL004W,0.003,-0.063,-0.271,,,,0.049906,-0.311371,-0.279724,,,
4,YAL005C,-0.002,-0.16,-0.173,,,,0.032607,-0.607479,-0.126657,,,


# Print out

In [35]:
# Write one tab-separated file per data type ('value' and 'valuez'), with the
# dataset names as column headers and the ORF as the only row label.
for data_type in ['value', 'valuez']:
    out_file = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Names are assigned by position, in the order of the rows of `datasets`.
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(out_file, sep='\t')

# Save to DB

In [36]:
# Import only the function used below; a wildcard import would pull every
# public name of the module into the notebook namespace and hide where
# save_data_to_db comes from.
from IO.save_data_to_db3 import save_data_to_db

In [37]:
# Write the raw and normalized values to the database for this PMID.
# Per the messages logged by this call, the existing datasets for the PMID are
# deleted before the new data are inserted, so re-running replaces them.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/6 [00:00<?, ?it/s]

Deleting all datasets for PMID 32548177...
Inserting the new data...


100%|██████████| 6/6 [00:43<00:00,  7.25s/it]

Updating the data_modified_on field...



