In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to build the datasets-list file name and passed to save_data_to_db.
paper_pmid = 24722918
# Short paper label; used as the prefix of the exported text files.
paper_name = 'cohen_schuldiner_2014' 

In [3]:
# Load the list of datasets for this paper: a headerless, tab-separated file
# with one (dataset_id, name) pair per line.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by dataset_id so rows can be selected and ordered by id later.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [10]:
# Read the results table from the paper's spreadsheet.
original_data = pd.read_excel(
    'raw_data/c4mb00001c2.xlsx',
    sheet_name='table',
    skiprows=1,  # skip the first spreadsheet row; the column header is on the next one
)

In [11]:
# Report the size of the parsed table as rows x columns.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 56 x 5


In [12]:
# Preview the first rows to check which columns were parsed from the spreadsheet.
original_data.head()

Unnamed: 0,Mutated ORF,Gene Name,Phenotype,Functional group,SGD description
0,YPL063W,TIM50,Abberant peroxisomes,Mitochondrial Functions,Essential component of the Translocase of the ...
1,YAL010C,MDM10,Abberant peroxisomes,Mitochondrial Functions,Subunit of both the ERMES complex and the SAM ...
2,YOL108C,INO4,Abberant peroxisomes,Nuclear Functions,Transcription factor required for derepression...
3,YMR276W,DSK2,Reduced cherry-SKL in peroxisomes,Nuclear Functions,Nuclear-enriched ubiquitin-like polyubiquitin-...
4,YFR001W,LOC1,Reduced cherry-SKL in peroxisomes,Nuclear Functions,Nuclear protein involved in asymmetric localiz...


In [13]:
# Working copy of the ORF identifiers as strings; the 'Mutated ORF' column itself is left untouched.
original_data['orf'] = original_data['Mutated ORF'].astype(str)

In [14]:
# Eliminate all white spaces & capitalize.
# NOTE(review): clean_orf is not defined in this notebook; presumably it comes from yp_utils.py (loaded via %run at the top).
original_data['orf'] = clean_orf(original_data['orf'])

In [15]:
# Translate to ORFs.
# NOTE(review): translate_sc is an external helper; presumably it maps gene names to
# systematic ORF names, but its lookup rules are not visible here.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [16]:
# Make sure everything translated ok: print every row whose 'orf' fails the
# looks_like_orf check (an empty frame means all identifiers passed).
orf_ok = looks_like_orf(original_data['orf'])
print(original_data.loc[~orf_ok])

Empty DataFrame
Columns: [Mutated ORF, Gene Name, Phenotype, Functional group, SGD description, orf]
Index: []


In [18]:
# Trim leading/trailing whitespace so the phenotype labels can be matched exactly
# when they are mapped to scores. The vectorized .str accessor replaces the
# per-row lambda and leaves missing values as NaN instead of raising on them.
original_data['Phenotype'] = original_data['Phenotype'].str.strip()

In [21]:
# Start every score column at 0; rows with a matching phenotype are filled in afterwards.
for score_column in ('data1', 'data2', 'data3'):
    original_data[score_column] = 0

In [19]:
# List the distinct phenotype labels so each one can be mapped to a score column.
original_data['Phenotype'].unique()

array(['Abberant peroxisomes', 'Reduced cherry-SKL in peroxisomes',
       'GFP-Ant1 in mitochondria', 'Reduced/No peroxisomes'], dtype=object)

In [22]:
# Phenotype label -> (score column, score). The labels are matched verbatim against
# the spreadsheet text, including its 'Abberant' spelling.
# NOTE(review): 'GFP-Ant1 in mitochondria' has no entry, so those rows keep 0 in all
# three columns - presumably intentional; confirm.
phenotype_scores = {
    'Abberant peroxisomes': ('data1', 1),
    'Reduced cherry-SKL in peroxisomes': ('data3', -1),
    'Reduced/No peroxisomes': ('data2', -1),
}
for phenotype, (score_column, score) in phenotype_scores.items():
    has_phenotype = original_data['Phenotype'] == phenotype
    original_data.loc[has_phenotype, score_column] = score

In [23]:
# Use the ORF as the row index.
original_data = original_data.set_index('orf')

In [24]:
# Keep only the three score columns; the descriptive columns are no longer needed.
score_columns = ['data1', 'data2', 'data3']
original_data = original_data.loc[:, score_columns].copy()

In [25]:
# Collapse rows that share an ORF into one row by averaging their scores
# (the row index is the 'orf' level set in the previous steps).
original_data = original_data.groupby(level='orf').mean()

In [26]:
# Sanity check: number of distinct ORFs x number of score columns.
original_data.shape

(56, 3)

# Load & process tested strains

In [27]:
# Read the list of strains that were screened.
tested = pd.read_excel(
    'raw_data/KO_DAmP_ORFs.xlsx',
    sheet_name='Sheet1',
    skiprows=1,  # skip the first spreadsheet row; the column header is on the next one
)

In [28]:
# Preview the first rows to check the column names parsed from the spreadsheet.
tested.head()

Unnamed: 0,ORF,Unnamed: 1,ORF .1
0,YAL068C,,YAL025C
1,YAL067C,,YBL026W
2,YAL066W,,YBL097W
3,YAL065C,,YBR002C
4,YAL062W,,YBR049C


In [30]:
# The column key includes a trailing space ('ORF '); it must be spelled exactly that way.
# Only this first ORF column is used; the second one ('ORF .1') is not read here.
tested['orf'] = tested['ORF '].astype(str)

In [31]:
# Same clean-up that was applied to the results table (clean_orf is an external helper).
tested['orf'] = clean_orf(tested['orf'])

In [34]:
# Manual corrections for malformed identifiers in the tested-strain list:
# misspelled identifier -> corrected ORF name.
typo_fixes = {
    'YOLO57W': 'YOL057W',
    'YOLO62C': 'YOL062C',
    'YBRF182C-A': 'YBR182C-A',
    'YLR287-A': 'YLR287C-A',
    'YJL206-A': 'YJL206C-A',
}

In [35]:
# Replace the known typos; identifiers without a listed correction pass through unchanged.
tested['orf'] = tested['orf'].map(lambda orf: typo_fixes.get(orf, orf))

In [36]:
# Same translation step that was applied to the results table (translate_sc is an external helper).
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [37]:
# Make sure everything translated ok: print the rows whose 'orf' fails the looks_like_orf check.
# The boolean mask t is reused by the next step to drop those rows.
t = looks_like_orf(tested['orf'])
print(tested.loc[~t,])

            ORF   Unnamed: 1 ORF .1 orf
index_input                            
1963           .         NaN    NaN    


In [38]:
# Keep only the rows that passed the ORF check (t is the mask computed in the previous step).
tested = tested.loc[t,:]

In [39]:
# Sorted array of the distinct tested ORFs.
tested_orfs = np.unique(tested['orf'].values)

In [40]:
# ORFs with a reported phenotype that do not appear in the tested-strain list.
# Index.isin tests membership for the whole index in one call, instead of the
# original `orf not in tested_orfs`, which scanned the NumPy array once per ORF.
# The result keeps the order of original_data's index.
in_tested = original_data.index.isin(tested_orfs)
missing = original_data.index[~in_tested].tolist()
missing

['YLR066W', 'YLR088W', 'YMR235C', 'YOL078W', 'YOR256C', 'YPL063W']

In [41]:
# Restrict the results to the tested strains: ORFs absent from tested_orfs are dropped
# (they were tested as DAmP strains, not deletions), and tested ORFs without a reported
# phenotype are added with a score of 0 in every column.
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [42]:
# Work on a copy so the formatting steps below do not modify original_data.
data = original_data.copy()

In [43]:
# Dataset ids in the same order as the score columns (data1, data2, data3):
# the columns are relabeled positionally from datasets.index in the next step.
dataset_ids = [15997, 15998, 15999]
# Keep only these datasets, in this order.
datasets = datasets.reindex(index=dataset_ids)

In [44]:
# Label each column with a (dataset_id, data_type) pair; raw scores use data_type 'value'.
n_datasets = datasets.shape[0]
data.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * n_datasets],
    names=['dataset_id', 'data_type'],
)

In [45]:
# Preview the relabeled table.
data.head()

dataset_id,15997,15998,15999
data_type,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
YAL002W,0,0,0
YAL004W,0,0,0
YAL005C,0,0,0
YAL007C,0,0,0
YAL008W,0,0,0


## Subset to the genes currently in SGD

In [46]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the gene table come back as NaN.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 27


In [47]:
# Attach the gene ids and keep only the ORFs that were found in the gene table.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent of the unfiltered one, so the dtype
# conversion below is a plain column assignment rather than a write to a slice
# (avoids pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were floats because of the NaN placeholders; with those rows gone they can be ints.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,15997,15998,15999
Unnamed: 0_level_1,data_type,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,YAL002W,0,0,0
1863,YAL004W,0,0,0
4,YAL005C,0,0,0
5,YAL007C,0,0,0
6,YAL008W,0,0,0


# Normalize

In [48]:
# Normalize the raw scores with the shared helper (not defined in this notebook).
# NOTE(review): has_tested=True presumably tells the helper that the table covers all
# tested strains rather than only the hits - confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [49]:
# Assign proper column names: (dataset_id, data_type) pairs, with data_type 'valuez'
# marking the normalized scores.
n_datasets = datasets.shape[0]
data_norm.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * n_datasets],
    names=['dataset_id', 'data_type'],
)

In [50]:
# Carry missing raw values over to the normalized scores. A boolean DataFrame mask is
# aligned on labels, and data's columns are labeled 'value' while data_norm's are labeled
# 'valuez', so the raw mask would match no column and mask nothing. Relabel the mask's
# columns to data_norm's first (both frames list the datasets in the same order); rows
# are still aligned by their index labels.
is_missing = data.isnull()
is_missing.columns = data_norm.columns
data_norm[is_missing] = np.nan
# Put raw ('value') and normalized ('valuez') columns side by side.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,15997,15998,15999,15997,15998,15999
Unnamed: 0_level_1,data_type,value,value,value,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2,YAL002W,0,0,0,0.0,0.0,0.0
1863,YAL004W,0,0,0,0.0,0.0,0.0
4,YAL005C,0,0,0,0.0,0.0,0.0
5,YAL007C,0,0,0,0.0,0.0,0.0
6,YAL008W,0,0,0,0.0,0.0,0.0


# Print out

In [51]:
# Write one tab-separated file per data type (raw 'value' and normalized 'valuez'),
# with dataset names as column headers and ORFs as the only row index.
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [52]:
from IO.save_data_to_db3 import *

In [53]:
# Write the combined raw and normalized scores to the database.
# Per the logged output below, existing datasets for this PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/3 [00:00<?, ?it/s]

Deleting all datasets for PMID 24722918...
Inserting the new data...


100%|██████████| 3/3 [00:25<00:00,  8.40s/it]

Updating the data_modified_on field...



