In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used below to build the datasets-list file name and when saving to the DB
paper_pmid = 23383298
# Short paper identifier; used as the prefix of the output file names
paper_name = 'michaillat_mayer_2013' 

In [3]:
# Read the tab-separated, header-less list of dataset IDs and names for this paper
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by its ID so datasets can be selected by ID later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Load the supplementary table, skipping the first two rows of the sheet
original_data = pd.read_excel(
    'raw_data/Table_S1.xlsx',
    sheet_name='Sheet1',
    skiprows=2,
)

In [6]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 2653 x 14


In [7]:
# Preview the first rows of the freshly loaded table
original_data.head()

Unnamed: 0,ORF,gene name(s),function,Unnamed: 3,round 1,round 2,round 2bis,round 2ter,round 3,SUM,times phenotype,times seen,sum/times seen,x ph. /x seen
0,YHR091C,MSR1,Mitochondrial arginyl-tRNA synthetase: overlap...,,,3.0,4.0,,,7,2,2,3.5,1.0
1,YOL030W,GAS5,",3-beta-glucanosyltransferase",,2.0,4.0,4.0,3.0,3.0,16,5,5,3.2,1.0
2,YBR081C,"SPT7, GIT2",Subunit of the SAGA transcriptional Subunit of...,,,,,3.0,,3,1,1,3.0,1.0
3,YGR206W,MVB12,ESCRT-I subunit necessary for the efficient tr...,,4.0,2.0,2.0,,4.0,12,4,4,3.0,1.0
4,YKL061W,BLI1,likely member of BLOC complex involved in endo...,,,,,,3.0,3,1,1,3.0,1.0


In [8]:
# Leave the source 'ORF' column untouched and work on a string-typed copy in 'orf'
orf_text = original_data['ORF'].astype(str)
original_data['orf'] = orf_text

In [9]:
# Normalize the ORF strings: eliminate all white spaces & capitalize
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [10]:
# Translate whatever identifier was supplied into ORF names
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [11]:
# Sanity check: print every row whose 'orf' value does not look like an ORF
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t]
print(not_orf_rows)

             ORF gene name(s)           function  Unnamed: 3 round 1 round 2  \
index_input                                                                    
881          NaN         MSB3                NaN         NaN       0       0   
882          NaN         AFT2                NaN         NaN       0       0   
883          NaN         CSM4                NaN         NaN       0       0   
884          NaN        PDR18                NaN         NaN       0       0   
1201         NaN         GSH2                NaN         NaN     NaN       0   
1202         NaN         PSY2                NaN         NaN       0       0   
1203         NaN         SLZ1                NaN         NaN       0       0   
1204         NaN         SCS1                NaN         NaN       0       0   
1337         NaN       RPS10A  ribosomal protein         NaN       0       0   
1338         NaN         STD1                NaN         NaN       0       0   
1339         NaN         GLN3           

In [12]:
# For the rows flagged above, fall back to translating the gene name column instead
unresolved = ~t
fallback_orfs = translate_sc(original_data.loc[unresolved, 'gene name(s)'], to='orf')
original_data.loc[unresolved, 'orf'] = fallback_orfs.values

In [13]:
# Re-run the sanity check: the printed frame should now be empty
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t]
print(not_orf_rows)

Empty DataFrame
Columns: [ORF, gene name(s), function, Unnamed: 3, round 1, round 2, round 2bis, round 2ter, round 3, SUM, times phenotype, times seen,  sum/times seen, x ph. /x seen, orf]
Index: []


In [14]:
# Flip the sign to reflect the fact that high score = high loss of vacuole fragmentation.
# The source column name carries a leading space.
score = original_data[' sum/times seen']
original_data['data'] = -score

In [15]:
# Index the table by ORF
original_data = original_data.set_index('orf')

In [16]:
# Keep only the score column, as an independent single-column frame
original_data = original_data.loc[:, ['data']].copy()

In [17]:
# Collapse rows that share the same index label (ORF) by averaging their scores
orf_groups = original_data.groupby(by=original_data.index)
original_data = orf_groups.mean()

In [18]:
# Dimensions of the table after averaging per ORF
original_data.shape

(2617, 1)

# Load & process tested strains

In [19]:
# Load the knockout collection layout, skipping the first 17 rows of the sheet
tested = pd.read_excel(
    'raw_data/KO_collection.xlsx',
    sheet_name='KOllection',
    skiprows=17,
)

In [20]:
# Preview the first rows of the tested-strains table
tested.head()

Unnamed: 0,Plate,row,col,record #,ORF,gene name(s),SL/wt: Boone,Growth,slow growth,resp.deficient,C-source util. Def.,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,,,,,,,,,MIPS,MIPS,MIPS,,,,,,,,
1,,,,,,,,,,,,,,,,,,,
2,1.0,A,1.0,338.0,YAL068C,,,v,,,,,,,,,,,
3,1.0,A,2.0,339.0,YAL067C,SEO1,,v,,,,,,,,,,,
4,1.0,A,3.0,340.0,YAL066W,,,v,,,,,,,,,,,


In [21]:
# Leave the source 'ORF' column untouched and work on a string-typed copy in 'orf'
orf_text = tested['ORF'].astype(str)
tested['orf'] = orf_text

In [22]:
# Normalize the ORF strings with the same cleaning helper used for the data table
cleaned_orfs = clean_orf(tested['orf'])
tested['orf'] = cleaned_orfs

In [23]:
# Translate the cleaned identifiers into ORF names
translated_orfs = translate_sc(tested['orf'], to='orf')
tested['orf'] = translated_orfs

In [24]:
# Sanity check: print every tested row whose 'orf' value does not look like an ORF
t = looks_like_orf(tested['orf'])
not_orf_rows = tested.loc[~t]
print(not_orf_rows)

            Plate  row  col record #  ORF gene name(s) SL/wt: Boone Growth  \
index_input                                                                  
0             NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
1             NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
14            NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
27            NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
40            NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
...           ...  ...  ...      ...  ...          ...          ...    ...   
5276          NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
5289          NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
5302          NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
5315          NaN  NaN  NaN      NaN  NaN          NaN          NaN    NaN   
5328          NaN  NaN  NaN      NaN  NaN          NaN          

In [25]:
# Keep only the rows that passed the ORF check above
tested = tested.loc[t]

In [26]:
# Distinct ORFs present in the tested collection, in order of first appearance
tested_orfs = tested.loc[:, 'orf'].unique()

In [27]:
# ORFs that have data but are absent from the tested list, in the order they appear in the data index.
# Membership is checked against a set: `in` on the original array rescans it for every ORF.
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_orf_set]
missing

['YDR062W']

In [28]:
# Append the ORFs that have data but were not in the tested list
tested_orfs = [*tested_orfs, *missing]

In [29]:
# Expand the table to every tested ORF; ORFs without a row in the data get a score of 0
original_data = original_data.reindex(
    index=tested_orfs,
    fill_value=0,
)

# Prepare the final dataset

In [30]:
# Work on an independent (deep) copy from here on
data = original_data.copy(deep=True)

In [31]:
# Dataset ID(s) that the data column(s) correspond to, in column order
dataset_ids = [15986]
# Restrict and order the dataset list accordingly
datasets = datasets.reindex(dataset_ids)

In [32]:
# Label each data column with a (dataset_id, data_type) pair; raw scores use 'value'
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [33]:
# Preview the table with its (dataset_id, data_type) column labels
data.head()

dataset_id,15986
data_type,value
orf,Unnamed: 1_level_2
YAL068C,-0.25
YAL067C,-1.0
YAL066W,-0.333333
YAL065C,0.0
YAL062W,0.0


## Subset to the genes currently in SGD

In [34]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id for every ORF in the data; ORFs without a match come back as NaN
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [35]:
# Attach the gene ids and drop the ORFs that had no match in the gene table.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows so the column assignment below operates on an
# independent frame instead of a slice of the unfiltered one (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone, the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,15986
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1869,YAL068C,-0.25
61,YAL067C,-1.0
60,YAL066W,-0.333333
1727,YAL065C,0.0
57,YAL062W,0.0


# Normalize

In [36]:
# Normalize the raw scores with a helper defined outside this notebook.
# NOTE(review): has_tested=True presumably tells the helper that the table covers all tested
# strains rather than hits only — confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [37]:
# Assign proper column names: one (dataset_id, 'valuez') pair per normalized column
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [38]:
# Propagate missing raw values into the normalized table.
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'), and a boolean
# DataFrame mask is aligned by label, so the unmodified mask lines up with no column of
# data_norm and no cell gets blanked. Relabel the mask with data_norm's columns first (both
# column sets are built from the same dataset order); rows are still aligned by index.
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns
data_norm[nan_mask] = np.nan
# Put raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,15986,15986
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1869,YAL068C,-0.25,-0.389888
61,YAL067C,-1.0,-1.559553
60,YAL066W,-0.333333,-0.519851
1727,YAL065C,0.0,0.0
57,YAL062W,0.0,0.0


# Print out

In [39]:
# Write one tab-separated file per data type, with dataset names as column headers
# and ORFs as the only row index
for data_type in ('value', 'valuez'):
    out_path = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(out_path, sep='\t')

# Save to DB

In [40]:
from IO.save_data_to_db3 import *

In [41]:
# Write the combined raw + normalized table to the database for this paper.
# Judging by the logged output, existing datasets for this PMID are deleted before the
# new data is inserted, so re-running this cell replaces what is stored.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 23383298...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.93s/it]

Updating the data_modified_on field...



