In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper; used below to build the datasets-list filename and when saving to the DB
paper_pmid = 12096123
# Short paper identifier; used below as the filename prefix of the exported .txt tables
paper_name = 'wilson_roach_2002' 

In [3]:
# Read the header-less, tab-separated list of datasets registered for this paper
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the datasets table by dataset id so rows can be selected by id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Load the supplementary spreadsheet, skipping its first row
original_data = pd.read_excel(
    'raw_data/suppdata.xlsx',
    sheet_name='SuppDataREV',
    skiprows=1,
)

In [9]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 699 x 6


In [10]:
# Preview the first rows of the raw table to see what each unnamed column holds
original_data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,GAC1,Low,YOR178C,Regulatory subunit for protein serine/threonin...,,
1,GLC3,Low (very),YEL011W,"Alpha-1,4-glucan branching enzyme",,
2,GLC8,Low,YMR311C,Modulator of protein serine/threonine phosphat...,,
3,GSY2,Low (very),YLR258W,UDP-glucose-starch glucosyltransferase (glycog...,,
4,IRA2 (GLC4),Low (very),YOL081W,GTPase-activating protein for Ras1p (WT) and R...,,


In [11]:
# Keep a string copy of the column holding the systematic names in a dedicated 'orf' column
original_data = original_data.assign(orf=original_data['Unnamed: 2'].astype(str))

In [12]:
# Normalize the ORF strings with the shared helper
# (per the original note: removes all white space and capitalizes)
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [16]:
# Fix a typo in the source table: the letter "O" was used in place of the digit "0"
is_typo = original_data['orf'] == 'YORO36W'
original_data.loc[is_typo, 'orf'] = 'YOR036W'

In [17]:
# Translate whatever identifiers are present into systematic ORF names
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [18]:
# Sanity check: flag the entries that look like ORF names and print the rows that do not
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t]
print(not_orf_rows)

                               Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3  \
index_input                                                                  
8                                     NaN        NaN        NaN        NaN   
14                                    NaN        NaN        NaN        NaN   
15                                    NaN        NaN        NaN        NaN   
16           Vesicle transport associated        NaN        NaN        NaN   
17                                    NaN        NaN        NaN        NaN   
...                                   ...        ...        ...        ...   
694                                   NaN        NaN        NaN        NaN   
695                                   NaN        NaN        NaN        NaN   
696                                   NaN        NaN        NaN        NaN   
697                                   NaN        NaN        NaN        NaN   
698                                   NaN        NaN        NaN 

In [20]:
# Keep only the rows whose ORF passed the check above. An explicit copy is taken so that
# the column assignments that follow operate on an independent frame rather than on a
# slice of the raw table (this avoids pandas' SettingWithCopyWarning).
original_data = original_data.loc[t, :].copy()

In [22]:
# Map the qualitative High/Low labels onto signed integer scores.
# Labels are stripped of surrounding white space first; an unknown label raises KeyError.
data_switch = {
    'High': 1,
    'High (pink)': 1,
    'High (very)': 2,
    'Low': -1,
    'Low (very)': -2,
}
original_data['data'] = [data_switch[label.strip()] for label in original_data['Unnamed: 1']]

In [23]:
# Use the ORF name as the row index
original_data = original_data.set_index('orf')

In [24]:
# Discard everything except the numeric score column
original_data = original_data.loc[:, ['data']].copy()

In [25]:
# Collapse ORFs listed more than once by averaging their scores
orf_labels = original_data.index
original_data = original_data.groupby(orf_labels).mean()

In [26]:
# Size of the table after averaging duplicate ORFs: (unique ORFs, 1 score column)
original_data.shape

(559, 1)

# Load & process tested strains

In [29]:
# Load the strain inventory spreadsheet, skipping its first row
tested = pd.read_excel(
    'raw_data/ResGen Diploids inventory.xlsx',
    sheet_name='Inventory',
    skiprows=1,
)

In [30]:
# Preview the inventory layout (ORF name, plate, row, column)
tested.head()

Unnamed: 0,ORF name,plate,row,column
0,YAL068C,301,A,2
1,YAL067C,301,A,3
2,YAL066W,301,A,4
3,YAL065C,301,A,5
4,YAL062W,301,A,6


In [31]:
# Keep a string copy of the inventory's ORF names in a dedicated 'orf' column
tested = tested.assign(orf=tested['ORF name'].astype(str))

In [32]:
# Normalize the ORF strings with the same helper used for the data table
cleaned_tested_orfs = clean_orf(tested['orf'])
tested['orf'] = cleaned_tested_orfs

In [35]:
# Known misspellings in the inventory and their corrected systematic names
typos = {
    'TAL004W': 'YAL004W',
    'YELOO1C': 'YEL001C',
    'KL187C': 'YKL187C',
}
# Replace the misspelled names; every other name is passed through unchanged
tested['orf'] = tested['orf'].apply(lambda name: typos.get(name, name))

In [36]:
# Translate whatever identifiers are present into systematic ORF names
translated_tested_orfs = translate_sc(tested['orf'], to='orf')
tested['orf'] = translated_tested_orfs

In [37]:
# Sanity check: flag the entries that look like ORF names and print the rows that do not
t = looks_like_orf(tested['orf'])
not_orf_rows = tested.loc[~t]
print(not_orf_rows)

            ORF name  plate row  column     orf
index_input                                    
346           YMR41W    304   F       6  YMR41W


In [38]:
# Keep only the inventory rows whose ORF passed the check.
# NOTE(review): per the printed check output this drops 'YMR41W', which looks like a
# mistyped ORF name — confirm whether it should be corrected rather than discarded.
tested = tested.loc[t]

In [39]:
# Distinct tested ORFs, in order of first appearance in the inventory
tested_orfs = pd.unique(tested['orf'])

In [40]:
# ORFs that have data but are absent from the tested-strain inventory (order of the data index is kept).
# Membership is checked against a set rather than the NumPy array, so each lookup is a
# hash lookup instead of a full scan of the array.
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_orf_set]
missing

[]

In [41]:
# Expand the table to every tested ORF; tested ORFs without a reported score are filled with 0
original_data = original_data.reindex(labels=tested_orfs, axis='index', fill_value=0)

# Prepare the final dataset

In [42]:
# Work on an independent copy so the processed input table stays untouched
data = original_data.copy(deep=True)

In [43]:
# Dataset id(s) matching the column(s) of `data`, in column order
dataset_ids = [4949]
# Restrict (and order) the datasets table to exactly those ids
datasets = datasets.reindex(labels=dataset_ids, axis='index')

In [44]:
# Label each data column with a (dataset_id, 'value') pair
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id','data_type'])

In [45]:
# Preview the table with its new (dataset_id, data_type) column labels
data.head()

dataset_id,4949
data_type,value
orf,Unnamed: 1_level_2
YAL068C,0.0
YAL067C,0.0
YAL066W,0.0
YAL065C,0.0
YAL062W,0.0


## Subset to the genes currently in SGD

In [46]:
# Load the gene table and re-key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in `data`, in row order; ORFs without a match yield NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 27


In [47]:
# Attach the gene ids (a plain array, so it is matched to the rows by position)
data['gene_id'] = gene_ids
# Drop the ORFs that had no match in the gene table. An explicit copy is taken so that the
# assignment below operates on an independent frame (avoids pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone, the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf), leaving only the dataset columns
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,4949
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1869,YAL068C,0.0
61,YAL067C,0.0
60,YAL066W,0.0
1727,YAL065C,0.0
57,YAL062W,0.0


# Normalize

In [48]:
# Compute normalized scores from the raw values with the shared helper.
# NOTE(review): has_tested=True presumably signals that `data` also includes the tested
# strains without a reported phenotype (filled with 0 earlier) — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [49]:
# Assign proper column names: one (dataset_id, 'valuez') pair per normalized column
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id','data_type'])

In [50]:
# Propagate missing raw values to the normalized scores.
# A boolean DataFrame mask is aligned by label, and the raw columns are labelled 'value'
# while the normalized ones are labelled 'valuez'. Using data.isnull() directly therefore
# matches no column and masks nothing. Relabel the mask with data_norm's columns first.
# This assumes both frames list the datasets in the same column order, as set up above.
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns
data_norm[nan_mask] = np.nan
# Put the raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,4949,4949
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1869,YAL068C,0.0,0.046277
61,YAL067C,0.0,0.046277
60,YAL066W,0.0,0.046277
1727,YAL065C,0.0,0.046277
57,YAL062W,0.0,0.046277


# Print out

In [51]:
# Write one tab-separated file per data type, with columns named after the datasets
# and rows indexed by ORF only
for data_type in ('value', 'valuez'):
    export = data_all.xs(data_type, level='data_type', axis=1).copy()
    export.columns = datasets['name'].values
    export = export.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    export.to_csv(out_path, sep='\t')

# Save to DB

In [52]:
from IO.save_data_to_db3 import *

In [53]:
# Store the combined value/valuez table in the database under this paper's PMID.
# Per the logged output, the existing datasets for the PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 12096123...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.69s/it]

Updating the data_modified_on field...



