In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used to locate the datasets list file and passed to the database save
paper_pmid = 21880895
# Short label for the paper; used as the prefix of the exported .txt files
paper_name = 'shi_emr_2011' 

In [3]:
# Read the list of datasets for this paper: a tab-separated file without a header row,
# whose two columns are named 'dataset_id' and 'name' here.
datasets = pd.read_csv('extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt', sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
# Key the datasets table by dataset_id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Load the supplementary table from the paper. skiprows=1 skips the first line of the sheet,
# so the column headers are taken from the line that follows it.
original_data = pd.read_excel('raw_data/supp_E11-05-0440_mc-E11-05-0440-s03.xlsx', sheet_name='Sheet1', skiprows=1)

In [9]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 211 x 5


In [10]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0.1,Unnamed: 0,ORF ID1,Gene1,Functiona,Orthologsb
0,ORFs of unknown or poorly characterized functi...,,,,
1,,YAR020C,CVR1/PAU7,Part of 23-member seripauperin multigene family,Sc
2,,YAR023C,CVR2,Putative integral membrane protein of unknown ...,Sc
3,,YAR027W,CVR3/UIP3,Putative integral membrane protein of unknown ...,Sc
4,,YAR028W,CVR4,Putative integral membrane protein of unknown ...,Sc


In [11]:
# Work on a string copy of the ORF column; the original 'ORF ID1' column is left untouched.
# NOTE: astype(str) also stringifies missing values, so rows without an ORF (section headers,
# blank rows) get a non-ORF string here and are removed by the looks_like_orf filter further down.
original_data['orf'] = original_data['ORF ID1'].astype(str)

In [12]:
# Normalize the ORF strings with clean_orf (helper not defined in this notebook):
# eliminate all white spaces & capitalize
original_data['orf'] = clean_orf(original_data['orf'])

In [13]:
# Translate to ORFs (translate_sc is a helper not defined in this notebook;
# presumably it maps common gene names to systematic ORF names -- confirm against its definition)
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [14]:
# Make sure everything translated ok:
# t is a mask marking the rows whose 'orf' value passes looks_like_orf;
# print the rows that do not pass so they can be inspected manually
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t,])

                                                    Unnamed: 0 ORF ID1 Gene1  \
index_input                                                                    
0            ORFs of unknown or poorly characterized functi...     NaN   NaN   
48           ORFs of unknown or poorly characterized functi...     NaN   NaN   
54                                                         NaN     NaN   NaN   
55                 Genes involved in membrane trafficking (28)     NaN   NaN   
78           Genes involved in membrane trafficking (28 tot...     NaN   NaN   
84                                                         NaN     NaN   NaN   
85                                                         NaN     NaN   NaN   
86                     Genes involved in lipid metabolism (10)     NaN   NaN   
97                                                         NaN     NaN   NaN   
98                                                         NaN     NaN   NaN   
99                  Genes involved in ub

In [15]:
# Keep only the rows whose 'orf' value looks like an ORF. Take an explicit copy so that
# the column assignment in the next step acts on an independent frame rather than on a
# slice of the raw table (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
original_data = original_data.loc[t, :].copy()

In [16]:
# Every ORF remaining in the table gets the same value of 1
# (no quantitative measurement is read from the table)
original_data['data'] = 1

In [17]:
# Key the table by ORF
original_data = original_data.set_index('orf')

In [18]:
# Drop everything except the 'data' column (kept as a one-column DataFrame)
original_data = original_data.loc[:, ['data']].copy()

In [19]:
# Collapse rows that share the same ORF (the index) by averaging their values
original_data = original_data.groupby(level=0).mean()

In [20]:
# Size of the table after collapsing duplicates: (unique ORFs, data columns)
original_data.shape

(169, 1)

# Prepare the final dataset

In [21]:
# Work on a copy so that original_data is not modified by the steps that follow
data = original_data.copy()

In [22]:
# IDs of the datasets produced from this paper; their order is used to label the columns of data
dataset_ids = [148]
# Restrict the datasets table to these IDs, in this order
datasets = datasets.reindex(index=dataset_ids)

In [23]:
# Label each data column with a (dataset_id, data_type) pair; raw values use data_type 'value'
column_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(column_labels, names=['dataset_id','data_type'])

In [24]:
# Preview the relabelled table
data.head()

dataset_id,148
data_type,value
orf,Unnamed: 1_level_2
YAL007C,1
YAL008W,1
YAL009W,1
YAL022C,1
YAR003W,1


## Subset to the genes currently in SGD

In [25]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF present in the data; ORFs absent from the gene table give NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 1


In [26]:
# Attach the gene ids looked up above (NaN for ORFs that were not found in the gene table)
data['gene_id'] = gene_ids
# Drop the ORFs without a gene id. Copy explicitly so that the assignment on the next line
# modifies an independent frame rather than a slice (avoids pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the missing ids gone, the column can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the table by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,148
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
5,YAL007C,1
6,YAL008W,1
7,YAL009W,1
20,YAL022C,1
63,YAR003W,1


# Normalize

In [27]:
# Compute normalized scores from the raw values with normalize_phenotypic_scores
# (helper not defined in this notebook; check its definition for the meaning of has_tested=False)
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [28]:
# Assign proper column names: normalized columns carry data_type 'valuez'
norm_column_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(norm_column_labels, names=['dataset_id','data_type'])

In [29]:
# Intended to blank out the normalized scores wherever the raw value is missing.
# NOTE(review): data.isnull() carries the 'value' column labels while data_norm uses 'valuez';
# boolean-DataFrame assignment aligns on labels, so confirm that this mask actually takes effect.
data_norm[data.isnull()] = np.nan
# Put the raw and normalized columns side by side (joined on the shared index)
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,148,148
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
5,YAL007C,1,5.212736
6,YAL008W,1,5.212736
7,YAL009W,1,5.212736
20,YAL022C,1,5.212736
63,YAR003W,1,5.212736


# Print out

In [30]:
# Write one tab-separated file per data type, with dataset names as column headers
# and ORFs as the only row index
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [31]:
# Import only the function used below instead of a wildcard import, so the origin of
# save_data_to_db is explicit and no other names are pulled into the notebook namespace
from IO.save_data_to_db3 import save_data_to_db

In [32]:
# Write the combined table to the database under this paper's PMID.
# Per the log printed by this call, it first deletes all existing datasets for the PMID
# and then inserts the new data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 21880895...
Inserting the new data...


100%|██████████| 1/1 [00:00<00:00,  2.13it/s]

Updating the data_modified_on field...



