In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed by this notebook.
paper_pmid = 21167225  # PMID; used to locate the datasets list and passed to the DB save
paper_name = 'kitagawa_imaeda_2011'  # prefix of the exported *_value.txt / *_valuez.txt files

In [3]:
# Table of dataset ids and names for this paper (tab-separated file without a header row).
datasets_path = 'extras/YeastPhenome_%d_datasets_list.txt' % paper_pmid
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset table by dataset_id so it can be reindexed by id further down.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read the published table. The first 5 rows of the sheet are skipped and no row is
# used as a header, so the columns get integer labels (0, 1, 2).
original_data = pd.read_excel(
    'raw_data/table1.xlsx',
    sheet_name='Table 1',
    skiprows=5,
    header=None,
)

In [9]:
# Report the size of the raw table as rows x columns.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 64 x 3


In [10]:
# Preview the first five rows of the raw table.
original_data.head(n=5)

Unnamed: 0,0,1,2
0,MON2,YNL297C,Peripheral membrane protein with a role in end...
1,RAV1,YJR033C,"Subunit of the RAVE complex (Rav1p, Rav2p, Skpip)"
2,SNF7,YLR025W,One of four subunits of the ESCRT-III complex
3,SNF8,YPL002C,Component of the ESCRT-II complex
4,VMA7,YGR020C,Subunit F of the eight-subunit V1 peripheral m...


In [11]:
# Column 1 of the sheet holds the systematic names; keep a string version as 'orf'.
original_data['orf'] = original_data.loc[:, 1].astype(str)

In [12]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is an external helper (presumably loaded by the %run of
# yp_utils.py). The 'NAN' values shown by the translation check further down are
# consistent with upper-casing the stringified NaN cells.
original_data['orf'] = clean_orf(original_data['orf'])

In [13]:
# Translate to ORFs
# NOTE(review): translate_sc is an external helper (presumably from yp_utils.py);
# with to='orf' it is expected to return systematic ORF names — confirm there.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [14]:
# Make sure everything translated ok: show the rows whose 'orf' value does not
# look like an ORF so they can be inspected before they are filtered out.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t, :])

                                                             0    1    2  orf
index_input                                                                  
18                                           ER/Golgi function  NaN  NaN  NAN
24                                      Phospholipid synthesis  NaN  NaN  NAN
29                                               Transcription  NaN  NaN  NAN
34                                                 Translation  NaN  NaN  NAN
44                                    Nitrogen source response  NaN  NaN  NAN
48                                                Cytoskeleton  NaN  NaN  NAN
51                                                      Others  NaN  NaN  NAN
62           ykl118w� strains were selected for enzyme assa...  NaN  NaN  NAN
63           water.  The  gcn5�,  opi3�,  per1�,  vma7�,  a...  NaN  NaN  NAN


In [15]:
# Keep only the rows whose 'orf' passed the check above. Take an explicit copy:
# a new column is assigned to the filtered frame right afterwards, and writing
# into a slice of the original frame raises SettingWithCopyWarning.
original_data = original_data.loc[t, :].copy()

In [16]:
# Every remaining row gets the same constant score of 1.
original_data['data'] = 1

In [17]:
# Use the ORF name as the row index.
original_data = original_data.set_index('orf')

In [18]:
# Drop everything except the score column (kept as a one-column DataFrame).
original_data = original_data.loc[:, ['data']].copy()

In [19]:
# Collapse ORFs that occur more than once into a single row holding the mean score.
original_data = original_data.groupby(level=0).mean()

In [20]:
# Size of the table after filtering and collapsing duplicate ORFs.
original_data.shape

(55, 1)

# Prepare the final dataset

In [21]:
# Work on an independent copy so the processed table above stays untouched.
data = original_data.copy(deep=True)

In [22]:
# Restrict the dataset table to the dataset id(s) produced by this notebook,
# in the same order as the data columns.
dataset_ids = [512]
datasets = datasets.reindex(dataset_ids)

In [23]:
# Label each data column with a (dataset_id, data_type) pair; raw scores use 'value'.
column_labels = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(column_labels, names=['dataset_id','data_type'])

In [24]:
# Preview the relabelled table.
data.head(n=5)

dataset_id,512
data_type,value
orf,Unnamed: 1_level_2
YBL027W,1
YBR036C,1
YCR044C,1
YCR047C,1
YDR065W,1


## Subset to the genes currently in SGD

In [25]:
# Look up the gene id of every ORF in the reference gene table, keyed by systematic name.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
# ORFs absent from the gene table come back as NaN from the reindex.
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [26]:
# Attach the gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
# Copy the filtered frame explicitly: a column is assigned into it on the next line,
# and without the copy that write targets a slice of the unfiltered frame
# (SettingWithCopyWarning whenever rows were actually dropped).
data = data.loc[data['gene_id'].notnull()].copy()
# The ids arrive as floats whenever NaNs were present; store them as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,512
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
115,YBL027W,1
232,YBR036C,1
601,YCR044C,1
604,YCR047C,1
2192,YDR065W,1


# Normalize

In [27]:
# Compute the normalized scores from the raw values with the external helper.
# NOTE(review): the meaning of has_tested=False is defined inside
# normalize_phenotypic_scores (not visible here) — presumably "no list of tested
# strains is available for this dataset"; confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [28]:
# Assign proper column names: normalized scores use the 'valuez' data type.
norm_labels = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(norm_labels, names=['dataset_id','data_type'])

In [29]:
# Propagate missing raw scores to the normalized scores.
# A boolean DataFrame mask is aligned to the target by row AND column labels, and
# cells without a match are left untouched. `data` columns are labelled
# (dataset_id, 'value') while `data_norm` columns are (dataset_id, 'valuez'), so the
# mask has to carry data_norm's column labels; otherwise nothing is ever masked.
# Both frames have one column per dataset in the same order, so relabelling is positional.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Combine raw and normalized scores side by side (joined on the row index).
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,512,512
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
115,YBL027W,1,9.103446
232,YBR036C,1,9.103446
601,YCR044C,1,9.103446
604,YCR047C,1,9.103446
2192,YDR065W,1,9.103446


# Print out

In [30]:
# Write one tab-separated file per data type, with dataset names as column headers
# and the ORF as the only row label.
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [31]:
from IO.save_data_to_db3 import *

In [32]:
# Store the combined raw + normalized table in the database under this paper's PMID.
# NOTE(review): per the logged output, the datasets already stored for the PMID are
# deleted before the new data is inserted — confirm in save_data_to_db.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 21167225...
Inserting the new data...


100%|██████████| 1/1 [00:00<00:00,  5.97it/s]


Updating the data_modified_on field...
