In [4]:
%run ../yp_utils.py

# Initial setup

In [5]:
# Identifiers of the paper being processed: the PMID is used to locate the
# datasets list and to save to the database, the name prefixes the output files.
paper_pmid = 23227207
paper_name = 'serviene_urbonavicius_2012'

In [6]:
# Tab-separated, header-less list of the datasets registered for this paper
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [7]:
# Index the datasets table by dataset id (rebinding instead of mutating in place)
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [60]:
# Spreadsheets to parse; they are read from the raw_data/ folder below
files = [
    'pone.0050779.s004.xls',
    'pone.0050779.s005.xls',
]

In [61]:
def get_score(s):
    """Convert a phenotype label such as 'R ++' or 'S+/-' into a signed score.

    The label is stringified and all spaces are removed. The first character
    selects the sign ('R' -> positive, 'S' -> negative; presumably resistant /
    sensitive -- confirm against the paper) and the remainder selects the
    magnitude: '+/-' -> 1, '+' -> 2, '++' -> 3, '+++' -> 4.

    Parameters
    ----------
    s : any
        Raw cell value. Non-string values (e.g. NaN from an empty cell) are
        converted with ``str`` first.

    Returns
    -------
    int or float
        The signed score, or ``np.nan`` when the label is empty or does not
        start with 'R' or 'S'.

    Raises
    ------
    KeyError
        If the label starts with 'R' or 'S' but the remainder is not one of
        the known magnitudes. This is left loud on purpose so unexpected
        labels are noticed rather than silently dropped.
    """
    s = str(s).replace(" ", "")
    scale = {'+/-': 1, '+': 2, '++': 3, '+++': 4}

    # An empty or whitespace-only label has no leading letter to inspect;
    # treat it as "no score" instead of failing on s[0].
    if not s:
        return np.nan

    if s[0] == 'R':
        return scale[s[1:]]
    if s[0] == 'S':
        return -scale[s[1:]]
    return np.nan

In [62]:
# Parse every spreadsheet into a table of numeric scores indexed by ORF
original_data_list = []

# Columns holding the phenotype labels that get_score converts to numbers
score_cols = ['Unnamed: 3','Unnamed: 4','Unnamed: 5']

for f in files:
    original_data = pd.read_excel('raw_data/' + f, sheet_name='Sheet1', skiprows=10)
    print('Original data dimensions: %d x %d' % (original_data.shape))

    # Gene identifiers come from the first column and are passed through the
    # shared helpers (presumably cleaning them up and mapping gene names to
    # systematic ORF names -- see yp_utils)
    original_data['orf'] = original_data.iloc[:,0].astype(str)
    original_data['orf'] = clean_orf(original_data['orf'])
    original_data['orf'] = translate_sc(original_data['orf'], to='orf')

    # Print the rows that fail the ORF check before dropping them, so the
    # discarded rows (e.g. category headers, blank rows) can be reviewed
    t = looks_like_orf(original_data['orf'])
    print(original_data.loc[~t,])
    # .copy() makes the filtered frame independent, so the column assignments
    # below are not performed on a slice of the unfiltered frame
    original_data = original_data.loc[t,:].copy()

    # Loop variable is `col`, so the file name held in `f` is left intact
    for col in score_cols:
        original_data[col] = original_data[col].apply(get_score)

    original_data.set_index('orf', inplace=True)
    original_data = original_data.loc[:,score_cols]
    # Rows sharing the same ORF are averaged into one
    original_data = original_data.groupby(original_data.index).mean()

    print(original_data.shape)
    original_data_list.append(original_data)

Original data dimensions: 238 x 9
                  Cell wall organization and biogenesis Unnamed: 1  \
index_input                                                          
11                                        Glycosylation        NaN   
19                  Membrane organization and transport        NaN   
36                      Transcription / gene expression        NaN   
48                                       RNA processing        NaN   
50                               Ribosome / translation        NaN   
55           Protein folding, modification, degradation        NaN   
67                                           Metabolism        NaN   
76                                           Cell cycle        NaN   
83                                        Mitochondrial        NaN   
99                                     Unknown function        NaN   
118                          Dubious open reading frame        NaN   
128                                                 NaN 

In [63]:
# One score table per input file; the unpacking fails unless exactly two were parsed
original_data1, original_data2 = original_data_list

In [64]:
# Outer join keeps ORFs found in either file; the _1/_2 suffixes tell apart the identically named score columns
original_data = original_data1.join(original_data2, how='outer', lsuffix='_1', rsuffix='_2')

In [66]:
# Total score per ORF across all score columns of both files. NaNs are skipped
# by sum(), so an ORF present in only one file gets that file's total.
# The 'data' column itself is excluded so that re-running this cell does not
# fold a previously computed total back into the sum.
score_columns = original_data.columns.drop('data', errors='ignore')
original_data['data'] = original_data[score_columns].sum(axis=1)

In [69]:
# Sanity check: show the ORFs with the highest total score
original_data.sort_values(by='data', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 3_1,Unnamed: 4_1,Unnamed: 5_1,Unnamed: 3_2,Unnamed: 4_2,Unnamed: 5_2,data
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
YNL322C,4.0,4.0,4.0,,,,12.0
YOR003W,4.0,4.0,3.0,,,,11.0
YNL309W,4.0,4.0,3.0,,,,11.0
YER166W,4.0,4.0,3.0,,,,11.0
YGR166W,4.0,4.0,3.0,,,,11.0


# Prepare the final dataset

In [70]:
# Keep only the total score, as an independent single-column frame
data = original_data[['data']].copy()

In [71]:
# Restrict the datasets table to the dataset id(s) this notebook produces, in this order
dataset_ids = [16525]
datasets = datasets.reindex(index=dataset_ids)

In [72]:
# Label each data column as (dataset_id, 'value')
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id','data_type'])

In [73]:
# Preview the relabelled table
data.head()

dataset_id,16525
data_type,value
orf,Unnamed: 1_level_2
YAL010C,-5.0
YAL019W,7.0
YAL021C,9.0
YAL023C,8.0
YAL026C,8.0


## Subset to the genes currently in SGD

In [74]:
# Gene table keyed by systematic name, with the numeric id kept as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Look up the id of every ORF in `data`; ORFs absent from the gene table yield NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 2


In [75]:
# Attach the gene ids, drop ORFs without one, and index rows by (gene_id, orf)
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the dtype conversion below
# is not an assignment on a slice of the unfiltered frame
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

In [76]:
# Preview the table after re-indexing by (gene_id, orf)
data.head()

Unnamed: 0_level_0,dataset_id,16525
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
8,YAL010C,-5.0
17,YAL019W,7.0
19,YAL021C,9.0
21,YAL023C,8.0
24,YAL026C,8.0


# Normalize

In [77]:
# Normalization is delegated to the shared helper (defined outside this notebook);
# NOTE(review): the meaning of has_tested=False is not visible here -- confirm in yp_utils
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [78]:
# Label each normalized column as (dataset_id, 'valuez')
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id','data_type'])

In [79]:
# Blank out normalized scores wherever the raw score is missing.
# Indexing with a boolean DataFrame aligns on column labels, and `data`
# ('value') and `data_norm` ('valuez') share none, so the mask built from
# `data` is relabelled to data_norm's columns (matched by position) before
# it is applied; otherwise it would select nothing.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Raw and normalized scores side by side
data_all = data.join(data_norm)

In [80]:
# Preview raw ('value') and normalized ('valuez') scores together
data_all.head()


Unnamed: 0_level_0,dataset_id,16525,16525
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
8,YAL010C,-5.0,-2.381799
17,YAL019W,7.0,3.428421
19,YAL021C,9.0,4.396791
21,YAL023C,8.0,3.912606
24,YAL026C,8.0,3.912606


# Print out

In [81]:
# Write one tab-separated file per data type, with dataset names as column
# headers and rows keyed by ORF only
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [82]:
from IO.save_data_to_db3 import *

In [83]:
# Persist the combined table; per the logged output this deletes all existing
# datasets for this PMID before inserting the new data
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 23227207...
Inserting the new data...


100%|██████████| 1/1 [00:00<00:00,  1.42it/s]

Updating the data_modified_on field...





In [84]:
# Final size check: (number of genes, number of columns)
data_all.shape

(330, 2)