In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers for this paper: the PMID is used to locate the datasets list and
# to key the database save; the name is the prefix of the exported text files.
paper_pmid, paper_name = 22140548, 'franzosa_mcclellan_2011'

In [3]:
# Load the tab-separated, header-less list of datasets (id + name) for this paper.
datasets = pd.read_csv(
    'extras/YeastPhenome_%d_datasets_list.txt' % paper_pmid,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the datasets table by dataset id so rows can be selected by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read the raw spreadsheet, skipping the two rows above the column headers.
original_data = pd.read_excel(
    'raw_data/journal.pone.0028211.s003.xlsx',
    sheet_name='Sheet1',
    skiprows=2,
)

In [9]:
# Report the size of the raw table as rows x columns.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5473 x 2


In [10]:
# Preview the first rows of the raw table.
original_data.head(n=5)

Unnamed: 0,Ranked Genes,Average Ratio values
0,IAH1,0.26144
1,YBR197C,0.31723
2,COQ6,0.3656
3,HSL7,0.38074
4,YPL238C,0.42343


In [11]:
# Work on a string copy of the gene names so the source column stays untouched.
original_data = original_data.assign(gene=original_data['Ranked Genes'].astype(str))

In [12]:
# Normalize the gene names with the shared helper (strips white space and
# upper-cases, per the original note on this cell).
cleaned_genes = clean_genename(original_data['gene'])
original_data['gene'] = cleaned_genes

In [15]:
# Map each gene name to its ORF identifier with the shared translation helper.
translated_orfs = translate_sc(original_data['gene'].values, to='orf')
original_data['orf'] = translated_orfs

In [17]:
# Manual correction: TOS7 is not resolved by the translation step above,
# so assign its ORF explicitly.
is_tos7 = original_data['orf'] == 'TOS7'
original_data.loc[is_tos7, 'orf'] = 'YOL019W'

In [18]:
# Make sure everything translated ok
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t,])

            Ranked Genes  Average Ratio values   gene    orf
index_input                                                 
109            AMI3                    0.66029   AMI3   AMI3
230            PMN1                    0.73308   PMN1   PMN1
420            CRS5                    0.78527   CRS5   CRS5
601            SDL1                    0.82175   SDL1   SDL1
742            Rsa2                    0.83917   RSA2   RSA2
897            aad6                    0.85937   AAD6   AAD6
929            SDC25                   0.86273  SDC25  SDC25
1075           TOS9                    0.87638   TOS9   TOS9
1400           TCI1                    0.89758   TCI1   TCI1
1794           TOS5                    0.92241   TOS5   TOS5
1877           HXT12                   0.92705  HXT12  HXT12
2034           BOP1                    0.93530   BOP1   BOP1
2054           HSN1                    0.93628   HSN1   HSN1
2146           EFR4                    0.94020   EFR4   EFR4
2370           SRI1     

In [19]:
# Keep only the rows flagged by looks_like_orf. The explicit .copy() makes the
# result an independent frame, so adding columns to it in later cells does not
# write to a slice of the unfiltered table (pandas' SettingWithCopyWarning).
original_data = original_data.loc[t, :].copy()

In [20]:
# Expose the measured ratio under the generic column name 'data'.
original_data = original_data.assign(data=original_data['Average Ratio values'])

In [21]:
# Use the ORF identifier as the row index.
original_data = original_data.set_index('orf')

In [22]:
# Drop everything except the value column (kept as a one-column frame).
original_data = original_data.loc[:, ['data']].copy()

In [23]:
# Collapse rows that share an ORF by averaging their values.
original_data = original_data.groupby(level=0).mean()

In [24]:
# Size of the table after averaging rows that share an ORF: (rows, columns).
original_data.shape

(5412, 1)

# Prepare the final dataset

In [25]:
# Start the final dataset from an independent copy of the processed table.
data = original_data.copy(deep=True)

In [26]:
# Dataset ids covered by this notebook, in the order of the data columns.
dataset_ids = [11784]
# Restrict (and order) the datasets table to exactly those ids.
datasets = datasets.reindex(dataset_ids, axis='index')

In [27]:
# Label each data column as (dataset_id, 'value').
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id', 'data_type'])

In [28]:
# Preview the relabelled table.
data.head(n=5)

dataset_id,11784
data_type,value
orf,Unnamed: 1_level_2
YAL001C,1.0746
YAL002W,0.7203
YAL003W,1.0701
YAL004W,1.7355
YAL005C,1.4918


## Subset to the genes currently in SGD

In [29]:
# Load the gene reference table and key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the table come back as NaN.
sgd_matches = genes.reindex(index=data.index.values)
gene_ids = sgd_matches['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 17


In [30]:
# Attach the looked-up gene ids and drop ORFs that had no match.
data['gene_id'] = gene_ids
# .copy() detaches the filtered frame so the dtype cast below writes to an
# independent object rather than to a slice (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the missing ids removed, the column can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,11784
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1,YAL001C,1.0746
2,YAL002W,0.7203
3,YAL003W,1.0701
1863,YAL004W,1.7355
4,YAL005C,1.4918


# Normalize

In [31]:
# Compute normalized scores from the raw values with the shared helper.
# NOTE(review): has_tested=True presumably tells the helper that the set of
# tested strains is known; confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [32]:
# Label each normalized column as (dataset_id, 'valuez').
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id', 'data_type'])

In [33]:
# Propagate missing raw values into the normalized table.
# The mask must carry data_norm's column labels: data is labelled
# (dataset_id, 'value') while data_norm is labelled (dataset_id, 'valuez'), and
# assignment through a boolean DataFrame aligns on labels. Using data.isnull()
# directly would therefore match no columns and silently mask nothing.
# Columns correspond by position (both follow the order of the datasets table).
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw and normalized values side by side, aligned on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,11784,11784
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1,YAL001C,1.0746,0.608704
2,YAL002W,0.7203,-1.379749
3,YAL003W,1.0701,0.583449
1863,YAL004W,1.7355,4.317902
4,YAL005C,1.4918,2.950174


# Print out

In [34]:
# Write one tab-separated file per data type, with columns named after the
# datasets and rows keyed by ORF only.
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = '%s_%s.txt' % (paper_name, data_type)
    table.to_csv(out_path, sep='\t')

# Save to DB

In [35]:
from IO.save_data_to_db3 import *

In [36]:
# Persist the combined raw + normalized table for this paper.
# NOTE(review): per the logged output below, the helper first deletes all
# existing datasets for this PMID and then inserts the new data, so re-running
# this cell replaces what is stored.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 22140548...
Inserting the new data...


100%|██████████| 1/1 [00:08<00:00,  8.97s/it]

Updating the data_modified_on field...



