In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication handled by this notebook:
#   paper_pmid - PubMed ID, used to locate the datasets list and when saving to the DB
#   paper_name - prefix of the exported result files
paper_pmid, paper_name = 16330752, 'proszynski_walch_2005'

In [3]:
# Tab-separated, header-less list of the datasets associated with this paper.
datasets_list_path = 'extras/YeastPhenome_%s_datasets_list.txt' % paper_pmid
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset ID so it can be reindexed by ID later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read the supplementary table; the first two rows of the sheet are skipped
# so that the third row supplies the column names.
raw_table_path = 'raw_data/09107Table2.xlsx'
original_data = pd.read_excel(raw_table_path, sheet_name='Sheet1', skiprows=2)

In [9]:
# Report the size of the table exactly as loaded (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 146 x 4


In [10]:
# Preview the first rows of the raw table to check how the columns were parsed.
original_data.head()

Unnamed: 0,Phenotype,ORF,Group,Protein
0,II,YJL139C,GLYCAN,Yeast YUR1
1,I and II,YDR050C,GLYCAN,Yeast TPI1
2,I and II,YMR205C,GLYCAN,Yeast PFK2
3,I,YBR183W,LM,Yeast YPC1
4,I,YDR297W,LM,Yeast SUR2


In [11]:
# Work on a string copy of the ORF column; the source column is left untouched.
# Missing values become the literal string 'nan' here.
orf_as_text = original_data['ORF'].astype(str)
original_data['orf'] = orf_as_text

In [12]:
# Strip all white space from the identifiers and upper-case them
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [13]:
# Map every identifier to its systematic ORF name
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [14]:
# Sanity check: list every row whose identifier still does not look like an ORF.
# `t` (True where the identifier is ORF-like) is reused by the next step.
t = looks_like_orf(original_data['orf'])
rows_not_orf = original_data.loc[~t]
print(rows_not_orf)

                                                     Phenotype  ORF Group  \
index_input                                                                 
137                                                        NaN  NaN   NaN   
138          "I and II" - both types of phenotype were obse...  NaN   NaN   
139                                                        NaN  NaN   NaN   
140                                                        NaN  NaN   NaN   
141                                                        NaN  NaN   NaN   
142                                                        NaN  NaN   NaN   
143                                                        NaN  NaN   NaN   
144                                                        NaN  NaN   NaN   
145                                                        NaN  NaN   NaN   

            Protein  orf  
index_input               
137             NaN  NAN  
138             NaN  NAN  
139             NaN  NAN  
140             N

In [15]:
# Keep only the rows flagged as ORF-like; this discards the blank/footnote
# rows printed by the check above.
original_data = original_data.loc[t]

In [17]:
# Turn the textual phenotype label into two binary indicator columns:
# 'I' is 1 when the label includes type I, 'II' is 1 when it includes type II.
# Any other label (including missing values) yields 0 in both columns.
phenotype = original_data['Phenotype']
both_types = ['I and II', 'II and I']
original_data['I'] = phenotype.isin(['I'] + both_types).astype('int64')
original_data['II'] = phenotype.isin(['II'] + both_types).astype('int64')


In [18]:
# Use the systematic ORF name as the row index.
original_data = original_data.set_index('orf')

In [19]:
# Retain only the two indicator columns, as an independent copy.
original_data = original_data.loc[:, ['I', 'II']].copy()

In [20]:
# Collapse rows that share the same ORF by averaging their values.
original_data = original_data.groupby(level=0).mean()

In [21]:
# Size of the table after filtering and collapsing by ORF (rows, columns).
original_data.shape

(137, 2)

In [22]:
# Preview the processed table: one row per ORF, one column per phenotype type.
original_data.head()

Unnamed: 0_level_0,I,II
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
YAL020C,0,1
YAL042W,0,1
YAL046C,1,0
YAL066W,0,1
YBL038W,0,1


# Prepare the final dataset

In [23]:
# Continue on an independent copy so `original_data` stays as processed above.
data = original_data.copy(deep=True)

In [24]:
# Dataset IDs in the same order as the value columns ('I', 'II'); the columns
# are relabelled positionally from this list in the next step.
dataset_ids = [180, 5657]

# Fail fast if an ID is absent from the datasets list: reindex() would otherwise
# silently insert an all-NaN row, which later turns into a NaN column name in
# the exported files.
missing_ids = [dataset_id for dataset_id in dataset_ids if dataset_id not in datasets.index]
if missing_ids:
    raise KeyError('Dataset IDs missing from the datasets list: %s' % missing_ids)

datasets = datasets.reindex(index=dataset_ids)

In [25]:
# Label each column with a (dataset_id, 'value') pair, in the order of `datasets`.
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id', 'data_type'])
data.columns = idx

In [26]:
# Check the relabelled columns: (dataset_id, data_type) pairs over ORF rows.
data.head()

dataset_id,180,5657
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL020C,0,1
YAL042W,0,1
YAL046C,1,0
YAL066W,0,1
YBL038W,0,1


## Subset to the genes currently in SGD

In [27]:
# Load the gene table and re-key it by systematic name so that ORFs can be
# looked up directly; the original 'id' index is kept as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Gene ID for every ORF in `data`, in row order; NaN where the ORF is not found.
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [28]:
# Attach the gene IDs looked up above; they are matched to the rows by position.
data['gene_id'] = gene_ids
# Drop the ORFs for which no gene ID was found.
data = data.loc[data['gene_id'].notnull()]
# All remaining IDs are non-null, so the column can be cast to integer.
data['gene_id'] = data['gene_id'].astype(int)
# Move both identifiers into the row index, leaving only the dataset columns.
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,180,5657
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
18,YAL020C,0,1
40,YAL042W,0,1
44,YAL046C,1,0
60,YAL066W,0,1
126,YBL038W,0,1


# Normalize

In [29]:
# Compute normalized scores from the raw values with the shared helper.
# NOTE(review): the helper is not defined in this notebook (presumably loaded
# via yp_utils); the meaning of has_tested=False should be confirmed there.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [30]:
# Label the normalized columns with (dataset_id, 'valuez') pairs, in the order of `datasets`
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id', 'data_type'])
data_norm.columns = idx

In [31]:
# Blank out normalized scores wherever the raw value is missing.
# Assigning through a boolean DataFrame aligns on column labels. The raw columns
# are (dataset_id, 'value') and the normalized ones (dataset_id, 'valuez'), so
# using data.isnull() directly matches no column and masks nothing. Relabel the
# mask with data_norm's columns first; both column sets are built from
# `datasets` in the same order, so this pairs them positionally while rows are
# still aligned by index label.
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns
data_norm[nan_mask] = np.nan

# Combine raw and normalized values side by side, matched on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,180,5657,180,5657
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
18,YAL020C,0,1,0.0,8.953094
40,YAL042W,0,1,0.0,8.953094
44,YAL046C,1,0,6.898822,0.0
60,YAL066W,0,1,0.0,8.953094
126,YBL038W,0,1,0.0,8.953094


# Print out

In [32]:
# Write one tab-separated file per data type (raw values and normalized values),
# with rows keyed by ORF only and columns named after the datasets.
for data_type in ['value', 'valuez']:
    table = (
        data_all
        .xs(data_type, level='data_type', axis=1)
        .copy()
        .droplevel('gene_id', axis=0)
    )
    table.columns = datasets['name'].values
    table.to_csv('%s_%s.txt' % (paper_name, data_type), sep='\t')

# Save to DB

In [33]:
from IO.save_data_to_db3 import *

In [34]:
# Persist the combined raw + normalized table for this paper.
# NOTE(review): judging by the log output below, this first deletes all datasets
# already stored for the PMID and then inserts the new data, so re-running the
# cell replaces the stored data — confirm against the helper's implementation.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 16330752...
Inserting the new data...


100%|██████████| 2/2 [00:00<00:00,  2.92it/s]

Updating the data_modified_on field...



