In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used to locate the datasets list file and passed to save_data_to_db
paper_pmid = 16251355
# Short label for the paper; used as the prefix of the output .txt files
paper_name = 'rand_grant_2006' 

In [3]:
# Read the tab-separated list of dataset IDs and names for this paper
datasets_path = 'extras/YeastPhenome_{}_datasets_list.txt'.format(paper_pmid)
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the datasets table by dataset ID
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw table from the paper (worksheet 'Sheet1' of Table1.xlsx)
original_data = pd.read_excel(
    'raw_data/Table1.xlsx',
    sheet_name='Sheet1',
)

In [6]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 212 x 3


In [7]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,Gene,ORF,Function
0,Protein synthesis,,
1,RPL31A,YDL075W,60S large subunit ribosomal protein L31.e
2,RPP1B,YDL130W,60S large subunit acidic ribosomal protein L44...
3,RPL35A,YDL191W,60S large subunit ribosomal protein
4,RPL1B,YGL135W,60S large subunit ribosomal protein


In [14]:
# The ORF identifiers sit in the second column; keep a string copy of it for cleaning
orf_source = original_data.columns[1]
original_data['orf'] = original_data[orf_source].astype(str)

In [15]:
# Eliminate all white spaces & capitalize.
# NOTE(review): clean_orf is not defined in this notebook (presumably provided by
# ../yp_utils.py); the description above is taken from the original comment — confirm.
original_data['orf'] = clean_orf(original_data['orf'])

In [16]:
# Translate the cleaned identifiers to ORFs.
# NOTE(review): translate_sc is defined outside this notebook; its behavior is
# assumed from its name and the to='orf' argument — confirm.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [17]:
# Flag the entries that look like ORFs and show the rows that do not
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t, :]
print(not_orf_rows)

                                                  Gene  ORF  Function   orf
index_input                                                                
0                                    Protein synthesis   NaN       NaN  NAN
19                        Transcription/RNA processing   NaN       NaN  NAN
44                                   Secretory pathway   NaN       NaN  NAN
45                            Vacuolar protein sorting   NaN       NaN  NAN
62                                       Glycosylation   NaN       NaN  NAN
68           Vesicular transport (Golgi network, etc.)   NaN       NaN  NAN
80                      Cell cycle and differentiation   NaN       NaN  NAN
106                                         Metabolism   NaN       NaN  NAN
107                                  Carbon metabolism   NaN       NaN  NAN
119                              Amino acid metabolism   NaN       NaN  NAN
127                                   Lipid metabolism   NaN       NaN  NAN
133         

In [18]:
# Keep only the rows whose ORF was recognised. Take an explicit copy so that
# columns added afterwards are written to an independent frame rather than to
# a slice of the unfiltered table (which raises SettingWithCopyWarning).
original_data = original_data.loc[t, :].copy()

In [19]:
# Give every ORF kept from the paper's table the same score (-1); tested strains
# that are absent from the table are filled with 0 when the frame is reindexed
# onto the tested ORFs further down.
# NOTE(review): presumably -1 encodes the direction of the reported phenotype — confirm against the paper.
original_data['data'] = -1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
# Use the ORF as the row index
original_data = original_data.set_index('orf')

In [21]:
# Drop everything except the score column
original_data = original_data.loc[:, ['data']].copy()

In [22]:
# Collapse duplicated ORFs to a single row each by averaging their scores
grouped_by_orf = original_data.groupby(original_data.index)
original_data = grouped_by_orf.mean()

In [23]:
# Sanity check: (number of unique ORFs, number of data columns)
original_data.shape

(195, 1)

# Load & process tested strains

In [24]:
# Load the sheet listing the tested strains
tested = pd.read_excel(
    'raw_data/homozygous_diploid_obs_v1(1).0.xlsx',
    sheet_name='homozygous diploid_obs',
)

In [25]:
# Preview the first rows of the tested-strains table
tested.head()

Unnamed: 0,record number,ORF,batch,Hom Dip prc,Comment
0,30916,YHL047C,chr8_1,Hom Dip 1A2,
1,30917,YHL046C,chr8_1,Hom Dip 1A3,
2,30918,YHL045W,chr8_1,Hom Dip 1A4,
3,30919,YHL044W,chr8_1,Hom Dip 1A5,
4,30920,YHL043W,chr8_1,Hom Dip 1A6,


In [26]:
# Keep a string copy of the ORF column for cleaning (missing entries become the string 'nan')
tested['orf'] = tested['ORF'].astype(str)

In [27]:
# Clean the ORF strings with the shared helper.
# NOTE(review): clean_orf is defined outside this notebook; per the comment at its
# earlier use it strips white space and capitalizes — confirm.
tested['orf'] = clean_orf(tested['orf'])

In [28]:
# Translate the cleaned identifiers to ORFs.
# NOTE(review): translate_sc is defined outside this notebook; behavior assumed from
# its name and the to='orf' argument — confirm.
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [29]:
# Flag the entries that look like ORFs and show the rows that do not
t = looks_like_orf(tested['orf'])
not_orf_rows = tested.loc[~t, :]
print(not_orf_rows)

            record number  ORF batch Hom Dip prc  \
index_input                                        
3600                  NaN  NaN   NaN         NaN   
3788                  NaN  NaN   NaN         NaN   
4761                  NaN  NaN   NaN         NaN   

                                               Comment  orf  
index_input                                                  
3600                        Plates 39-48 do not exist.  NAN  
3788         There is no plate 51.  It does not exist.  NAN  
4761         There is no plate 64.  It does not exist.  NAN  


In [30]:
# Discard the rows without a recognised ORF (comment-only rows in the sheet)
tested = tested.loc[t]

In [31]:
# Distinct ORFs among the tested strains, in order of first appearance
tested_orfs = pd.unique(tested['orf'])

In [32]:
# ORFs from the paper's table that are absent from the tested strains.
# Membership is checked against a set (constant-time lookups) instead of
# scanning the tested_orfs array once per ORF; the order of the result is unchanged.
# Any ORF listed here would be dropped by the reindex onto tested_orfs that follows.
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_orf_set]
missing

[]

In [33]:
# Expand to all tested strains; those not in the paper's table get a score of 0
original_data = original_data.reindex(
    index=tested_orfs,
    fill_value=0,
)

# Prepare the final dataset

In [34]:
# Work on a copy so that original_data is not modified by the steps below
data = original_data.copy()

In [35]:
# IDs of the datasets produced by this notebook, in output column order
dataset_ids = [1305]
# Keep only those rows of the datasets table, in that order.
# NOTE(review): reindex yields an all-NaN row for an ID that is missing from the
# datasets list instead of raising — confirm this is intended.
datasets = datasets.reindex(index=dataset_ids)

In [36]:
# Label each data column with a (dataset_id, data_type) pair; raw scores are 'value'
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [37]:
# Preview the relabelled table
data.head()

dataset_id,1305
data_type,value
orf,Unnamed: 1_level_2
YHL047C,0
YHL046C,0
YHL045W,0
YHL044W,0
YHL043W,0


## Subset to the genes currently in SGD

In [38]:
# Load the gene table and key it by systematic name so ORFs can be mapped to gene IDs
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Gene ID for every ORF in the data; ORFs without a match come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [39]:
# Attach the gene IDs and keep only the ORFs that have one
data['gene_id'] = gene_ids
# Explicit copy: the next line assigns a column on the filtered frame, which is
# the slice-then-assign pattern pandas flags with SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaN rows gone the IDs can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,1305
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
964,YHL047C,0
963,YHL046C,0
962,YHL045W,0
961,YHL044W,0
960,YHL043W,0


# Normalize

In [40]:
# Normalize the scores with the shared helper (defined outside this notebook).
# NOTE(review): the meaning of has_tested=True is not visible here — presumably it
# tells the helper that the table already covers all tested strains; confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [41]:
# Label the normalized columns with (dataset_id, 'valuez') pairs
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [42]:
# Propagate missing raw values into the normalized table.
# Assignment through a boolean DataFrame mask aligns on row AND column labels.
# `data` is labelled (dataset_id, 'value') while `data_norm` is labelled
# (dataset_id, 'valuez'), so the unmodified mask would match no column and
# nothing would be set to NaN. Relabel the mask to data_norm's columns first;
# both column indexes were built from datasets.index.values in the same order.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Combine raw and normalized values side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,1305,1305
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
964,YHL047C,0,0.0
963,YHL046C,0,0.0
962,YHL045W,0,0.0
961,YHL044W,0,0.0
960,YHL043W,0,0.0


# Print out

In [43]:
# Write one tab-separated file per data type, with dataset names as column headers
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    # Rows are identified by ORF only in the exported files
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [44]:
from IO.save_data_to_db3 import *

In [45]:
# Store raw and normalized values in the database under this paper's PMID.
# Judging by the recorded output, this deletes the paper's existing datasets
# before inserting the new data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 16251355...
Inserting the new data...


100%|██████████| 1/1 [00:08<00:00,  8.31s/it]

Updating the data_modified_on field...



