In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the source publication; reused below for file names and the DB upload
paper_name = 'zhang_schneider_2002'
paper_pmid = 12477387

In [3]:
# Tab-separated, header-less list of (dataset_id, name) pairs for this paper
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(datasets_path, sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
# Key the dataset list by dataset_id so it can be reindexed by id later
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read the 'Size Data' sheet without a header row, so columns get integer labels
original_data = pd.read_excel(
    'raw_data/Zhang et al supplemental data.xlsx',
    sheet_name='Size Data',
    header=None,
)

In [9]:
# Report the size of the raw table as rows x columns
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5578 x 9


In [10]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,YAL068C,301,A,2.0,49.4838,51.666157,45.2132,,
1,YAL067C,301,A,3.0,49.2236,50.369021,27.8553,,
2,YAL066W,301,A,4.0,49.5164,51.766849,32.4166,,
3,YAL065C,301,A,5.0,47.8402,50.865318,38.8201,,
4,YAL062W,301,A,6.0,48.1596,50.039956,48.8672,,


In [11]:
# Column 0 carries the ORF names; keep a string copy in a dedicated 'orf' column
original_data['orf'] = original_data[0].astype(str)

In [12]:
# Eliminate all whitespace & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably loaded by the
# %run of yp_utils.py); the description above is the original author's — confirm.
original_data['orf'] = clean_orf(original_data['orf'])

In [15]:
# Correct ORF names that are misspelled in the source spreadsheet;
# names without an entry in the table pass through unchanged
typo_fixes = {'TAL004W':'YAL004W','YELOO1C':'YEL001C','KL187C':'YKL187C'}
original_data['orf'] = original_data['orf'].map(lambda orf: typo_fixes.get(orf, orf))

In [16]:
# Translate to ORFs
# NOTE(review): translate_sc is defined outside this notebook (presumably in
# yp_utils.py); its exact mapping rules are not visible here.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [17]:
# Sanity check: list every row whose 'orf' value does not look like an ORF.
# `t` is the boolean mask of valid rows and is reused to filter the table.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t])

                         0         1    2     3        4           5        6  \
index_input                                                                     
345                 YMR41W       304    F   6.0  44.0874   47.079699  47.9361   
5561                   NaN       NaN  NaN   NaN      NaN         NaN      NaN   
5562                   NaN   Average  NaN   NaN  50.8196   52.318737   46.887   
5563                   NaN   Std Dev  NaN   NaN  6.68172    5.825714   8.9566   
5564                   NaN  Whi 1 SD  NaN   NaN  44.1379   46.493023  37.9304   
5565                   NaN   Uge 1SD  NaN   NaN  57.5013   58.144452  55.8436   
5566                   NaN  Whi 2 SD  NaN   NaN  37.4562   40.667309  28.9738   
5567                   NaN   Uge 2SD  NaN   NaN   64.183   63.970166  64.8002   
5568                   NaN       NaN  NaN   NaN      NaN         NaN      NaN   
5569                   NaN       NaN  NaN   NaN      NaN         NaN      NaN   
5570                   NaN  

In [18]:
# Keep only the rows whose ORF passed the check above. The explicit copy makes the
# result an independent frame, so adding the 'data' column afterwards does not
# assign into a slice of the unfiltered table (SettingWithCopyWarning).
original_data = original_data.loc[t,:].copy()

In [19]:
# Use spreadsheet column 4 as the value; anything non-numeric becomes NaN
values = pd.to_numeric(original_data[4], errors='coerce')
original_data['data'] = values

In [20]:
# Index the table by ORF
original_data = original_data.set_index('orf')

In [21]:
# Drop everything except the value column (kept as a one-column frame)
original_data = original_data.loc[:, ['data']].copy()

In [22]:
# Collapse ORFs that occur more than once by averaging their values
original_data = original_data.groupby(level=0).mean()

In [23]:
# Dimensions after averaging duplicate ORFs
original_data.shape

(5437, 1)

In [24]:
# To separate HOM from HET data, use the genes in today's HOM collection from Open Biosystems
hom = pd.read_excel(
    'extras/Homozygous_diploid_obs_v7.0.xlsx',
    sheet_name='DATA',
)
hom.head()

Unnamed: 0,Record number,ORF,Batch,Collection,Plate,Row,Col,Comment
0,30916,YHL047C,chr8_1,Hom Dip,1,A,2,
1,30917,YHL046C,chr8_1,Hom Dip,1,A,3,
2,30918,YHL045W,chr8_1,Hom Dip,1,A,4,
3,30919,YHL044W,chr8_1,Hom Dip,1,A,5,
4,30920,YHL043W,chr8_1,Hom Dip,1,A,6,


In [25]:
# String copy of the 'ORF' column, processed below the same way as the main table
hom['orf'] = hom['ORF'].astype(str)

In [26]:
# Same clean_orf step that was applied to original_data['orf']
hom['orf'] = clean_orf(hom['orf'])

In [27]:
# Same translate_sc step that was applied to original_data['orf']
hom['orf'] = translate_sc(hom['orf'], to='orf')

In [28]:
# Sanity check: print any hom rows whose 'orf' does not look like an ORF
# (note: this rebinds the mask `t` used earlier for original_data)
t = looks_like_orf(hom['orf'])
print(hom.loc[~t])

Empty DataFrame
Columns: [Record number, ORF, Batch, Collection, Plate, Row, Col, Comment, orf]
Index: []


In [29]:
# Distinct ORFs present in the hom list
hom_orfs = hom['orf'].unique()

In [30]:
# Split the measured ORFs by membership in the hom list.
# Index.isin does the membership test in one vectorized pass, instead of scanning
# the whole hom_orfs array once per ORF as `orf in hom_orfs` did.
in_hom = original_data.index.isin(hom_orfs)
orfs_in_hom = original_data.index[in_hom].tolist()
orfs_in_het = original_data.index[~in_hom].tolist()

In [31]:
# Two independent frames: ORFs found in the hom list, and all remaining ORFs
original_data1 = original_data.loc[orfs_in_hom].copy()
original_data2 = original_data.loc[orfs_in_het].copy()

In [32]:
# Put the two groups side by side as columns data_1 / data_2; the outer join keeps
# every ORF and leaves NaN in the column of the group it does not belong to
original_data = original_data1.join(
    original_data2,
    how='outer',
    lsuffix='_1',
    rsuffix='_2',
)

In [33]:
# Preview the two-column table
original_data.head()

Unnamed: 0_level_0,data_1,data_2
orf,Unnamed: 1_level_1,Unnamed: 2_level_1
YAL001C,,41.109257
YAL002W,54.292354,
YAL003W,,38.903418
YAL004W,53.087772,
YAL005C,54.292182,


# Prepare the final dataset

In [34]:
# Work on a copy so original_data stays untouched by the steps below
data = original_data.copy()

In [35]:
# Dataset ids in the same order as the columns of data
# (first: ORFs in the hom list, second: the remaining ORFs)
dataset_ids = [477, 5384]
datasets = datasets.reindex(dataset_ids)

In [36]:
# Label the columns as (dataset_id, 'value') pairs
value_tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_tuples, names=['dataset_id','data_type'])

In [37]:
# Preview the relabeled table
data.head()

dataset_id,477,5384
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL001C,,41.109257
YAL002W,54.292354,
YAL003W,,38.903418
YAL004W,53.087772,
YAL005C,54.292182,


## Subset to the genes currently in SGD

In [38]:
# Look up the id of every ORF in the data, using the gene table keyed by systematic name
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
# ORFs absent from the gene table come back as NaN after reindexing
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 36


In [41]:
# Attach the gene ids computed in the previous cell (same row order as data)
data['gene_id'] = gene_ids
# Drop ORFs without a gene id. The explicit copy makes the filtered frame
# independent, so the dtype cast below does not assign into a slice of the old
# frame (SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,477,5384
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1,YAL001C,,41.109257
2,YAL002W,54.292354,
3,YAL003W,,38.903418
1863,YAL004W,53.087772,
4,YAL005C,54.292182,


# Normalize

In [42]:
# Compute normalized scores from the raw values.
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook; the
# meaning of has_tested=True is not visible here — confirm against its definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [43]:
# Assign proper column names: (dataset_id, 'valuez') pairs
valuez_tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_tuples, names=['dataset_id','data_type'])

In [44]:
# Blank out normalized scores wherever the raw value is missing.
# A boolean-DataFrame mask is aligned to the target on row AND column labels.
# data is labeled (dataset_id, 'value') while data_norm is labeled
# (dataset_id, 'valuez'), so the raw mask matched no column and masked nothing.
# Both frames list the datasets in the same order, so relabel the mask by position.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Raw and normalized columns side by side, joined on the (gene_id, orf) index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,477,5384,477,5384
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,YAL001C,,41.109257,,-1.613651
2,YAL002W,54.292354,,0.639609,
3,YAL003W,,38.903418,,-2.058205
1863,YAL004W,53.087772,,0.471617,
4,YAL005C,54.292182,,0.639585,


# Print out

In [45]:
# Write one tab-separated file per data type, with dataset names as column headers
# and rows keyed by ORF only (the gene_id index level is dropped)
for data_type in ['value','valuez']:
    out_path = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [46]:
from IO.save_data_to_db3 import *

In [47]:
# Upload the combined raw + normalized table under this paper's PMID.
# Per the logged output, existing datasets for the PMID are deleted before the
# new data is inserted, so re-running this cell replaces the stored data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 12477387...
Inserting the new data...


100%|██████████| 2/2 [00:14<00:00,  7.30s/it]

Updating the data_modified_on field...



