In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
paper_pmid = 16630279
paper_name = 'snoek_steensma_2006' 

In [3]:
datasets = pd.read_csv('extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt', sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
datasets.set_index('dataset_id', inplace=True)

# Load & process the data

In [8]:
original_data = pd.read_csv('raw_data/hom_hits.txt', sep='\t', header=None)

In [9]:
print('Original data dimensions: %d x %d' % (original_data.shape))

Original data dimensions: 24 x 3


In [10]:
# Preview the first rows of the raw hit list.
original_data.head()

Unnamed: 0,0,1,2
0,YAL026C,DRS2,Integral membrane Ca(2+)-ATPase
1,YPL254W,HFI1,Subunit of SAGA
2,YBR179C,FZO1,Mitochondrial integral membrane protein
3,YDR138W,HPR1,Subunit of THO/TREX
4,YDR364C,CDC40,Splicing factor


In [11]:
# Seed the 'orf' column from the first raw column, forced to string.
original_data['orf'] = original_data[0].astype(str)

In [12]:
# Eliminate all white spaces & capitalize
original_data['orf'] = clean_orf(original_data['orf'])

In [13]:
# Translate to ORFs
# NOTE(review): translate_sc comes from yp_utils; presumably it maps gene names to systematic ORF names — confirm there.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [14]:
# Make sure everything translated ok
# Print every row that does not pass looks_like_orf; an empty frame means all rows passed.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t,])

Empty DataFrame
Columns: [0, 1, 2, orf]
Index: []


In [15]:
original_data['data'] = -1

In [16]:
original_data.set_index('orf', inplace=True)

In [17]:
original_data = original_data[['data']].copy()

In [18]:
original_data = original_data.groupby(original_data.index).mean()

In [19]:
original_data.shape

(23, 1)

# Load & process tested strains

In [22]:
tested = pd.read_csv('raw_data/Homo_diploids_041902.txt', sep='\t', skiprows=1)

In [23]:
tested.head()

Unnamed: 0,record no.,ORF name,strain,batch,plate,row,column,RG NOTES,QC NOTES
0,30338.0,YAL068C,HOM DIP,01_1,301.0,A,2,,
1,30339.0,YAL067C,HOM DIP,01_1,301.0,A,3,,
2,30340.0,YAL066W,HOM DIP,01_1,301.0,A,4,,
3,30341.0,YAL065C,HOM DIP,01_1,301.0,A,5,,
4,30345.0,YAL062W,HOM DIP,01_1,301.0,A,6,,


In [24]:
# Seed the 'orf' column from the 'ORF name' column, forced to string (missing names become the text 'nan').
tested['orf'] = tested['ORF name'].astype(str)

In [25]:
# Same clean-up helper as used for the hit list.
tested['orf'] = clean_orf(tested['orf'])

In [28]:
# Manual typo fix: the source file spells this ORF with letter O's instead of zeros.
tested.loc[tested['orf']=='YELOO1C','orf'] = 'YEL001C'

In [29]:
# NOTE(review): translate_sc comes from yp_utils; presumably it maps names to systematic ORF names — confirm there.
tested['orf'] = translate_sc(tested['orf'], to='orf')

In [30]:
# Make sure everything translated ok
# `t` is reused by the next cell to drop the rows printed here.
# NOTE(review): the printed output shows 'YMR41W' also fails the check and is therefore dropped — confirm whether it should be corrected like YELOO1C instead.
t = looks_like_orf(tested['orf'])
print(tested.loc[~t,])

             record no. ORF name   strain batch  plate  row   column RG NOTES  \
index_input                                                                     
346             30827.0   YMR41W  HOM DIP  13_5  304.0    F        6      NaN   
3449                NaN      NaN      NaN   NaN    NaN  NaN  End 337      NaN   
4492                NaN      NaN      NaN   NaN    NaN  NaN  end 349      NaN   
4646                NaN      NaN      NaN   NaN    NaN  NaN  end 371      NaN   
4713                NaN      NaN      NaN   NaN    NaN  NaN  end 372      NaN   
4785                NaN      NaN      NaN   NaN    NaN  NaN  end 380      NaN   
4797                NaN      NaN      NaN   NaN    NaN  NaN  end 381      NaN   

            QC NOTES     orf  
index_input                   
346              NaN  YMR41W  
3449             NaN     NAN  
4492             NaN     NAN  
4646             NaN     NAN  
4713             NaN     NAN  
4785             NaN     NAN  
4797             NaN  

In [31]:
tested = tested.loc[t,:]

In [32]:
tested_orfs = tested['orf'].unique()

In [33]:
missing = [orf for orf in original_data.index.values if orf not in tested_orfs]
missing

[]

In [34]:
# Expand the table to one row per tested ORF; ORFs that were tested but are not in the hit list get a score of 0.
original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [35]:
data = original_data.copy()

In [36]:
dataset_ids = [498]
datasets = datasets.reindex(index=dataset_ids)

In [37]:
lst = [datasets.index.values, ['value']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [38]:
data.head()

dataset_id,498
data_type,value
orf,Unnamed: 1_level_2
YAL068C,0
YAL067C,0
YAL066W,0
YAL065C,0
YAL062W,0


## Subset to the genes currently in SGD

In [39]:
# Gene table re-keyed by systematic name, with 'id' kept as an ordinary column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs absent from the table come back as NaN.
gene_ids = genes['id'].reindex(index=data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [40]:
# Attach the gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: assigning a column on a frame
# obtained by .loc filtering is the chained-assignment pattern that triggers
# SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaNs gone the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,498
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1869,YAL068C,0
61,YAL067C,0
60,YAL066W,0
1727,YAL065C,0
57,YAL062W,0


# Normalize

In [41]:
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [42]:
# Assign proper column names
lst = [datasets.index.values, ['valuez']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [43]:
# Propagate missing raw values to the normalized scores.
# Assignment through a boolean DataFrame aligns the mask on row AND column
# labels. `data` is labelled (dataset_id, 'value') while `data_norm` is labelled
# (dataset_id, 'valuez'), so an unrelabelled mask matches no column and the
# assignment silently does nothing. Both frames list the datasets in the same
# order, so the mask is relabelled positionally before use.
null_mask = data.isnull()
null_mask.columns = data_norm.columns
data_norm[null_mask] = np.nan
# Put raw ('value') and normalized ('valuez') columns side by side.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,498,498
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1869,YAL068C,0,0.0
61,YAL067C,0,0.0
60,YAL066W,0,0.0
1727,YAL065C,0,0.0
57,YAL062W,0,0.0


# Print out

In [44]:
# Write one tab-separated file per data type, with dataset names as column
# headers and rows keyed by ORF only.
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    out_path = paper_name + '_' + data_type + '.txt'
    table.droplevel('gene_id', axis=0).to_csv(out_path, sep='\t')

# Save to DB

In [45]:
# NOTE(review): wildcard import; presumably this is where save_data_to_db comes from — confirm and import it by name.
from IO.save_data_to_db3 import *

In [46]:
# Per the logged output below, this deletes all datasets stored for this PMID before inserting the new data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 16630279...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.81s/it]

Updating the data_modified_on field...



