In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Paper identifiers: the PMID selects the dataset list and the DB records;
# the paper name prefixes the exported text files.
paper_pmid = 11907266
paper_name = "dimmer_westermann_2002"

In [3]:
# Dataset list for this paper: tab-separated, no header row, two columns.
datasets = pd.read_csv(
    f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by dataset_id so it can be reindexed by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [18]:
# Input files and, at the same position, the score given to every ORF listed in that file.
files = [
    'no_growth.txt',
    'poor_growth.txt',
]
scores = [
    -1,
    -0.5,
]

In [19]:
# Parse each phenotype file into a one-column frame of scores indexed by ORF.
# Each input line is kept whole in column 0; the ORF is taken as the first
# space- or tab-delimited token, and every ORF found in a file receives that
# file's score (files and scores are paired by position).
assert len(files) == len(scores), 'files and scores must have the same length'

original_data_list = []
for f, score in zip(files, scores):
    # One row per line. pd.read_csv(..., sep='\n') is rejected by recent pandas
    # releases (a line terminator cannot be used as the separator), so the lines
    # are read directly instead. Blank lines are skipped.
    with open('raw_data/' + f, encoding='utf-8') as fh:
        lines = [line.rstrip('\r\n') for line in fh if line.strip()]
    original_data = pd.DataFrame({0: lines})
    print('Original data dimensions: %d x %d' % (original_data.shape))
    print(original_data.head())

    original_data['orf'] = original_data[0].apply(lambda x: re.split(' |\t', x)[0])
    original_data['orf'] = original_data['orf'].astype(str)
    original_data['orf'] = clean_orf(original_data['orf'])
    original_data['orf'] = translate_sc(original_data['orf'], to='orf')
    t = looks_like_orf(original_data['orf'])
    # Show the rows whose first token is not an ORF (e.g. wrapped description
    # lines) before they are dropped.
    print(original_data.loc[~t,])

    # .copy() so the column assignment below writes to an independent frame
    # rather than to a slice of the unfiltered one (avoids SettingWithCopyWarning).
    original_data = original_data.loc[t, :].copy()
    original_data['data'] = score
    original_data.set_index('orf', inplace=True)
    original_data = original_data[['data']].copy()
    # Collapse ORFs listed more than once within the same file.
    original_data = original_data.groupby(original_data.index).mean()

    print(original_data.shape)

    original_data_list.append(original_data)

Original data dimensions: 379 x 1
                                                   0
0  YCR028C-A RIM1, binds single-stranded DNA, req...
1  YDR296W\tMHR1, involved in repair, recombinati...
2  YHR120W MSH1, involved in mitochondrial DNA re...
3  YJR144W MGM101, mitochondrial genome maintenan...
4  YML061C PIF1, single-stranded DNA-dependent AT...
                                                             0            orf
index_input                                                                  
6                                      of mitochondrial genome             OF
8                                         mitochondrial genome  MITOCHONDRIAL
12                            group II introns of COX1 and COB          GROUP
14                                      localization predicted   LOCALIZATION
17                             mitochondrial rRNA (LSU) intron  MITOCHONDRIAL
24                                    mitochondrial biogenesis  MITOCHONDRIAL
112                     

In [22]:
# Stack the per-file score tables; an ORF that appears in more than one file
# ends up with the mean of its scores.
original_data = (
    pd.concat(original_data_list, axis=0)
    .groupby(level=0)
    .mean()
)

In [23]:
# Preview the first rows of the combined score table.
original_data.head()

Unnamed: 0_level_0,data
orf,Unnamed: 1_level_1
YAL010C,-1.0
YAL012W,-1.0
YAL016W,-1.0
YAL039C,-1.0
YAL044C,-1.0


In [24]:
# Dimensions (rows, columns) of the combined score table.
original_data.shape

(403, 1)

# Load & process tested strains

In [55]:
# List of strains tested in the screen, split on single spaces.
# NOTE(review): this split produces dozens of unnamed columns and the fields after
# the ORF do not line up on every row — confirm that taking the ORF from one fixed
# column downstream does not silently drop strains.
tested = pd.read_csv(
    'raw_data/HOMOZYGOUS DIPLOID 1+2 ResGen.txt',
    sep=' ',
)

In [56]:
# Preview the raw parse to see which column holds the ORF names.
tested.head()

Unnamed: 0,HOMOZYGOUS,DIPLOID,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59
0,record,no.,,,,,,ORF,name,,...,,,,,,,,,,
1,30338,,,YAL068C,HOM,DIP,01_1,,,,...,,,,,,,,,,
2,30339,,,YAL067C,HOM,DIP,01_1,,,,...,,,,,,,,,,
3,30340,,,YAL066W,HOM,DIP,01_1,,,,...,,,,,,,,,,
4,30341,,,YAL065C,HOM,DIP,01_1,,,,...,,,,,,,,,,


In [57]:
# Take the ORF names from the 'Unnamed: 3' column and store them as strings.
orf_raw = tested['Unnamed: 3']
tested['orf'] = orf_raw.astype(str)

In [58]:
# Clean the ORF strings with the shared clean_orf helper.
orf_cleaned = clean_orf(tested['orf'])
tested['orf'] = orf_cleaned

In [59]:
# Fix a typo in the source list: letter O's in place of zeros.
is_typo = tested['orf'] == 'YELOO1C'
tested.loc[is_typo, 'orf'] = 'YEL001C'

In [60]:
# Translate the names to ORF form with the shared translate_sc helper.
orf_translated = translate_sc(tested['orf'], to='orf')
tested['orf'] = orf_translated

In [61]:
# Flag the entries that look like ORFs and print the rows that do not,
# so they can be inspected before being dropped.
t = looks_like_orf(tested['orf'])
print(tested[~t])

            HOMOZYGOUS DIPLOID  Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5  \
index_input                                                                   
0               record     no.         NaN        NaN        NaN        NaN   
347              30827     NaN         NaN     YMR41W        NaN        HOM   
3450               NaN     NaN         NaN        NaN        NaN        NaN   
4493               NaN     NaN         NaN        NaN        NaN        NaN   
4647               NaN     NaN         NaN        NaN        NaN        NaN   
4714               NaN     NaN         NaN        NaN        NaN        NaN   
4786               NaN     NaN         NaN        NaN        NaN        NaN   
4798               NaN     NaN         NaN        NaN        NaN        NaN   

            Unnamed: 6 Unnamed: 7 Unnamed: 8  Unnamed: 9  ... Unnamed: 51  \
index_input                                               ...               
0                  NaN        ORF       name         Na

In [62]:
# Keep only the rows whose ORF passed the check.
tested = tested[t]

In [63]:
# Distinct ORFs among the tested strains, in order of first appearance.
tested_orfs = tested.loc[:, 'orf'].unique()

In [64]:
# ORFs that carry a score but are absent from the tested list (expected: none).
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index if orf not in tested_orf_set]
missing

[]

In [65]:
# Expand to every tested ORF; tested ORFs not listed in either phenotype file get a score of 0.
original_data = original_data.reindex(tested_orfs, fill_value=0)

# Prepare the final dataset

In [66]:
# Work on an independent copy so original_data stays untouched by later steps.
data = original_data.copy(deep=True)

In [67]:
# Dataset id(s) matching the data column(s), in column order.
dataset_ids = [470]
datasets = datasets.reindex(dataset_ids)

In [68]:
# Label each data column as (dataset_id, 'value').
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [69]:
# Preview the table after relabelling its columns.
data.head()

dataset_id,470
data_type,value
orf,Unnamed: 1_level_2
YAL068C,0.0
YAL067C,0.0
YAL066W,0.0
YAL065C,0.0
YAL062W,0.0


## Subset to the genes currently in SGD

In [70]:
# Look up the gene id of every ORF in the gene table, keyed by systematic name.
# ORFs without a match come back as NaN and are counted as missing.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [71]:
# Attach the gene ids, drop ORFs that have no gene id, and index the table by (gene_id, orf).
data['gene_id'] = gene_ids
# .copy() so the dtype conversion below writes to an independent frame rather
# than to a filtered slice of the original (avoids SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaN rows removed, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,470
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
1869,YAL068C,0.0
61,YAL067C,0.0
60,YAL066W,0.0
1727,YAL065C,0.0
57,YAL062W,0.0


# Normalize

In [72]:
# Normalize the scores with the shared helper.
# NOTE(review): has_tested=True presumably tells the helper that the table covers
# every tested strain — confirm against its definition.
data_norm = normalize_phenotypic_scores(
    data,
    has_tested=True,
)

In [73]:
# Label each normalized column as (dataset_id, 'valuez').
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [74]:
# Blank out normalized values wherever the raw value is missing, then put the
# raw and normalized columns side by side.
#
# A boolean DataFrame mask is aligned by label before it is applied. data has
# (dataset_id, 'value') columns and data_norm has (dataset_id, 'valuez'), so
# passing data.isnull() directly matches no column and changes nothing. The
# mask is therefore relabelled to data_norm's columns (paired by position —
# both were built from the same dataset id order) and aligned on the rows.
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns
nan_mask = nan_mask.reindex(index=data_norm.index, fill_value=False)
data_norm[nan_mask] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,470,470
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
1869,YAL068C,0.0,0.0
61,YAL067C,0.0,0.0
60,YAL066W,0.0,0.0
1727,YAL065C,0.0,0.0
57,YAL062W,0.0,0.0


# Print out

In [75]:
# Write one tab-separated file per data type, with columns named after the
# datasets and rows indexed by ORF only.
for data_type in ['value', 'valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = f'{paper_name}_{data_type}.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [76]:
# Import only the function used below; a wildcard import hides where names
# come from and can silently shadow names already defined in the notebook.
from IO.save_data_to_db3 import save_data_to_db

In [77]:
# Write the final table to the database for this paper.
# Caution: per the run log, this first deletes all existing datasets for the PMID
# and then inserts the new data.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 11907266...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.74s/it]

Updating the data_modified_on field...



