In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the source publication: PubMed ID and the short label used in output file names
paper_pmid, paper_name = 26163422, 'yimit_brown_2015'

In [3]:
# Headerless, tab-separated list of the datasets attached to this paper (id, name)
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(datasets_path, sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
# Key the dataset list by its dataset_id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [23]:
# Read sheet 'Table S4' of the supplementary workbook, skipping its first two rows
original_data = pd.read_excel(
    'raw_data/TableS4.xlsx',
    sheet_name='Table S4',
    skiprows=2,
)

In [24]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 52 x 12


In [25]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,Standard Name,Systematic Name,Result,Confirmed in quantitative analysis,GO function: DNA Damage Response (GO:0006974),GO function: DNA Repair (GO:0006281),Constitutive RNR3 (Hendry et al),Increased Rad52 foci (Alvaro et al),dna2-1 negative genetic interactions (Budd et al),Dna2 nuclear fluorescence,Dna2-GFP focus intensity,Dna2-GFP abundance
0,BRP1,YGL007W,Phleo treated cells have fewer foci,yes,,,,,,Decreased,Decreased,Decreased
1,CKB1,YGL019W,Phleo treated cells have fewer foci,yes,,,,,,Decreased,Decreased,Decreased
2,DEG1,YFL001W,Phleo treated cells have fewer foci,yes,,,,,,,Decreased,
3,ECM32,YER176W,Phleo treated cells have fewer foci,yes,,,,,,,,
4,RAD27,YKL113C,Phleo treated cells have fewer foci,yes,yes,yes,yes,,yes,,Increased,


In [26]:
# Start the working 'orf' column from the systematic names, forced to strings
original_data = original_data.assign(orf=original_data['Systematic Name'].astype(str))

In [27]:
# Normalize the identifiers with clean_orf (per the original note: removes white space and capitalizes)
orf_cleaned = clean_orf(original_data['orf'])
original_data['orf'] = orf_cleaned

In [28]:
# Convert whatever identifiers are present into ORF names
orf_translated = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = orf_translated

In [29]:
# Sanity check: print every row whose identifier does not look like an ORF
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t,]
print(not_orf_rows)

                                                 Standard Name  \
index_input                                                      
47                                                         NaN   
48           GO function annotations are from GO term finde...   
49           Hendry, J.A., G. Tan, J. Ou, C. Boone, and G.W...   
50           Alvaro, D., M. Lisby, and R. Rothstein, 2007 G...   
51           Budd, M.E., A.H. Tong, P. Polaczek, X. Peng, C...   

            Systematic Name Result Confirmed in quantitative analysis  \
index_input                                                             
47                      NaN    NaN                                NaN   
48                      NaN    NaN                                NaN   
49                      NaN    NaN                                NaN   
50                      NaN    NaN                                NaN   
51                      NaN    NaN                                NaN   

            GO function: 

In [30]:
# Keep only the rows with ORF-like identifiers.
# The explicit copy makes the result an independent frame, so the score columns
# added in later cells are not written to a slice of the raw table
# (which would trigger pandas' SettingWithCopyWarning).
original_data = original_data.loc[t,:].copy()

In [16]:
# Score assigned to each 'Result' label.
# NOTE(review): nothing visible in this notebook reads this mapping, and the cell's
# execution count is out of sequence with its neighbours — confirm whether it is still needed.
data_switch = dict([
    ('Phleo treated cells have fewer foci', -1),
    ('Untreated cells have more foci', 1),
])

In [31]:
# data1: -1 for rows reported as having fewer foci after phleo treatment, 0 otherwise
fewer_foci = original_data['Result'] == 'Phleo treated cells have fewer foci'
original_data['data1'] = 0
original_data.loc[fewer_foci, 'data1'] = -1

In [32]:
# data2: 1 for rows reported as having more foci when untreated, 0 otherwise
more_foci = original_data['Result'] == 'Untreated cells have more foci'
original_data['data2'] = 0
original_data.loc[more_foci, 'data2'] = 1

In [33]:
# Index the table by ORF
original_data = original_data.set_index('orf')

In [34]:
# Keep only the two score columns, as an independent frame
score_columns = ['data1', 'data2']
original_data = original_data.loc[:, score_columns].copy()

In [35]:
# Collapse rows that share an ORF by averaging their scores
rows_by_orf = original_data.groupby(original_data.index)
original_data = rows_by_orf.mean()

In [36]:
# Size of the per-ORF score table (rows, columns)
original_data.shape

(47, 2)

# Load & process tested strains

In [39]:
# Headerless, tab-separated file of array genes; the columns get integer labels
tested = pd.read_csv(
    'raw_data/FG_array_genes.txt',
    sep='\t',
    header=None,
)

In [41]:
# Take the first column as strings, then normalize the identifiers with clean_orf
first_column = tested[0].astype(str)
tested['orf'] = first_column
tested['orf'] = clean_orf(tested['orf'])

In [42]:
# Convert the tested-strain identifiers into ORF names
tested_translated = translate_sc(tested['orf'], to='orf')
tested['orf'] = tested_translated

In [43]:
# Sanity check: print every tested entry whose identifier does not look like an ORF
t = looks_like_orf(tested['orf'])
not_orf_tested = tested.loc[~t,]
print(not_orf_tested)

Empty DataFrame
Columns: [0, orf]
Index: []


In [44]:
# Sorted, de-duplicated array of tested ORFs
tested_orf_values = tested['orf'].values
tested_orfs = np.unique(tested_orf_values)

In [45]:
# ORFs with a reported result that are absent from the tested-strain list.
# Membership is checked against a set so the tested array is not rescanned
# for every ORF; the resulting list is the same.
tested_orf_set = set(tested_orfs)
missing = [orf for orf in original_data.index.values if orf not in tested_orf_set]
missing

[]

In [46]:
# Expand to every tested ORF; ORFs without a reported result get a score of 0
original_data = original_data.reindex(
    index=tested_orfs,
    fill_value=0,
)

# Prepare the final dataset

In [47]:
# Work on an independent copy from here on
data = original_data.copy(deep=True)

In [48]:
# Dataset ids in the same order as the data columns (data1, data2)
dataset_ids = [
    16255,
    16320,
]
datasets = datasets.reindex(index=dataset_ids)

In [49]:
# Relabel the columns as (dataset_id, 'value') pairs
value_tags = ['value'] * datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, value_tags],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [50]:
# Preview the relabelled table
data.head()

dataset_id,16255,16320
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL002W,0,0
YAL004W,0,0
YAL005C,0,0
YAL007C,0,0
YAL008W,0,0


## Subset to the genes currently in SGD

In [51]:
# Look up the gene id of every ORF by systematic name; unmatched ORFs come back as NaN
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [52]:
# Attach the gene ids, drop ORFs without one, and index by (gene_id, orf).
# The explicit copy after filtering makes the frame independent, so the integer
# cast below is not an assignment into a slice of the previous frame
# (which would trigger pandas' SettingWithCopyWarning).
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16255,16320
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0,0
1863,YAL004W,0,0
4,YAL005C,0,0
5,YAL007C,0,0
6,YAL008W,0,0


# Normalize

In [53]:
# Normalize the scores with a helper that is not defined in this notebook.
# NOTE(review): has_tested=True presumably tells it the frame covers all tested strains — confirm against the helper.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [54]:
# Label the normalized columns as (dataset_id, 'valuez') pairs
valuez_tags = ['valuez'] * datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, valuez_tags],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [55]:
# Blank out normalized scores wherever the raw value is missing, then put the raw
# and normalized columns side by side.
# Assigning through a boolean frame aligns it on row and column labels. The raw
# columns are tagged 'value' and the normalized ones 'valuez', so the unrelabelled
# mask shares no column with data_norm and would never mask anything. Relabel the
# mask columns first; rows still align by label.
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns  # assumes both frames list the datasets in the same order
data_norm[nan_mask] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16255,16320,16255,16320
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,0,0,0.0,0.0
1863,YAL004W,0,0,0.0,0.0
4,YAL005C,0,0,0.0,0.0
5,YAL007C,0,0,0.0,0.0
6,YAL008W,0,0,0.0,0.0


# Print out

In [56]:
# Write one tab-separated file per data type, using dataset names as column headers
# and dropping the gene_id index level
for f in ('value', 'valuez'):
    out_path = paper_name + '_' + f + '.txt'
    df = data_all.xs(f, level='data_type', axis=1).copy()
    df.columns = datasets['name'].values
    df = df.droplevel('gene_id', axis=0)
    df.to_csv(out_path, sep='\t')

# Save to DB

In [57]:
from IO.save_data_to_db3 import *

In [58]:
# Push the combined table to the database for this paper.
# Per the logged output, existing datasets for the PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 26163422...
Inserting the new data...


100%|██████████| 2/2 [00:14<00:00,  7.41s/it]

Updating the data_modified_on field...



