In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
paper_pmid = 27088128
paper_name = 'koselny_krysan_2016' 

In [3]:
datasets = pd.read_csv('extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt', sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
datasets.set_index('dataset_id', inplace=True)

# Load & process the data

In [23]:
original_data1 = pd.read_excel("raw_data/deliv_AltComparisons - Lacey's data - Sorted by Pvalue.xlsx", sheet_name='DowntagFC', skiprows=1)
original_data2 = pd.read_excel("raw_data/deliv_AltComparisons - Lacey's data - Sorted by Pvalue.xlsx", sheet_name='UptagFC', skiprows=1)

In [24]:
print('Original data dimensions: %d x %d' % (original_data1.shape))
print('Original data dimensions: %d x %d' % (original_data2.shape))

Original data dimensions: 6293 x 12
Original data dimensions: 6446 x 12


In [25]:
original_data1.head()

Unnamed: 0,orf_name,A,B,C,D,All 2's vs. All 0's,Unnamed: 6,Unnamed: 7,<-1,0.6307692307692307,Unnamed: 10,Unnamed: 11
0,YPR188C,-0.489753,-0.288484,-0.491991,-0.42718,1.4e-05,,,,,,
1,YCR092C,0.580636,0.915609,0.866217,0.581488,4.4e-05,,,,,,
2,YDR481C,-0.584633,-0.783874,-1.16579,-0.787227,6.6e-05,,,,,,
3,YLR206W,-0.755624,-0.683381,-0.759622,-0.624486,7e-05,,,,,,
4,YCR025C,-3.75288,-8.58357,-2.42621,ABSENT,0.000105,,YCR025C,,Dubious,Dubious open reading frame; unlikely to encode...,


In [26]:
original_data1['orf'] = original_data1['orf_name'].astype(str)
original_data2['orf'] = original_data2['orf_name'].astype(str)

In [27]:
# Eliminate all white spaces & capitalize
original_data1['orf'] = clean_orf(original_data1['orf'])
original_data2['orf'] = clean_orf(original_data2['orf'])

In [28]:
# Translate to ORFs 
original_data1['orf'] = translate_sc(original_data1['orf'], to='orf')
original_data2['orf'] = translate_sc(original_data2['orf'], to='orf')

In [29]:
# Make sure everything translated ok
t = looks_like_orf(original_data1['orf'])
print(original_data1.loc[~t,])

               orf_name   A                   B       C   D  \
index_input                                                   
5045         YBR160W_AS  +0  +0.185634523863225  ABSENT  +0   

             All 2's vs. All 0's  Unnamed: 6 Unnamed: 7  <-1  \
index_input                                                    
5045                     0.36474         NaN        NaN  NaN   

            0.6307692307692307 Unnamed: 10  Unnamed: 11        orf  
index_input                                                         
5045                       NaN         NaN          NaN  YBR160WAS  


In [30]:
# Make sure everything translated ok
t = looks_like_orf(original_data2['orf'])
print(original_data2.loc[~t,])

               orf_name                   A       B                    C  \
index_input                                                                
6123         YBR160W_AS  +0.189381206377223 -1.5736  +0.0870193951438478   

                  D  All 2's vs. All 0's  Unnamed: 6 Unnamed: 7   <1  \
index_input                                                            
6123         ABSENT             0.465656         NaN        NaN  NaN   

            0.43373493975903615 Unnamed: 10  Unnamed: 11        orf  
index_input                                                          
6123                        NaN         NaN          NaN  YBR160WAS  


In [31]:
original_data1.loc[original_data1['orf'] == 'YBR160W_AS','orf'] = 'YBR160W'
original_data2.loc[original_data2['orf'] == 'YBR160W_AS','orf'] = 'YBR160W'

In [32]:
for c in ['A','B','C','D']:
    original_data1[c] = pd.to_numeric(original_data1[c], errors='coerce')
    original_data2[c] = pd.to_numeric(original_data2[c], errors='coerce')

In [33]:
original_data1 = original_data1.groupby(original_data1['orf']).mean()
original_data2 = original_data2.groupby(original_data2['orf']).mean()

In [34]:
print(original_data1.shape)
print(original_data2.shape)

(5716, 7)
(5899, 7)


In [37]:
original_data = original_data1[['A','B','C','D']].join(original_data2[['A','B','C','D']], lsuffix='_down', rsuffix='_up')

In [38]:
original_data.head()

Unnamed: 0_level_0,A_down,B_down,C_down,D_down,A_up,B_up,C_up,D_up
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
YAL012W,-1.214814,0.44709,-0.824963,0.230506,-1.146863,0.336091,-1.880935,1.248505
YAL016W,-0.218436,0.276031,-0.560689,,-0.526832,0.067229,-1.16875,-3.836371
YAL024C,-0.526201,-0.077825,-0.135837,0.127956,-0.405916,-0.221996,0.212038,0.182649
YAL047C,0.264278,0.12609,0.534486,-0.644725,0.250593,0.197676,0.305322,1.533319
YAL054C,-0.297919,0.222354,0.091307,-0.392657,0.162762,-0.387185,-0.078196,-0.077585


In [39]:
original_data['data'] = original_data.mean(axis=1)

In [41]:
original_data.head()

Unnamed: 0_level_0,A_down,B_down,C_down,D_down,A_up,B_up,C_up,D_up,data
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
YAL012W,-1.214814,0.44709,-0.824963,0.230506,-1.146863,0.336091,-1.880935,1.248505,-0.350673
YAL016W,-0.218436,0.276031,-0.560689,,-0.526832,0.067229,-1.16875,-3.836371,-0.852545
YAL024C,-0.526201,-0.077825,-0.135837,0.127956,-0.405916,-0.221996,0.212038,0.182649,-0.105642
YAL047C,0.264278,0.12609,0.534486,-0.644725,0.250593,0.197676,0.305322,1.533319,0.32088
YAL054C,-0.297919,0.222354,0.091307,-0.392657,0.162762,-0.387185,-0.078196,-0.077585,-0.09464


# Prepare the final dataset

In [54]:
data = original_data[['data']].copy()

In [55]:
dataset_ids = [22077]
datasets = datasets.reindex(index=dataset_ids)

In [56]:
lst = [datasets.index.values, ['value']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [57]:
data.head()

dataset_id,22077
data_type,value
orf,Unnamed: 1_level_2
YAL012W,-0.350673
YAL016W,-0.852545
YAL024C,-0.105642
YAL047C,0.32088
YAL054C,-0.09464


## Subset to the genes currently in SGD

In [58]:
genes = pd.read_csv(path_to_genes, sep='\t', index_col='id')
genes = genes.reset_index().set_index('systematic_name')
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.sum(np.isnan(gene_ids))
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [59]:
data['gene_id'] = gene_ids
data = data.loc[data['gene_id'].notnull()]
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,22077
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
10,YAL012W,-0.350673
14,YAL016W,-0.852545
22,YAL024C,-0.105642
45,YAL047C,0.32088
50,YAL054C,-0.09464


# Normalize

In [60]:
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [61]:
# Assign proper column names
lst = [datasets.index.values, ['valuez']*datasets.shape[0]]
tuples = list(zip(*lst))
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [62]:
data_norm[data.isnull()] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,22077,22077
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
10,YAL012W,-0.350673,-0.347129
14,YAL016W,-0.852545,-1.020353
22,YAL024C,-0.105642,-0.018438
45,YAL047C,0.32088,0.553708
50,YAL054C,-0.09464,-0.00368


# Print out

In [63]:
for f in ['value','valuez']:
    df = data_all.xs(f, level='data_type', axis=1).copy()
    df.columns = datasets['name'].values
    df = df.droplevel('gene_id', axis=0)
    df.to_csv(paper_name + '_' + f + '.txt', sep='\t')

# Save to DB

In [64]:
from IO.save_data_to_db3 import *

In [65]:
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 27088128...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.48s/it]

Updating the data_modified_on field...



