In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed: PubMed id and the short name used for output files
paper_pmid = 31734159
paper_name = 'kuroda_avalos_2019'

In [3]:
# Datasets list for this paper: tab-separated, no header row, columns are dataset id and name
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw '1st screen' sheet of the Excel file; the first 4 rows of the sheet are skipped
original_data = pd.read_excel(
    'raw_data/1-s2.0-S2405471219303825-mmc2.xlsx',
    sheet_name='1st screen',
    skiprows=4,
)

In [6]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 2684 x 22


In [7]:
# Preview the raw sheet: several column groups sit side by side, each with its own ORF column
original_data.head()

Unnamed: 0.1,Unnamed: 0,0% Isobutanol,1.4% Isobutanol,Tolerance factor,Unnamed: 4,Unnamed: 5,0% Isobutanol.1,1.4% Isobutanol.1,Tolerance factor.1,Unnamed: 9,...,1.4% Isobutanol.2,Tolerance factor.2,Unnamed: 14,0% Isobutanol.3,1.4% Isobutanol.3,Tolerance factor.3,Unnamed: 18,Unnamed: 19,0% Isobutanol.4,1.4% Isobutanol.4
0,YBR010W,5.146275,1.02758,0.199674,,YDR516C,4.409099,3.524859,0.799451,,...,4.158719,2.656361,,6.2629,2.3555,0.376104,,YDL081C,0.462597,0.034439
1,YBR053C,6.246456,1.246313,0.199523,,YKL213C,4.356975,3.482974,0.799402,,...,3.8013,2.180459,,,,,,YJR055W,0.45422,0.029785
2,YGR004W,3.7892,0.755792,0.19946,,YFR010W,4.869834,3.891586,0.799121,,...,4.498453,2.052229,,,,,,YBR168W,0.395581,0.0
3,YER083C,4.810264,0.958702,0.199303,,YNL022C,5.309162,4.236905,0.798036,,...,3.635622,1.953977,,,,,,YDL082W,0.379758,0.040023
4,YJL168C,5.970015,1.187674,0.19894,,YPL119C,5.312885,4.239697,0.798003,,...,4.27972,1.943364,,,,,,YHR134W,0.361142,0.14334


In [8]:
# Positions of the ORF-name columns, one per side-by-side column group of the sheet
orf_column_positions = [0, 5, 10, 19]
# Stack the groups end to end into a single 1-D array of ORF labels
hit_orfs = np.concatenate([original_data.iloc[:, pos].values for pos in orf_column_positions])

In [9]:
# Sanity check: 4 stacked ORF columns, so the length should be 4 x the number of rows in the sheet
hit_orfs.shape

(10736,)

In [10]:
# Force every label to a string.
# NOTE(review): empty cells presumably become the literal 'nan' here (a 'NAN' label shows up after cleaning) — confirm
hit_orfs = hit_orfs.astype(str)

In [11]:
# Column positions of the 0% isobutanol values and of the tolerance factors,
# listed in the same column-group order as the ORF columns
iso0_positions = [1, 6, 11, 20]
tf_positions = [3, 8, 13]
# The last column group has no tolerance-factor column, so it is padded with NaN
tf_padding = np.full(original_data.shape[0], np.nan)

hit_data1 = np.hstack([original_data.iloc[:, pos].values for pos in iso0_positions])
hit_data2 = np.hstack([original_data.iloc[:, pos].values for pos in tf_positions] + [tf_padding])

In [12]:
# Sanity check: must have the same length as hit_orfs so values line up with their ORF labels
hit_data1.shape

(10736,)

In [13]:
# Sanity check: must have the same length as hit_orfs (NaN padding included)
hit_data2.shape

(10736,)

In [14]:
# Two-column table (0% isobutanol value, tolerance factor) labelled by the raw ORF strings
stacked_values = np.column_stack((hit_data1, hit_data2))
original_data2 = pd.DataFrame(data=stacked_values, index=hit_orfs, columns=['0_iso', 'tf'])

In [15]:
# Copy the row labels into an 'orfs' column (as strings) so they can be cleaned and grouped on
original_data2['orfs'] = original_data2.index.to_numpy().astype(str)

In [16]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably provided by the %run of yp_utils.py) — confirm
original_data2['orfs'] = clean_orf(original_data2['orfs'])

In [17]:
# Manual corrections for ORF names that are mis-spelled in the source table
# (mostly look-alike characters, e.g. V for Y, O/I for 0/1).
# Keys are the labels as they appear after cleaning; values are the corrected ORF names.
typo_fix = {
    'VER093C-A': 'YER093C-A',
    'YMROB4W': 'YMR084W',
    'YARD02C-A': 'YAR002C-A',
    'YFLOOIW': 'YFL001W',
    'YMLOIOC-B': 'YML010C-B',
    'VER091C': 'YER091C',
    'VCR086W': 'YCR086W',
    'YNLO15W': 'YNL015W',
    'VER064C': 'YER064C',
    'VBR285W': 'YBR285W',
    'YMR08IC': 'YMR081C',
    'YJRIOOC': 'YJR100C',
    'YGR2B8W': 'YGR288W',
    'YPROT4C': 'YPR014C',
    'VCR087W': 'YCR087W',
    'YARD50W': 'YAR050W',
    'VCL075W': 'YCL075W',
    'YJR09IC': 'YJR091C',
    'YJR044C6': 'YJR044C',
    'VCR031C': 'YCR031C',
}

In [18]:
# Replace known mis-spelled ORF names; labels without an entry in typo_fix pass through unchanged.
# dict.get does a constant-time lookup instead of rebuilding and scanning a list of keys for every row.
original_data2['orfs'] = original_data2['orfs'].apply(lambda x: typo_fix.get(x, x))

In [19]:
# Shape before collapsing duplicate ORF labels
original_data2.shape

(10736, 3)

In [20]:
# Average the measurements of all rows that share the same ORF label,
# then turn 'orfs' back into a regular column (the old row labels are discarded)
original_data2 = original_data2.groupby('orfs').mean().reset_index()

In [21]:
# Shape after averaging duplicate labels: one row per distinct label
original_data2.shape

(4407, 3)

In [22]:
# Preview: non-ORF labels (e.g. the 'NAN' placeholder and a footnote string) are still present at this point
original_data2.head()

Unnamed: 0,orfs,0_iso,tf
0,NAN,,
1,NDTOLERANCEFACTORWASNOTDETERMINEDFORSTRAINSTHA...,,
2,YAL012W,6.403757,0.266424
3,YAL016W,6.279964,0.457537
4,YAL024C,6.962225,0.291176


In [23]:
# Translate to ORFs
# NOTE(review): translate_sc is defined outside this notebook; presumably it maps gene names/aliases
# to systematic ORF names — confirm how unrecognized labels are handled
original_data2['orfs'] = translate_sc(original_data2['orfs'], to='orf')

In [24]:
# Make sure everything translated ok
# NOTE(review): looks_like_orf is defined outside this notebook; t is used below as a boolean row mask
t = looks_like_orf(original_data2['orfs'])
# Show the rows that fail the check so they can be inspected before being dropped
print(original_data2.loc[~t,])

                                                          orfs     0_iso  \
index_input                                                                
0                                                          NAN       NaN   
1            NDTOLERANCEFACTORWASNOTDETERMINEDFORSTRAINSTHA...       NaN   
2960                                                   YLR25TW  4.170819   

                   tf  
index_input            
0                 NaN  
1                 NaN  
2960         0.906494  


In [25]:
# Keep only the rows whose label passed the looks_like_orf check.
# NOTE(review): the printout of failing rows includes 'YLR25TW' with real measurements; it looks like
# another mis-spelled ORF that is missing from typo_fix — confirm the intended ORF and add it there
original_data2 = original_data2.loc[t,:]

In [26]:
# Shape after dropping labels that do not look like ORFs
original_data2.shape

(4404, 3)

In [27]:
# Average again, now on the translated labels: rows that ended up with the same ORF are merged.
# 'orfs' becomes the index of the result.
original_data2 = original_data2.groupby('orfs').mean()

In [28]:
# Preview: one row per ORF, columns '0_iso' and 'tf'
original_data2.head()

Unnamed: 0_level_0,0_iso,tf
orfs,Unnamed: 1_level_1,Unnamed: 2_level_1
YAL012W,6.403757,0.266424
YAL016W,6.279964,0.457537
YAL024C,6.962225,0.291176
YAL047C,2.07936,0.0
YAL054C,6.188748,0.447737


In [29]:
# Name the row index 'orf'; later steps refer to this index level by that name
original_data2 = original_data2.rename_axis('orf')

In [30]:
# Final size of the per-ORF table (rows = unique ORFs, 2 value columns)
original_data2.shape

(4382, 2)

In [37]:
# Spot check one ORF against the raw sheet (YBR010W is the first row shown in the raw-data preview)
original_data2.loc['YBR010W',]

0_iso    5.146275
tf       0.199674
Name: YBR010W, dtype: float64

# Prepare the final dataset

In [38]:
# Independent copy of the two value columns; this is the table that gets relabelled and saved
data = original_data2.loc[:, ['0_iso', 'tf']].copy()

In [39]:
# Dataset ids listed in the same order as the data columns ('0_iso', 'tf'):
# the ids are assigned to the columns by position in the next step
dataset_ids = [16414, 16411]
datasets = datasets.reindex(dataset_ids)

In [40]:
# Relabel the columns as (dataset_id, 'value') pairs, in the order of the datasets table
value_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data.columns = value_columns

In [41]:
# Preview: columns are now labelled by (dataset_id, data_type)
data.head()

dataset_id,16414,16411
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL012W,6.403757,0.266424
YAL016W,6.279964,0.457537
YAL024C,6.962225,0.291176
YAL047C,2.07936,0.0
YAL054C,6.188748,0.447737


## Subset to the genes currently in SGD

In [42]:
# Gene table re-keyed by systematic name; the former 'id' index is kept as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs absent from the gene table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [43]:
# Attach the gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
# .copy() makes the filtered table an independent frame, so the column assignment below
# does not write into a slice of the old frame (avoids SettingWithCopyWarning)
data = data.loc[data['gene_id'].notnull()].copy()
# The ids are float only because of the NaNs that were just removed
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

In [44]:
# Preview: rows are now indexed by (gene_id, orf)
data.head()

Unnamed: 0_level_0,dataset_id,16414,16411
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
10,YAL012W,6.403757,0.266424
14,YAL016W,6.279964,0.457537
22,YAL024C,6.962225,0.291176
45,YAL047C,2.07936,0.0
50,YAL054C,6.188748,0.447737


# Normalize

In [45]:
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook; presumably it returns
# one normalized column per dataset in the same column order as `data` — confirm, including the meaning of has_tested
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [46]:
# Assign proper column names: (dataset_id, 'valuez') pairs, in the order of the datasets table
valuez_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = valuez_columns

In [47]:
# Blank out the normalized scores wherever the raw value is missing.
# `data` and `data_norm` use different column labels ('value' vs 'valuez'), and a boolean DataFrame
# mask is aligned by label on assignment: the raw mask would match no column and nothing would be set.
# Relabel the mask with data_norm's columns first (both tables list the datasets in the same order).
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

In [48]:
# Put the 'value' and 'valuez' columns side by side, matched on the row index
data_all = data.join(data_norm)

In [49]:
# Preview the combined table: 'value' and 'valuez' columns for each dataset
data_all.head()


Unnamed: 0_level_0,dataset_id,16414,16411,16414,16411
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
10,YAL012W,6.403757,0.266424,0.551292,-1.022799
14,YAL016W,6.279964,0.457537,0.478644,-0.465446
22,YAL024C,6.962225,0.291176,0.879029,-0.950613
45,YAL047C,2.07936,0.0,-1.98648,-1.799791
50,YAL054C,6.188748,0.447737,0.425114,-0.494027


# Print out

In [50]:
# Write one tab-delimited file per data type ('value' and 'valuez')
for data_type in ['value', 'valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Label the columns with the dataset names and key the rows by ORF only
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [51]:
from IO.save_data_to_db3 import *

In [52]:
# NOTE: judging by the log output, this first deletes all datasets stored for this PMID
# and then inserts data_all — re-running it replaces whatever is in the database for this paper
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 31734159...
Inserting the new data...


100%|██████████| 2/2 [00:12<00:00,  6.16s/it]

Updating the data_modified_on field...



