In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used to locate the datasets list and to key the DB save.
paper_pmid = 34550356
# Short paper label; used as the prefix of the exported text files.
paper_name = 'zhou_foijer_2021'

In [3]:
# Datasets list for this paper: tab-separated, no header row, two columns.
datasets = pd.read_csv(
    f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset_id so rows can be selected by ID later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [111]:
# Workbook sheets holding the raw data of the first two screens.
sheets = [
    'Screen1-YKO raw data',
    'Screen2-YKO raw data',
]

In [112]:
# Screens 1 & 2: each sheet is a long table (one row per measurement) that is
# pivoted into an ORF x Query frame and collected in `original_data`.
original_data = []
for s in sheets:
    original_data1 = pd.read_excel('raw_data/FileS1.xls', sheet_name=s, skiprows=16)
    print('Original data dimensions: %d x %d' % (original_data1.shape))
    original_data1['orf'] = original_data1['ID Column'].astype(str)
    # Eliminate all white spaces & capitalize
    original_data1['orf'] = clean_orf(original_data1['orf'])
    # Translate to ORFs
    original_data1['orf'] = translate_sc(original_data1['orf'], to='orf')
    # Make sure everything translated ok
    t = looks_like_orf(original_data1['orf'])
    print(original_data1.loc[~t,])
    # Explicit copy: the filtered frame gets new columns below, and assigning
    # onto a .loc slice would raise SettingWithCopyWarning.
    original_data1 = original_data1.loc[t,].copy()

    # Drop rows whose normalized ratio carries an 'excluded' marker.
    # The column is cast to str and na=False is passed so that numeric or empty
    # cells count as "not excluded" instead of producing NaN in the mask
    # (which would make the `~t` inversion below fail).
    t = original_data1['Normalized Growth Ratio (Comparer::Exp)'].astype(str).str.contains('excluded', na=False)
    original_data1 = original_data1.loc[~t,].copy()

    # The sheet reports Comparer / Exp; invert it. Non-numeric cells become NaN.
    # NOTE(review): a ratio of exactly 0 would turn into inf here — confirm none occur.
    original_data1['data'] = 1/pd.to_numeric(original_data1['Growth Ratio (Comparer / Exp)'], errors='coerce')
    # One row per ORF, one column per Query; replicate measurements are averaged.
    original_data1 = pd.pivot_table(original_data1, index='orf', columns='Query', values='data')
    original_data1 = original_data1.groupby(original_data1.index).mean()
    print(original_data1.shape)

    original_data.append(original_data1)

Original data dimensions: 10752 x 30
               Query  Condition Plate # Row  Column   P-Value  Z-Score  \
index_input                                                              
17              MELK        NaN     [1]   B       2  0.733240  0.34083   
33              MELK        NaN     [1]   B       3  0.968280 -0.03976   
46              MELK        NaN     [1]   O       3  0.968280 -0.03976   
47              MELK        NaN     [1]   P       3  0.968280 -0.03976   
62              MELK        NaN     [1]   O       4       NaN      NaN   
...              ...        ...     ...  ..     ...       ...      ...   
10431        MELK_kd        NaN     [9]   P       4       NaN      NaN   
10645        MELK_kd        NaN     [9]   F      18  0.842400 -0.19882   
10661        MELK_kd        NaN     [9]   F      19       NaN      NaN   
10708        MELK_kd        NaN     [9]   E      22  0.842400 -0.19882   
10724        MELK_kd        NaN     [9]   E      23  0.000007  4.47953   



In [113]:
# Screen 3: one sheet per query, each with its own value column.
# `sheets` and `data_cols` are parallel lists (same order).
sheets = ['Screen3-MELK_YKO raw data','Screen3-MELK_KD_YKO raw data']
data_cols = ['MELK_YKO3_size.mean.norm_div_control','MELK_KD_YKO3_size.mean.norm_div_control']
for s, data_col in zip(sheets, data_cols):
    original_data2 = pd.read_excel('raw_data/FileS1.xls', sheet_name=s)
    print('Original data dimensions: %d x %d' % (original_data2.shape))
    original_data2['orf'] = original_data2['ORF'].astype(str)
    # Eliminate all white spaces & capitalize
    original_data2['orf'] = clean_orf(original_data2['orf'])
    # Translate to ORFs
    original_data2['orf'] = translate_sc(original_data2['orf'], to='orf')
    # Make sure everything translated ok
    t = looks_like_orf(original_data2['orf'])
    print(original_data2.loc[~t,])

    # Explicit copy: a new column is assigned right after filtering, and doing
    # that on a .loc slice would raise SettingWithCopyWarning.
    original_data2 = original_data2.loc[t,].copy()
    # Non-numeric cells become NaN.
    original_data2['data'] = pd.to_numeric(original_data2[data_col], errors='coerce')
    original_data2 = original_data2.set_index('orf')
    original_data2 = original_data2[['data']]
    # Average rows that share an ORF so the index is unique.
    original_data2 = original_data2.groupby(original_data2.index).mean()
    print(original_data2.shape)

    # Label the single value column with the sheet name so it stays identifiable after concatenation.
    original_data2.columns = [s]

    original_data.append(original_data2)

Original data dimensions: 5376 x 30
            scan_cond  plate row  column  norm.rep1  norm.rep2  norm.rep3  \
index_input                                                                 
5                none      1   A       6        NaN        NaN        NaN   
6                none      1   A       7        NaN        NaN        NaN   
25               none      1   B       2        NaN        NaN        NaN   
26               none      1   B       3        NaN        NaN        NaN   
273              none      1   L      10        NaN        NaN        NaN   
...               ...    ...  ..     ...        ...        ...        ...   
5370             none     14   P      19        NaN        NaN        NaN   
5372             none     14   P      21        NaN        NaN        NaN   
5373             none     14   P      22        NaN        NaN        NaN   
5374             none     14   P      23        NaN        NaN        NaN   
5375             none     14   P      24

In [114]:
# Sanity check: one frame per sheet is expected (two from Screens 1-2, two from Screen 3).
len(original_data)

4

In [162]:
# Place the per-screen frames side by side; rows are matched on the ORF index.
original_data_all = pd.concat(original_data, axis='columns')

In [163]:
# Preview the combined table: one column per query per screen.
original_data_all.head()

Unnamed: 0,MELK,MELK_kd,MELK_KD_YKO,MELK_YKO,Screen3-MELK_YKO raw data,Screen3-MELK_KD_YKO raw data
YAL002W,0.917431,0.884956,2.083333,2.380952,0.693835,0.729166
YAL004W,0.943396,0.892857,0.909091,1.098901,0.973479,0.84356
YAL005C,0.819672,0.862069,1.041667,1.098901,0.706074,0.330806
YAL007C,1.123596,1.010101,0.970874,1.282051,0.504571,1.110399
YAL008W,0.970874,1.333333,1.724138,1.388889,1.472729,1.466907


In [164]:
# Relabel every per-screen column with the query it measured, so that columns
# sharing a query can be averaged afterwards.
# The mapping is by column name rather than by position: a positional list would
# silently mislabel screens if the column order of the concatenated frame changed.
query_by_column = {
    'MELK': 'MELK',
    'MELK_kd': 'MELK_kd',
    'MELK_KD_YKO': 'MELK_kd',
    'MELK_YKO': 'MELK',
    'Screen3-MELK_YKO raw data': 'MELK',
    'Screen3-MELK_KD_YKO raw data': 'MELK_kd',
}
unexpected = [c for c in original_data_all.columns if c not in query_by_column]
missing = [c for c in query_by_column if c not in original_data_all.columns]
if unexpected or missing:
    raise ValueError('Unexpected screen columns: %s; missing screen columns: %s' % (unexpected, missing))
original_data_all.columns = [query_by_column[c] for c in original_data_all.columns]

In [165]:
# Average the columns that carry the same query label.
# The grouping runs on the transposed frame (labels as rows), then the result is flipped back.
by_query = original_data_all.T
by_query = by_query.groupby(by_query.index.values).mean()
original_data_all = by_query.T

In [166]:
# Preview after averaging: a single column per query.
original_data_all.head()

Unnamed: 0,MELK,MELK_kd
YAL002W,1.330739,1.232485
YAL004W,1.005259,0.881836
YAL005C,0.874882,0.744847
YAL007C,0.970073,1.030458
YAL008W,1.277497,1.508126


In [167]:
# Dimensions of the averaged table (rows x columns).
original_data_all.shape

(4899, 2)

# Prepare the final dataset

In [168]:
# Work on a copy so the averaged table stays available unchanged.
data = original_data_all.copy()

In [169]:
# Name the row index 'orf'; later steps refer to it by that name.
data.index.name = 'orf'

In [170]:
# Dataset IDs for the data columns, listed in column order.
# NOTE(review): the IDs are applied positionally to the columns of `data` — confirm
# which ID belongs to which query against the datasets list.
dataset_ids = [22059, 22060]
# Keep only these datasets, in this order.
datasets = datasets.reindex(dataset_ids)

In [171]:
# Label the data columns as (dataset_id, 'value') pairs, one per dataset.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [172]:
# Preview with the (dataset_id, data_type) column labels applied.
data.head()

dataset_id,22059,22060
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL002W,1.330739,1.232485
YAL004W,1.005259,0.881836
YAL005C,0.874882,0.744847
YAL007C,0.970073,1.030458
YAL008W,1.277497,1.508126


## Subset to the genes currently in SGD

In [173]:
# Look up the gene id of every ORF in `data` via its systematic name.
# ORFs that are absent from the gene table come back as NaN and are counted.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [174]:
# Attach the gene ids, drop ORFs without one, and index the table by (gene_id, orf).
data['gene_id'] = gene_ids
# Explicit copy: the next line assigns a column on the filtered frame, and doing
# that on a .loc slice would raise SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# Safe to cast to int now that the rows with a missing id are gone.
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,22059,22060
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,1.330739,1.232485
1863,YAL004W,1.005259,0.881836
4,YAL005C,0.874882,0.744847
5,YAL007C,0.970073,1.030458
6,YAL008W,1.277497,1.508126


# Normalize

In [175]:
# Normalize the raw scores with the project helper (not defined in this notebook;
# presumably provided by yp_utils.py — verify).
# NOTE(review): the effect of has_tested=True is defined by the helper and is not visible here.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [176]:
# Label the normalized columns as (dataset_id, 'valuez') pairs, one per dataset.
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [177]:
# Blank out the normalized score wherever the raw value is missing.
# A DataFrame mask is aligned on labels before it is applied. `data` is labelled
# 'value' and `data_norm` 'valuez' at the data_type level, so the raw mask would
# match no column and the assignment would silently do nothing. Relabel the mask
# with data_norm's columns (same dataset order) so it lines up; rows still align
# on the (gene_id, orf) index.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan
# Raw and normalized values side by side, joined on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,22059,22060,22059,22060
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,1.330739,1.232485,0.050444,0.041747
1863,YAL004W,1.005259,0.881836,-0.012234,-0.027207
4,YAL005C,0.874882,0.744847,-0.037341,-0.054146
5,YAL007C,0.970073,1.030458,-0.01901,0.002019
6,YAL008W,1.277497,1.508126,0.040191,0.095952


In [178]:
# Spot-check a single ORF (YOL025W, any gene_id) across all value/valuez columns.
data_all.loc[(slice(None),'YOL025W'),:]

Unnamed: 0_level_0,dataset_id,22059,22060,22059,22060
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
5074,YOL025W,0.654502,0.587189,-0.07978,-0.085149


# Print out

In [179]:
# Write one tab-separated file per data type, with dataset names as column
# headers and the ORF as the only row label.
for data_type in ['value', 'valuez']:
    out = data_all.xs(data_type, level='data_type', axis=1).copy()
    out.columns = datasets['name'].values
    out = out.droplevel('gene_id', axis=0)
    out.to_csv(f'{paper_name}_{data_type}.txt', sep='\t')

# Save to DB

In [180]:
from IO.save_data_to_db3 import *

In [181]:
# Push the final table to the database.
# Destructive: per the log printed by this call, all existing datasets for this
# PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 34550356...
Inserting the new data...


100%|██████████| 2/2 [00:12<00:00,  6.27s/it]

Updating the data_modified_on field...



