In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed by this notebook.
paper_pmid = 34645498        # PubMed ID, used to build file names and as the DB key
paper_name = 'liu_li_2021'   # short label used as the prefix of the output files

In [3]:
# Read the tab-separated, header-less list of datasets (id + name) for this paper.
datasets_list_path = f'extras/YeastPhenome_{paper_pmid}_datasets_list.txt'
datasets = pd.read_csv(
    datasets_list_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by its numeric id so rows can be looked up by dataset_id.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Load the raw screen results; the first spreadsheet row is skipped so that
# the second row is used as the column header.
original_data = pd.read_excel(
    'raw_data/Sc KO Screen all 1 - IsoBuOH vs Ctrl.xlsx',
    sheet_name='工作表1',
    skiprows=1,
)

In [9]:
# Report how many rows and columns were read from the spreadsheet.
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5376 x 26


In [10]:
# Preview the first five rows of the raw table.
original_data.head(n=5)

Unnamed: 0,ID Column,Plate #,Row,Column,P-Value,Z-Score,Normalized Ratio (Comparer::Exp),Calculated Log Ratio (Comparer::Exp),Average Normalized Size,Normalized Colony Size 1,...,Colony Circularity 4,Average Normalized Size.1,Normalized Colony Size 1.1,Normalized Colony Size 2.1,Normalized Colony Size 3.1,Normalized Colony Size 4.1,Colony Circularity 1.1,Colony Circularity 2.1,Colony Circularity 3.1,Colony Circularity 4.1
0,NC,[1],A,1,0.8627,-0.17294,0.10::0.10,0.0,dead,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,YLR119W,[1],B,1,0.92918,-0.08889,1.53::1.50,0.0169,1.52557,1.69,...,0.9397,1.5,1.65,1.44,1.55,1.36,0.9799,0.9629,0.9782,0.9705
2,YAL053W,[1],C,1,0.57726,-0.55739,1.35::1.46,-0.07731,1.34943,1.45,...,0.9545,1.4575,1.64,1.28,1.54,1.37,0.9737,0.9562,0.9608,0.978
3,YML084W,[1],D,1,0.76266,-0.302,1.29::1.32,-0.02595,1.28977,1.39,...,0.9113,1.3225,1.41,1.25,1.44,1.19,0.9418,0.9606,0.9629,0.9638
4,YAL036C,[1],E,1,0.93704,-0.07898,1.29::1.27,0.01889,1.29261,1.36,...,0.872,1.2675,1.26,1.2,1.43,1.18,0.9807,0.9899,0.9829,0.9433


In [21]:
# Start the 'orf' column from the strain identifier, forced to text.
id_as_text = original_data['ID Column'].astype(str)
original_data['orf'] = id_as_text

In [22]:
# Normalize the identifiers with the shared helper
# (eliminates all white spaces & capitalizes, per the original note).
orf_cleaned = clean_orf(original_data['orf'])
original_data['orf'] = orf_cleaned

In [23]:
# Manual correction: 'YLR287-A' is replaced by 'YLR287C-A'.
is_truncated_name = original_data['orf'] == 'YLR287-A'
original_data.loc[is_truncated_name, 'orf'] = 'YLR287C-A'

In [24]:
# Translate to ORFs
orf_translated = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = orf_translated

In [26]:
# Make sure everything translated ok: flag each identifier with the shared
# ORF check and list the rows that fail it for manual review.
t = looks_like_orf(original_data['orf'])
rows_not_orf = original_data.loc[~t]
print(rows_not_orf)

            ID Column Plate # Row  Column  P-Value  Z-Score  \
index_input                                                   
0                  NC     [1]   A       1  0.86270 -0.17294   
46                 NC     [1]   O       3      NaN      NaN   
47                 NC     [1]   P       3      NaN      NaN   
48                 NC     [1]   A       4  0.86270 -0.17294   
62                 NC     [1]   O       4      NaN      NaN   
...               ...     ...  ..     ...      ...      ...   
5370               NC    [14]   K      24  0.75322 -0.31441   
5372               NC    [14]   M      24  0.75322 -0.31441   
5373               NC    [14]   N      24  0.75322 -0.31441   
5374               NC    [14]   O      24  0.75322 -0.31441   
5375               NC    [14]   P      24      NaN      NaN   

            Normalized Ratio (Comparer::Exp)  \
index_input                                    
0                                 0.10::0.10   
46              excluded-0.00::0.00-

In [28]:
# Keep only the rows whose identifier passed the ORF check (mask `t`).
# The explicit .copy() detaches the result from the unfiltered frame, so the
# column assignments made in later cells write to an independent DataFrame
# instead of to a slice (which triggers pandas' SettingWithCopyWarning).
original_data = original_data.loc[t,:].copy()

In [30]:
# Convert the two average-size columns to numbers; entries that cannot be
# parsed (the raw sheet contains text such as 'dead') become NaN.
for target_col, source_col in (
    ('data1', 'Average Normalized Size'),
    ('data2', 'Average Normalized Size.1'),
):
    original_data[target_col] = pd.to_numeric(original_data[source_col], errors='coerce')

In [31]:
# Phenotype value = second average size divided by the first one.
# NOTE(review): a zero in 'data1' would give inf/NaN here — confirm none remain.
original_data['data'] = original_data['data2'].div(original_data['data1'])

In [32]:
# Use the ORF name as the row index.
original_data = original_data.set_index('orf')

In [33]:
# Drop everything except the computed value; keep it as a one-column frame.
original_data = original_data.loc[:, ['data']].copy()

In [34]:
# Collapse rows that share the same index label (ORF) by averaging them.
original_data = original_data.groupby(level=0).mean()

In [35]:
# Display (rows, columns) of the per-ORF table after averaging duplicates.
original_data.shape

(4903, 1)

In [36]:
# Preview the first five rows of the per-ORF table.
original_data.head(n=5)

Unnamed: 0_level_0,data
orf,Unnamed: 1_level_1
YAL002W,1.059618
YAL004W,0.872862
YAL005C,0.988207
YAL007C,1.241954
YAL008W,1.149438


# Prepare the final dataset

In [40]:
# Work on an independent copy so the processed table stays untouched.
data = original_data.copy(deep=True)

In [41]:
# Dataset ids, in the same order as the columns of `data`.
dataset_ids = [22065]
# Restrict/reorder the dataset list to exactly these ids.
# NOTE(review): an id absent from the list yields a NaN name rather than an error.
datasets = datasets.reindex(labels=dataset_ids, axis='index')

In [42]:
# Label each column with a (dataset_id, 'value') pair.
value_labels = ['value'] * datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, value_labels],
    names=['dataset_id','data_type'],
)
data.columns = idx

In [43]:
# Preview the first five rows with the new column labels.
data.head(n=5)

dataset_id,22065
data_type,value
orf,Unnamed: 1_level_2
YAL002W,1.059618
YAL004W,0.872862
YAL005C,0.988207
YAL007C,1.241954
YAL008W,1.149438


## Subset to the genes currently in SGD

In [44]:
# Load the gene table and re-key it by systematic name, keeping 'id' as a column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in `data`; ORFs not found come back as NaN.
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [45]:
# Attach the looked-up gene ids and drop the ORFs that have none.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the dtype cast on the next
# line is a plain column assignment rather than a write to a slice
# (which triggers pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaN rows gone, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,22065
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,1.059618
1863,YAL004W,0.872862
4,YAL005C,0.988207
5,YAL007C,1.241954
6,YAL008W,1.149438


# Normalize

In [46]:
# Compute normalized scores from the raw values with the shared helper.
# NOTE(review): the helper is defined outside this notebook; has_tested=True
# presumably marks the rows as the full set of tested strains — confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [47]:
# Assign proper column names: one (dataset_id, 'valuez') pair per column.
valuez_labels = ['valuez'] * datasets.shape[0]
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, valuez_labels],
    names=['dataset_id','data_type'],
)
data_norm.columns = idx

In [48]:
# Blank out normalized scores wherever the raw value is missing.
# A boolean DataFrame used as an assignment key is aligned on row AND column
# labels first. `data` is labelled (dataset_id, 'value') while `data_norm` is
# labelled (dataset_id, 'valuez'), so using data.isnull() directly matches no
# column and silently masks nothing. Relabelling the mask's columns
# positionally makes the column labels agree; rows still align by index.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw and normalized values side by side, matched on the row index.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,22065,22065
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,1.059618,0.188681
1863,YAL004W,0.872862,-0.600751
4,YAL005C,0.988207,-0.113178
5,YAL007C,1.241954,0.959426
6,YAL008W,1.149438,0.568356


# Print out

In [49]:
# Write one tab-separated file per data type ('value' and 'valuez'),
# with dataset names as column headers and ORFs as row labels.
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(f'{paper_name}_{data_type}.txt', sep='\t')

# Save to DB

In [50]:
from IO.save_data_to_db3 import *

In [51]:
# Store the combined raw + normalized table in the database under this PMID.
# Per the logged output, existing datasets for the PMID are deleted before the
# new data is inserted, so re-running this cell replaces the earlier upload.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 34645498...
Inserting the new data...


100%|██████████| 1/1 [00:06<00:00,  6.18s/it]

Updating the data_modified_on field...



