In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the source publication: the PMID (used for the dataset list and the
# database upload) and the short name used as the prefix of the exported files
paper_pmid, paper_name = 31511699, 'puddu_jackson_2019'

In [3]:
# Tab-separated, header-less list of the datasets belonging to this paper (id + name)
datasets_file = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(datasets_file, sep='\t', header=None, names=['dataset_id', 'name'])

In [4]:
# Key the dataset list by its id so that it can be reindexed by dataset id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [7]:
# Read the supplementary table; the first 33 rows of the sheet are skipped before the header row
raw_file = 'raw_data/41586_2019_1549_MOESM3_ESM.xlsx'
original_data = pd.read_excel(raw_file, sheet_name='SupplementaryTable3', skiprows=33)

In [8]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 8843 x 31


In [9]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,#SDname,rDNA,CUP1,mitochondria,2-micron,Ty1,Ty2,Ty3,Ty4,Ty5,...,chr09,chr10,chr11,chr12,chr13,chr14,chr15,chr16,ANtot,GCR
0,SD0863b,117.198697,14.010993,15.854368,48.018613,37.807161,11.295758,1.979011,3.096798,1.220737,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0
1,SD0863b2,109.893822,14.881877,15.36376,48.930359,37.800215,11.488143,1.99378,3.082327,1.189029,...,2.0,2.0,2.0,2.237545,2.0,2.0,2.0,2.0,0.237545,0
2,SD0864b,105.282907,17.908271,13.775383,38.510708,37.206717,10.976261,1.922398,3.096597,1.188149,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0
3,SD0864b2,102.575654,17.843478,14.135183,55.571128,37.723708,11.114128,1.869209,3.067437,1.092988,...,2.0,2.0,2.0,2.23813,2.0,2.0,2.0,2.0,0.23813,0
4,SD0865b,145.684801,13.92608,15.696255,40.318316,38.183292,10.825902,1.881699,3.011954,1.094776,...,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,1.226359,0


In [10]:
# Positional (0-based) column holding the strain identifiers from which gene names are parsed
strain_id_col = 12
hit_strains_ids = original_data.iloc[:, strain_id_col]

In [16]:
# The gene name is the second underscore-delimited field of each strain identifier
gene_names = list(map(lambda strain_id: strain_id.split('_')[1], hit_strains_ids.values))

In [18]:
# Store the parsed gene names as a string column aligned with the table rows
original_data['genes'] = pd.Series(gene_names, index=original_data.index).astype(str)

In [19]:
# Standardise the gene names with the shared helper
# (per the original note: whitespace is removed and names are capitalised)
cleaned_genes = clean_genename(original_data['genes'])
original_data['genes'] = cleaned_genes

In [20]:
# Convert the gene names to ORF names with the shared translation helper
translated_orfs = translate_sc(original_data['genes'], to='orf')
original_data['orfs'] = translated_orfs

In [24]:
# Manual corrections: entries of the 'orfs' column that still hold one of these gene names
# (presumably left untranslated by the helper) are replaced by the given ORF
rename_map = {
    'FLO8': 'YER109C',
    'AAD6': 'YFL056C',
    'SDL1': 'YIL167W',
    'HXT12': 'YIL170W',
    'SDC25': 'YLL016W',
    'CRS5': 'YOR031W',
}
# Values without an entry in the map are kept unchanged
original_data['orfs'] = original_data['orfs'].map(lambda orf: rename_map.get(orf, orf))

In [25]:
# Sanity check: flag which entries look like ORF names and print the rows that do not
t = looks_like_orf(original_data['orfs'])
print(original_data.loc[~t, :])

              #SDname        rDNA       CUP1  mitochondria   2-micron  \
index_input                                                             
8835          SD5624b  109.306122  13.336735     17.040816  62.183673   
8836         SD5624b2  118.854369  14.970874     18.543689  51.378641   
8837          SD5625b  137.339130  13.808696     20.295652  43.043478   
8838         SD5625b2  127.651376  14.302752     18.807339  50.073394   
8839          SD5626b  128.495238  13.485714     17.771429  42.190476   
8840         SD5626b2  130.767442  14.000000     16.813953  32.674419   
8841          SD5627b  124.340426  14.159574     15.042553  55.553191   
8842         SD5627b2  123.169811  14.207547     15.716981  58.283019   

                   Ty1        Ty2       Ty3       Ty4       Ty5  ...  chr11  \
index_input                                                      ...          
8835         37.469388  11.387755  1.948980  3.071429  1.071429  ...    2.0   
8836         37.067961  11.59223

In [26]:
# Keep only the rows whose ORF passed the check above
original_data = original_data[t]

In [27]:
# Index the table by ORF and name the index 'orf'
original_data = original_data.set_index('orfs').rename_axis('orf')

In [39]:
# Select the data columns by position: 1-10, 13-28 and 30
# (NOTE(review): going by the table preview these appear to be the copy-number columns,
#  chr01-chr16 and GCR, with ANtot at position 29 left out — confirm against the sheet)
all_columns = original_data.columns.values
columns = np.concatenate((all_columns[1:11], all_columns[13:29], all_columns[[30]]))

In [42]:
# Restrict the table to the selected data columns
original_data = original_data[columns]

In [43]:
# Collapse rows sharing the same ORF into a single row by averaging
original_data = original_data.groupby(level=0).mean()

In [44]:
# Size of the table after averaging: (number of ORFs, number of data columns)
original_data.shape

(4413, 27)

# Prepare the final dataset

In [46]:
# Work on an independent copy so that the processed table stays untouched
data = original_data.copy(deep=True)

In [48]:
# Dataset ids 16322..16348 (27 consecutive ids), in the order of the data columns
first_dataset_id, n_datasets = 16322, 27
dataset_ids = np.arange(first_dataset_id, first_dataset_id + n_datasets)
# Order the dataset list by these ids
datasets = datasets.reindex(dataset_ids)

In [51]:
# Label every data column with a (dataset_id, 'value') pair, in dataset order
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * len(datasets)],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [52]:
# Preview the table with its new (dataset_id, data_type) column labels
data.head()

dataset_id,16322,16323,16324,16325,16326,16327,16328,16329,16330,16331,...,16339,16340,16341,16342,16343,16344,16345,16346,16347,16348
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL002W,123.213583,14.362178,17.896135,52.275313,37.582148,12.248659,1.947474,3.041457,1.030599,654.269154,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
YAL004W,118.305256,14.500185,15.027299,44.680494,37.446851,11.531497,1.999566,2.975629,1.052834,723.29201,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
YAL005C,102.911209,14.489285,18.98023,70.004082,37.187783,11.949034,1.916653,3.04669,1.096374,719.104996,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
YAL007C,123.824339,9.190082,17.782727,43.681434,37.073962,11.129482,1.971632,3.081317,1.152643,740.19005,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
YAL008W,131.514271,14.000381,14.497716,33.580508,37.119337,11.203043,1.986273,3.008837,1.048206,717.387162,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0


## Subset to the genes currently in SGD

In [53]:
# Gene table keyed by systematic name, with the 'id' kept as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the gene table yield NaN
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 18


In [54]:
# Attach the gene ids and drop the ORFs for which no id was found
data['gene_id'] = gene_ids
data = data[data['gene_id'].notna()]
# With the missing ids gone the column can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Final row index: (gene_id, orf)
data = data.reset_index().set_index(['gene_id', 'orf'])

In [55]:
# Preview the table now indexed by (gene_id, orf)
data.head()

Unnamed: 0_level_0,dataset_id,16322,16323,16324,16325,16326,16327,16328,16329,16330,16331,...,16339,16340,16341,16342,16343,16344,16345,16346,16347,16348
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,123.213583,14.362178,17.896135,52.275313,37.582148,12.248659,1.947474,3.041457,1.030599,654.269154,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
1863,YAL004W,118.305256,14.500185,15.027299,44.680494,37.446851,11.531497,1.999566,2.975629,1.052834,723.29201,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
4,YAL005C,102.911209,14.489285,18.98023,70.004082,37.187783,11.949034,1.916653,3.04669,1.096374,719.104996,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
5,YAL007C,123.824339,9.190082,17.782727,43.681434,37.073962,11.129482,1.971632,3.081317,1.152643,740.19005,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
6,YAL008W,131.514271,14.000381,14.497716,33.580508,37.119337,11.203043,1.986273,3.008837,1.048206,717.387162,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0


# Normalize

In [56]:
# Normalize the values with the shared helper.
# NOTE(review): the normalization method and the meaning of has_tested are defined outside this notebook — confirm there
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [57]:
# Label the normalized columns with (dataset_id, 'valuez') pairs, in dataset order
idx = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * len(datasets)],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [58]:
# Re-impose the missing values of the raw data on the normalized data.
# Boolean-frame assignment aligns the mask on labels. The raw columns are labelled
# (dataset_id, 'value') and the normalized ones (dataset_id, 'valuez'), so a mask taken
# directly from `data` matches no column and the assignment silently does nothing.
# Both column indexes were built from the same dataset order, so the mask is relabelled
# with the normalized column index before it is applied (rows still align on the index).
nan_mask = data.isnull()
nan_mask.columns = data_norm.columns
data_norm[nan_mask] = np.nan

In [59]:
# Put the raw ('value') and normalized ('valuez') columns side by side, matching rows on the index
data_all = data.join(data_norm, how='left')

In [60]:
# Preview the combined table of raw and normalized values
data_all.head()


Unnamed: 0_level_0,dataset_id,16322,16323,16324,16325,16326,16327,16328,16329,16330,16331,...,16339,16340,16341,16342,16343,16344,16345,16346,16347,16348
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,123.213583,14.362178,17.896135,52.275313,37.582148,12.248659,1.947474,3.041457,1.030599,654.269154,...,0.0,-0.076542,0.230234,0.085022,-0.095853,0.122478,0.0,0.059274,-0.112933,0.0
1863,YAL004W,118.305256,14.500185,15.027299,44.680494,37.446851,11.531497,1.999566,2.975629,1.052834,723.29201,...,0.0,-0.076542,0.230234,0.085022,-0.095853,0.122478,0.0,0.059274,-0.112933,0.0
4,YAL005C,102.911209,14.489285,18.98023,70.004082,37.187783,11.949034,1.916653,3.04669,1.096374,719.104996,...,0.0,-0.076542,0.230234,0.085022,-0.095853,0.122478,0.0,0.059274,-0.112933,0.0
5,YAL007C,123.824339,9.190082,17.782727,43.681434,37.073962,11.129482,1.971632,3.081317,1.152643,740.19005,...,0.0,-0.076542,0.230234,0.085022,-0.095853,0.122478,0.0,0.059274,-0.112933,0.0
6,YAL008W,131.514271,14.000381,14.497716,33.580508,37.119337,11.203043,1.986273,3.008837,1.048206,717.387162,...,0.0,-0.076542,0.230234,0.085022,-0.095853,0.122478,0.0,0.059274,-0.112933,0.0


# Print out

In [61]:
# Write one tab-separated file per data type, with dataset names as column headers
# and the ORF as the only row label
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.reset_index(level='gene_id', drop=True)
    table.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [62]:
from IO.save_data_to_db3 import *

In [63]:
# Upload the combined table to the database for this paper
# (per the log below, the existing datasets for the PMID are deleted before the new data are inserted)
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 31511699...


  0%|          | 0/27 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 27/27 [02:49<00:00,  6.29s/it]

Updating the data_modified_on field...



