In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed: short name and PubMed ID
paper_name = 'vizeacoumar_andrews_2010'
paper_pmid = 20065090

In [40]:
# Load the tab-separated, header-less (dataset_id, name) list for this paper
datasets_path = 'extras/YeastPhenome_%d_datasets_list.txt' % paper_pmid
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [41]:
# Key the dataset list by dataset_id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [21]:
# Read the 'Single deletion screen' sheet of the supplementary table,
# skipping the first row of the sheet
original_data = pd.read_excel(
    'raw_data/JCB_200909013_TableS3.xlsx',
    sheet_name='Single deletion screen',
    skiprows=1,
)

In [22]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4292 x 91


In [23]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,Open reading frame,Unbud cell Mean Area,Unbud cell Mean Perimeter,Unbud cell Mean Length,Unbud cell Mean Breadth,Unbud cell Mean ShapeFactor,Unbud cell Mean Ell. Form Factor,Unbud cell Mean Equiv. Prolate Vol.,Unbud cell Mean Oblate Vol.,Medium bud Mean Mother Area,...,Small bud Mean Mother Equiv. Oblate Vol.,Small bud Mean Daughter Area,Small bud Mean Daughter Perimeter,Small bud Mean Daughter Length,Small bud Mean Daughter Breadth,Small bud Mean Daughter Shape Factor,Small bud Mean Daughter Ell. Form Factor,Small bud Mean Daughter Equiv. Prolate Vol.,Small bud Mean Daughter Equiv. Oblate Vol.,Small bud Mean budneck width
0,YAL002W,16.4598,14.5957,5.1599,4.4408,0.9444,1.1684,66.8069,57.6282,20.716,...,78.7655,5.328,8.364,3.2135,2.5105,0.9535,1.278,13.8275,10.7595,4.1525
1,YAL004W,15.0917,14.082,5.0177,4.2465,0.942,1.186,58.5257,49.5503,20.3515,...,69.3222,3.7117,6.5693,2.6632,1.9443,0.8687,1.2565,9.0859,6.5652,1.5404
2,YAL005C,17.5131,15.1479,5.37,4.6191,0.9465,1.1633,72.9691,62.4873,25.9946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,YAL007C,17.1216,14.9912,5.2408,4.6086,0.9469,1.1364,68.9804,60.2137,21.1542,...,84.2858,4.6216,7.4748,2.8422,2.4532,0.9918,1.1564,10.5986,9.096,1.6706
4,YAL008W,16.4443,14.6618,5.1068,4.537,0.95,1.129,64.5065,57.2782,21.8016,...,75.207,1.795,3.5315,1.4525,1.011,0.452,0.7185,4.4675,3.109,3.334


In [24]:
# Store the identifiers as strings in a new 'orf' column, leaving the
# source column as read
orf_source = original_data['Open reading frame']
original_data['orf'] = orf_source.astype(str)

In [25]:
# Eliminate all white spaces & capitalize (done by the clean_orf helper)
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [26]:
# Translate to ORFs
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [27]:
# Make sure everything translated ok: print the rows whose identifier is
# not accepted by looks_like_orf (an empty frame means nothing was rejected)
is_orf = looks_like_orf(original_data['orf'])
rejected_rows = original_data.loc[~is_orf,]
print(rejected_rows)

Empty DataFrame
Columns: [Open reading frame, Unbud cell Mean Area, Unbud cell Mean Perimeter, Unbud cell Mean Length, Unbud cell Mean Breadth, Unbud cell Mean ShapeFactor, Unbud cell Mean Ell. Form Factor, Unbud cell Mean Equiv. Prolate Vol., Unbud cell Mean Oblate Vol., Medium bud Mean Mother Area, Medium bud Mean Mother Perimeter, Medium bud Mean Mother Length, Medium bud Mean Mother Breadth, Medium bud Mean Mother Shape Factor, Medium bud Mean Mother Ell. Form Factor, Medium bud Mean Mother Equiv. Prolate Vol., Medium bud Mean Mother Equiv. Oblate Vol., Medium bud Mean Daughter Area, Medium bud Mean Daughter Perimeter, Medium bud Mean Daughter Length, Medium bud Mean Daughter Breadth, Medium bud Mean Daughter Shape Factor, Medium bud Mean Daughter Ell. Form Factor, Medium bud Mean Daughter Equiv. Prolate Vol., Medium bud Mean Daughter Equiv. Oblate Vol., Medium bud Mean budneck width, Large bud Mean Mother Area, Large bud Mean Mother Perimeter, Large bud Mean Mother Length, Large b

In [28]:
# Use the translated ORF as the row index
original_data = original_data.set_index('orf')

In [29]:
# The source identifier column is no longer needed once 'orf' is the index
original_data = original_data.drop(columns=['Open reading frame'])

In [30]:
# Coerce every measurement to a numeric dtype; entries that cannot be parsed
# become NaN. pd.to_numeric is applied column by column (apply's default axis)
# instead of row by row: the resulting values are the same, but it takes one
# call per column rather than one call per row.
original_data = original_data.apply(pd.to_numeric, errors='coerce')

In [31]:
# Treat exact zeros as missing values: distribution analysis suggests that
# zeros are technical artefacts, not real biological phenotypes
zero_mask = original_data == 0
original_data[zero_mask] = np.nan

In [32]:
# Collapse rows that share the same ORF (the row index) into their mean
original_data = original_data.groupby(level=0).mean()

In [33]:
# Dimensions of the table after averaging rows with the same ORF
original_data.shape

(4267, 90)

# Prepare the final dataset

In [42]:
# Work on an independent copy so the processed source table stays unchanged
data = original_data.copy(deep=True)

In [43]:
# Dataset ids for this paper form one contiguous range (last id inclusive)
first_dataset_id, last_dataset_id = 16036, 16125
dataset_ids = np.arange(first_dataset_id, last_dataset_id + 1)
# Order the dataset list to follow that id range
datasets = datasets.reindex(index=dataset_ids)

In [44]:
# Label each column as (dataset_id, 'value'), in the order of the dataset list
idx = pd.MultiIndex.from_product(
    [datasets.index.values, ['value']],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [45]:
# Preview the table with its (dataset_id, data_type) column labels
data.head()

dataset_id,16036,16037,16038,16039,16040,16041,16042,16043,16044,16045,...,16116,16117,16118,16119,16120,16121,16122,16123,16124,16125
data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
YAL002W,16.4598,14.5957,5.1599,4.4408,0.9444,1.1684,66.8069,57.6282,20.716,16.6978,...,78.7655,5.328,8.364,3.2135,2.5105,0.9535,1.278,13.8275,10.7595,4.1525
YAL004W,15.0917,14.082,5.0177,4.2465,0.942,1.186,58.5257,49.5503,20.3515,16.6896,...,69.3222,3.7117,6.5693,2.6632,1.9443,0.8687,1.2565,9.0859,6.5652,1.5404
YAL005C,17.5131,15.1479,5.37,4.6191,0.9465,1.1633,72.9691,62.4873,25.9946,18.6444,...,,,,,,,,,,
YAL007C,17.1216,14.9912,5.2408,4.6086,0.9469,1.1364,68.9804,60.2137,21.1542,16.9962,...,84.2858,4.6216,7.4748,2.8422,2.4532,0.9918,1.1564,10.5986,9.096,1.6706
YAL008W,16.4443,14.6618,5.1068,4.537,0.95,1.129,64.5065,57.2782,21.8016,17.218,...,75.207,1.795,3.5315,1.4525,1.011,0.452,0.7185,4.4675,3.109,3.334


## Subset to the genes currently in SGD

In [46]:
# Load the gene table and key it by systematic name, keeping 'id' as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs absent from the gene
# table come back as NaN
orfs_in_data = data.index.values
gene_ids = genes.reindex(index=orfs_in_data)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [47]:
# Attach the gene id found for each ORF and keep only the ORFs that have one.
data['gene_id'] = gene_ids
# .copy() makes the filtered table an independent frame, so the dtype
# conversion below is not an assignment into a slice of the unfiltered table
# (the pattern that triggers pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16036,16037,16038,16039,16040,16041,16042,16043,16044,16045,...,16116,16117,16118,16119,16120,16121,16122,16123,16124,16125
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,16.4598,14.5957,5.1599,4.4408,0.9444,1.1684,66.8069,57.6282,20.716,16.6978,...,78.7655,5.328,8.364,3.2135,2.5105,0.9535,1.278,13.8275,10.7595,4.1525
1863,YAL004W,15.0917,14.082,5.0177,4.2465,0.942,1.186,58.5257,49.5503,20.3515,16.6896,...,69.3222,3.7117,6.5693,2.6632,1.9443,0.8687,1.2565,9.0859,6.5652,1.5404
4,YAL005C,17.5131,15.1479,5.37,4.6191,0.9465,1.1633,72.9691,62.4873,25.9946,18.6444,...,,,,,,,,,,
5,YAL007C,17.1216,14.9912,5.2408,4.6086,0.9469,1.1364,68.9804,60.2137,21.1542,16.9962,...,84.2858,4.6216,7.4748,2.8422,2.4532,0.9918,1.1564,10.5986,9.096,1.6706
6,YAL008W,16.4443,14.6618,5.1068,4.537,0.95,1.129,64.5065,57.2782,21.8016,17.218,...,75.207,1.795,3.5315,1.4525,1.011,0.452,0.7185,4.4675,3.109,3.334


# Normalize

In [48]:
# Compute the normalized scores from the raw values with the shared helper.
# NOTE(review): normalize_phenotypic_scores is not defined in this notebook
# (presumably provided by the %run'd utilities); confirm there what
# has_tested=True implies for missing values.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [49]:
# Assign proper column names: (dataset_id, 'valuez'), in the order of the
# dataset list
idx = pd.MultiIndex.from_product(
    [datasets.index.values, ['valuez']],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [50]:
# Blank out normalized scores wherever the raw value is missing.
# Assignment with a boolean DataFrame aligns the mask to the target on row and
# column labels. The raw table's columns are labelled (dataset_id, 'value')
# while data_norm's are (dataset_id, 'valuez'), so the mask is relabelled to
# 'valuez' first; with the 'value' labels no column would match and the
# assignment would leave data_norm unchanged.
missing_mask = data.isnull().rename(columns={'value': 'valuez'}, level='data_type')
data_norm[missing_mask] = np.nan
# Put raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16036,16037,16038,16039,16040,16041,16042,16043,16044,16045,...,16116,16117,16118,16119,16120,16121,16122,16123,16124,16125
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,YAL002W,16.4598,14.5957,5.1599,4.4408,0.9444,1.1684,66.8069,57.6282,20.716,16.6978,...,-0.107933,0.859506,0.378298,0.383625,0.414772,-0.054623,0.391192,0.940514,0.971374,2.913516
1863,YAL004W,15.0917,14.082,5.0177,4.2465,0.942,1.186,58.5257,49.5503,20.3515,16.6896,...,-0.40812,-0.387536,-0.551745,-0.371177,-0.564903,-0.409472,0.314809,-0.195399,-0.221206,-0.313453
4,YAL005C,17.5131,15.1479,5.37,4.6191,0.9465,1.1633,72.9691,62.4873,25.9946,18.6444,...,,,,,,,,,,
5,YAL007C,17.1216,14.9912,5.2408,4.6086,0.9469,1.1364,68.9804,60.2137,21.1542,16.9962,...,0.067549,0.31449,-0.0825,-0.125657,0.315628,0.105645,-0.040817,0.166989,0.498385,-0.152605
6,YAL008W,16.4443,14.6618,5.1068,4.537,0.95,1.129,64.5065,57.2782,21.8016,17.218,...,-0.221052,-1.866349,-2.125981,-2.031796,-2.179759,-2.153168,-1.596544,-1.301797,-1.20392,1.902347


# Print out

In [51]:
# Write one tab-separated file per data type, with dataset names as column
# headers and ORFs as the only row label
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = '%s_%s.txt' % (paper_name, data_type)
    table.to_csv(out_path, sep='\t')

# Save to DB

In [52]:
# Import only the function used below rather than every public name of the
# module, so the notebook namespace is not silently overwritten
from IO.save_data_to_db3 import save_data_to_db

In [53]:
# Store the combined value/valuez table in the database under this paper's PMID.
# Per the logged output, existing datasets for the PMID are deleted before the
# new data are inserted.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 20065090...


  0%|          | 0/90 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 90/90 [09:26<00:00,  6.29s/it]

Updating the data_modified_on field...



