In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed.
# paper_pmid is used to build the datasets-list file name and is passed to the DB save step;
# paper_name is used as the prefix of the exported text files.
paper_pmid = 22384346
paper_name = 'piggott_nislow_2011' 

In [3]:
# Tab-separated, header-less list of this paper's datasets: one (dataset_id, name) pair per row
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by dataset id so it can be reindexed by id later
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [11]:
# Read the 'ORF sort' sheet of supplementary Table S3
original_data = pd.read_excel(
    'raw_data/TableS3.xlsx',
    sheet_name='ORF sort',
)

In [12]:
# Report the size of the table as loaded (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 7051 x 27


In [14]:
# Preview the first rows to check how the sheet was parsed
original_data.head()

Unnamed: 0,Gene,Name,ID,Tag,2vs1,4vs1,6vs1,8vs1,10vs1,14vs1,...,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,YAL002W,VPS8,VPS8,U,-0.841049,-7.68985,-0.141234,1.259199,-1.013566,0.640347,...,,,,,,,,,,
1,YAL004W,0,YAL004W,U,-0.294741,-1.022835,0.04994,0.16593,-0.076213,-0.115223,...,,,,,,,,,,
2,YAL005C,SSA1,SSA1,U,-1.223968,-0.898407,-0.490983,-0.038527,0.104799,-0.121057,...,,,,,,,,,,
3,YAL007C,ERP2,ERP2,U,-0.461506,0.450762,-0.414188,-0.036244,0.228332,0.136455,...,,,,,,,,,,
4,YAL008W,FUN14,FUN14,U,0.747904,-0.190254,-0.25054,-0.057264,-0.107475,-0.658144,...,,,,,,,,,,


In [15]:
# Copy the 'Gene' column into a working 'orf' column, cast to string
original_data['orf'] = original_data['Gene'].astype(str)

In [16]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably loaded by
# `%run ../yp_utils.py`); the description above is the author's -- confirm against the helper.
original_data['orf'] = clean_orf(original_data['orf'])

In [17]:
# Translate to ORFs
# NOTE(review): translate_sc is an external helper (presumably from yp_utils); with to='orf'
# it presumably maps gene names to systematic ORF names -- confirm against the helper.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [18]:
# Sanity check: list every row whose 'orf' value fails looks_like_orf.
# An empty frame means all identifiers passed.
is_orf = looks_like_orf(original_data['orf'])
print(original_data.loc[~is_orf, :])

Empty DataFrame
Columns: [Gene, Name, ID, Tag, 2vs1, 4vs1, 6vs1, 8vs1, 10vs1, 14vs1, Unnamed: 10, Unnamed: 11, Unnamed: 12, Unnamed: 13, Unnamed: 14, Unnamed: 15, Unnamed: 16, Unnamed: 17, Unnamed: 18, Unnamed: 19, Unnamed: 20, Unnamed: 21, Unnamed: 22, Unnamed: 23, Unnamed: 24, Unnamed: 25, Unnamed: 26, orf]
Index: []

[0 rows x 28 columns]


In [19]:
# Use the ORF as the row index
original_data = original_data.set_index('orf')

In [20]:
# Keep only the six score columns; everything else (names, tags, empty columns) is dropped
score_columns = ['2vs1', '4vs1', '6vs1', '8vs1', '10vs1', '14vs1']
original_data = original_data.loc[:, score_columns].copy()

In [21]:
# Collapse rows that share the same ORF into a single row by averaging their scores
original_data = original_data.groupby(level=0).mean()

In [22]:
# Size of the table after averaging rows per ORF
original_data.shape

(3713, 6)

# Load Het data

In [26]:
# Read 'Sheet1' of supplementary Table S6, skipping the two rows above the header
original_data2 = pd.read_excel(
    'raw_data/TableS6.xlsx',
    sheet_name='Sheet1',
    skiprows=2,
)

In [27]:
# Report the size of the second table as loaded (rows x columns)
n_rows2, n_cols2 = original_data2.shape
print('Original data dimensions: %d x %d' % (n_rows2, n_cols2))

Original data dimensions: 9505 x 9


In [28]:
# Preview the first rows to check how the sheet was parsed
original_data2.head()

Unnamed: 0,ORF,Name,Tag,2vs1,4vs1,6vs1,8vs1,10vs1,14vs1
0,YMR056C,AAC1,D,-0.098519,0.593711,0.501568,-0.548638,0.638022,0.038102
1,YMR056C,AAC1,U,-0.318029,0.894036,-0.029645,0.053836,0.641215,-0.459805
2,YBR085W,AAC3,U,-2.453802,-2.110001,-1.039459,-4.673432,-1.72528,-0.720205
3,YNL331C,AAD14,D,-2.168076,-1.030625,-2.732474,0.981656,-0.385349,1.946294
4,YCR107W,AAD3,U,-2.334676,2.176522,-2.072614,-3.900493,,


In [32]:
# Copy the 'ORF' column into a working 'orf' column, cast to string
original_data2['orf'] = original_data2['ORF'].astype(str)

In [33]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably loaded by
# `%run ../yp_utils.py`); the description above is the author's -- confirm against the helper.
original_data2['orf'] = clean_orf(original_data2['orf'])

In [34]:
# Translate to ORFs
# NOTE(review): translate_sc is an external helper (presumably from yp_utils); with to='orf'
# it presumably maps gene names to systematic ORF names -- confirm against the helper.
original_data2['orf'] = translate_sc(original_data2['orf'], to='orf')

In [35]:
# Sanity check: list every row whose 'orf' value fails looks_like_orf.
# An empty frame means all identifiers passed.
is_orf2 = looks_like_orf(original_data2['orf'])
print(original_data2.loc[~is_orf2, :])

Empty DataFrame
Columns: [ORF, Name, Tag, 2vs1, 4vs1, 6vs1, 8vs1, 10vs1, 14vs1, orf]
Index: []


In [36]:
# Use the ORF as the row index
original_data2 = original_data2.set_index('orf')

In [37]:
# Keep only the six score columns
score_columns2 = ['2vs1', '4vs1', '6vs1', '8vs1', '10vs1', '14vs1']
original_data2 = original_data2.loc[:, score_columns2].copy()

In [38]:
# Collapse rows that share the same ORF (e.g. one row per Tag) into a single row by averaging
original_data2 = original_data2.groupby(level=0).mean()

In [39]:
# Size of the second table after averaging rows per ORF
original_data2.shape

(5446, 6)

# Merge

In [40]:
# Outer-join the two tables on ORF so that ORFs present in only one of them are kept.
# Both tables use the same column names, hence the '_1' / '_2' suffixes.
original_data = original_data.join(
    original_data2,
    how='outer',
    lsuffix='_1',
    rsuffix='_2',
)

In [41]:
# Preview the merged table (columns from the first table, then from the second)
original_data.head()

Unnamed: 0_level_0,2vs1_1,4vs1_1,6vs1_1,8vs1_1,10vs1_1,14vs1_1,2vs1_2,4vs1_2,6vs1_2,8vs1_2,10vs1_2,14vs1_2
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
YAL001C,,,,,,,0.246153,0.024975,-0.368841,-0.133006,0.278983,-0.759745
YAL002W,-0.841049,-7.68985,-0.141234,1.259199,-1.013566,0.640347,0.996179,-0.877163,-0.871235,0.065126,1.713675,2.705321
YAL003W,,,,,,,-1.331451,1.142709,-0.122686,1.462129,0.224788,-0.668123
YAL004W,-0.294741,-1.022835,0.04994,0.16593,-0.076213,-0.115223,0.546199,-0.418754,0.175109,-0.443581,-0.157899,0.538201
YAL005C,-1.223968,-0.898407,-0.490983,-0.038527,0.104799,-0.121057,-0.346914,-0.340998,0.615853,-2.484278,-0.764022,-0.683653


In [42]:
# Size of the merged table
original_data.shape

(5672, 12)

# Prepare the final dataset

In [43]:
# Work on a copy so the merged table itself is left unchanged by the steps below
data = original_data.copy()

In [44]:
# Dataset ids, listed in the same order as the columns of the merged table
# (six '_1' columns followed by six '_2' columns). The ids are attached to the
# columns by position, so this order matters.
dataset_ids = [
    11836, 11838, 11839, 11840, 11841, 11842,
    11837, 11843, 11844, 11845, 11846, 11847,
]
datasets = datasets.reindex(dataset_ids)

In [45]:
# Relabel the columns, by position, as (dataset_id, 'value') pairs
value_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data.columns = value_columns

In [46]:
# Preview the table with its new (dataset_id, data_type) column labels
data.head()

dataset_id,11836,11838,11839,11840,11841,11842,11837,11843,11844,11845,11846,11847
data_type,value,value,value,value,value,value,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
YAL001C,,,,,,,0.246153,0.024975,-0.368841,-0.133006,0.278983,-0.759745
YAL002W,-0.841049,-7.68985,-0.141234,1.259199,-1.013566,0.640347,0.996179,-0.877163,-0.871235,0.065126,1.713675,2.705321
YAL003W,,,,,,,-1.331451,1.142709,-0.122686,1.462129,0.224788,-0.668123
YAL004W,-0.294741,-1.022835,0.04994,0.16593,-0.076213,-0.115223,0.546199,-0.418754,0.175109,-0.443581,-0.157899,0.538201
YAL005C,-1.223968,-0.898407,-0.490983,-0.038527,0.104799,-0.121057,-0.346914,-0.340998,0.615853,-2.484278,-0.764022,-0.683653


## Subset to the genes currently in SGD

In [47]:
# Load the genes table keyed by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the genes table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [48]:
# Attach the gene ids, then keep only the ORFs that have one.
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below does not write into a slice (SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# With the NaN rows gone, the ids can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,11836,11838,11839,11840,11841,11842,11837,11843,11844,11845,11846,11847
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1,YAL001C,,,,,,,0.246153,0.024975,-0.368841,-0.133006,0.278983,-0.759745
2,YAL002W,-0.841049,-7.68985,-0.141234,1.259199,-1.013566,0.640347,0.996179,-0.877163,-0.871235,0.065126,1.713675,2.705321
3,YAL003W,,,,,,,-1.331451,1.142709,-0.122686,1.462129,0.224788,-0.668123
1863,YAL004W,-0.294741,-1.022835,0.04994,0.16593,-0.076213,-0.115223,0.546199,-0.418754,0.175109,-0.443581,-0.157899,0.538201
4,YAL005C,-1.223968,-0.898407,-0.490983,-0.038527,0.104799,-0.121057,-0.346914,-0.340998,0.615853,-2.484278,-0.764022,-0.683653


# Normalize

In [49]:
# Compute the normalized scores from the raw values.
# NOTE(review): normalize_phenotypic_scores is not defined in this notebook (presumably
# loaded by `%run ../yp_utils.py`); the meaning of has_tested=True is not visible here -- confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [50]:
# Assign proper column names: relabel the columns, by position, as (dataset_id, 'valuez') pairs
valuez_columns = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = valuez_columns

In [51]:
# Re-apply the missing-value pattern of the raw scores to the normalized scores.
# Assignment through a boolean DataFrame aligns on row AND column labels. `data` is
# labelled (dataset_id, 'value') while `data_norm` is labelled (dataset_id, 'valuez'),
# so the raw mask would match no column and the assignment would silently do nothing.
# The mask columns are therefore relabelled (by position) to those of `data_norm` first.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw ('value') and normalized ('valuez') columns side by side, matched on the row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,11836,11838,11839,11840,11841,11842,11837,11843,11844,11845,...,11839,11840,11841,11842,11837,11843,11844,11845,11846,11847
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,value,value,value,value,...,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,YAL001C,,,,,,,0.246153,0.024975,-0.368841,-0.133006,...,,,,,0.316843,0.011792,-0.347664,-0.041522,0.369486,-0.838583
2,YAL002W,-0.841049,-7.68985,-0.141234,1.259199,-1.013566,0.640347,0.996179,-0.877163,-0.871235,0.065126,...,-0.177743,1.149992,-1.126047,0.574124,1.196978,-1.024149,-0.94154,0.191383,2.053873,3.254771
3,YAL003W,,,,,,,-1.331451,1.142709,-0.122686,1.462129,...,,,,,-1.534433,1.295306,-0.056686,1.833567,0.305858,-0.730348
1863,YAL004W,-0.294741,-1.022835,0.04994,0.16593,-0.076213,-0.115223,0.546199,-0.418754,0.175109,-0.443581,...,0.032735,0.03362,-0.152013,-0.233972,0.668939,-0.49775,0.295336,-0.406605,-0.143432,0.694707
4,YAL005C,-1.223968,-0.898407,-0.490983,-0.038527,0.104799,-0.121057,-0.346914,-0.340998,0.615853,-2.484278,...,-0.562809,-0.175157,0.036082,-0.240212,-0.379105,-0.408462,0.816336,-2.805456,-0.855046,-0.748695


# Print out

In [52]:
# Write one tab-separated file per data type (raw and normalized values)
for data_type in ('value', 'valuez'):
    # Select the columns of this data type; the data_type level is dropped by xs
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Replace the dataset ids with the dataset names (same order)
    table.columns = datasets['name'].values
    # Keep only the ORF in the row index
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [53]:
# Import only the function that is used below, instead of every public name of the
# module, so the notebook namespace is not silently overwritten by the import.
from IO.save_data_to_db3 import save_data_to_db

In [54]:
# Store the combined table in the database under this paper's PMID.
# Per the recorded output, this first deletes all existing datasets for the PMID,
# then inserts the new data and updates the data_modified_on field.
save_data_to_db(data_all, paper_pmid)

Deleting all datasets for PMID 22384346...


  0%|          | 0/12 [00:00<?, ?it/s]

Inserting the new data...


100%|██████████| 12/12 [01:47<00:00,  9.00s/it]

Updating the data_modified_on field...



