In [1]:
# Run the shared utilities script inside this notebook's namespace.
# Names used below without a local import or definition (pd, np, clean_orf,
# translate_sc, looks_like_orf, path_to_genes, normalize_phenotypic_scores)
# presumably come from this script - TODO confirm.
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed by this notebook.
paper_pmid = 22511765  # used to locate the datasets list and passed to save_data_to_db
paper_name = 'kim_cunningham_2012'  # prefix of the exported .txt files

In [3]:
# Table of dataset ids and names for this paper (tab-separated, no header row)
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the table by dataset id so that rows can be selected by id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read the 'sup table 1' sheet, skipping the first spreadsheet row
raw_data_path = 'raw_data/jbc.M112.363390-1.xlsx'
original_data = pd.read_excel(raw_data_path, sheet_name='sup table 1', skiprows=1)

In [9]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4847 x 29


In [10]:
# Preview the first rows to check how the sheet was parsed
original_data.head()

Unnamed: 0,HD-FK,LD+FK,Fig. 2,group,ORF,Name,TM,TMFK,TM∆,DT,...,Unnamed: 19,ID,batch,plate,col,row,Unnamed: 25,plate.1,col.1,row.1
0,7.0,,1.0,HACS,YGR217W,CCH1,82.8,96.3,78.488372,51.4,...,,1294,3,14,D,11,12.0,2.0,C,8.0
1,3.0,,2.0,HACS,YNL291C,MID1,89.8,95.7,57.843137,75.9,...,,470,2,5,H,3,12.0,1.0,E,4.0
2,39.0,,3.0,HACS,YLR443W,ECM7,45.9,95.7,92.051756,16.1,...,,3473,8,37,B,6,12.0,4.0,C,8.0
3,1.0,,4.0,CWI,YHR030C,SLT2,82.8,93.9,64.534884,93.4,...,,1343,3,14,H,12,12.0,2.0,D,5.0
4,20.0,,5.0,CWI,YDL203C,ACK1,43.8,77.2,59.430605,24.5,...,,3598,8,38,D,11,12.0,4.0,D,5.0


In [11]:
# Add a string-typed working copy of the 'ORF' column; 'ORF' itself is left untouched
original_data = original_data.assign(orf=original_data['ORF'].astype(str))

In [12]:
# Normalize the identifiers: eliminate all white spaces & capitalize
cleaned_orfs = clean_orf(original_data['orf'])
original_data['orf'] = cleaned_orfs

In [15]:
# Correct one malformed identifier by hand: 'YLR287-A' -> 'YLR287C-A'
needs_fix = original_data['orf'] == 'YLR287-A'
original_data.loc[needs_fix, 'orf'] = 'YLR287C-A'

In [16]:
# Translate the identifiers to ORFs
translated_orfs = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = translated_orfs

In [17]:
# Print the rows whose identifier does not look like an ORF;
# an empty frame means everything translated ok
is_orf = looks_like_orf(original_data['orf'])
print(original_data.loc[~is_orf])

Empty DataFrame
Columns: [HD-FK, LD+FK, Fig. 2, group, ORF, Name, TM, TMFK, TM∆, DT, DTFK, DT∆, Unnamed: 12, Z(lnTM), Z(lnTMFK), Z(TM∆), Z(lnDT), Z(lnDTFK), Z(DT∆), Unnamed: 19, ID, batch, plate, col, row, Unnamed: 25, plate.1, col.1, row.1, orf]
Index: []

[0 rows x 30 columns]


In [21]:
# Use the ORF as the row index
original_data = original_data.set_index('orf')

In [22]:
# Keep only the two Z-score columns that are exported as datasets
score_columns = ['Z(lnTM)', 'Z(lnTMFK)']
original_data = original_data.loc[:, score_columns].copy()

In [23]:
# Collapse ORFs that occur more than once by averaging their rows
original_data = original_data.groupby(level=0).mean()

In [24]:
# Dimensions after averaging rows that share an ORF
original_data.shape

(4760, 2)

# Prepare the final dataset

In [25]:
# Work on a copy so that original_data keeps the per-ORF averages unchanged
data = original_data.copy()

In [26]:
# Dataset ids, listed in the same order as the columns of `data`
# (the ids are assigned to those columns by position when the column
# MultiIndex is built).
dataset_ids = [19, 243]

# reindex() silently inserts all-NaN rows for ids that are not in the datasets
# list, which would later surface as NaN dataset names in the exported files.
# Fail early instead.
unknown_ids = [dataset_id for dataset_id in dataset_ids if dataset_id not in datasets.index]
if unknown_ids:
    raise KeyError('Dataset ids missing from the datasets list: %s' % unknown_ids)

datasets = datasets.reindex(index=dataset_ids)

In [27]:
# Label the columns of `data` as (dataset_id, 'value') pairs, by position
value_columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * datasets.shape[0]],
    names=['dataset_id', 'data_type'],
)
data.columns = value_columns

In [28]:
# Preview the table with its (dataset_id, data_type) column labels
data.head()

dataset_id,19,243
data_type,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2
YAL002W,1.253773,2.266779
YAL004W,3.350862,1.551318
YAL005C,3.066006,1.451332
YAL007C,0.277676,0.842089
YAL008W,-0.786175,0.378658


## Subset to the genes currently in SGD

In [29]:
# Gene table keyed by systematic name, with the original 'id' kept as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)

# Gene id for every ORF in `data`; ORFs without a match come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 23


In [30]:
# Attach the gene ids and keep only the ORFs that have one
data['gene_id'] = gene_ids
has_gene_id = data['gene_id'].notnull()
data = data.loc[has_gene_id]

# Cast the ids to integers and index the rows by (gene_id, orf)
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,19,243
Unnamed: 0_level_1,data_type,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,1.253773,2.266779
1863,YAL004W,3.350862,1.551318
4,YAL005C,3.066006,1.451332
5,YAL007C,0.277676,0.842089
6,YAL008W,-0.786175,0.378658


# Normalize

In [31]:
# Normalize the scores with a helper that is not defined in this notebook
# (presumably loaded from yp_utils.py - TODO confirm).
# NOTE(review): has_tested=True presumably tells the helper that `data` already
# contains exactly the tested genes - confirm against the helper's definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [32]:
# Assign proper column names: (dataset_id, 'valuez') pairs, by position
valuez_columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * datasets.shape[0]],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = valuez_columns

In [33]:
# Re-apply the missing values of `data` to the normalized scores.
# A boolean DataFrame used as a mask is aligned on its labels first. The
# 'value' columns of `data` never match the 'valuez' columns of `data_norm`,
# so masking with data.isnull() directly leaves data_norm unchanged.
# Relabel the mask columns by position (both frames list the datasets in the
# same order) so that the mask actually applies; rows are still matched by label.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Raw and normalized scores side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,19,243,19,243
Unnamed: 0_level_1,data_type,value,value,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,1.253773,2.266779,1.408607,2.121214
1863,YAL004W,3.350862,1.551318,3.471909,1.439936
4,YAL005C,3.066006,1.451332,3.191642,1.344727
5,YAL007C,0.277676,0.842089,0.448236,0.764592
6,YAL008W,-0.786175,0.378658,-0.598475,0.323302


# Print out

In [34]:
# Write one tab-separated file per data type, with dataset names as column headers
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    # Rows are identified by ORF only in the exported file
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [35]:
# Import only the function that is used below instead of every public name of
# the module, so that the notebook namespace is not silently overwritten.
from IO.save_data_to_db3 import save_data_to_db

In [36]:
# NOTE: according to the log printed by this call, all datasets stored for this
# PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting all datasets for PMID 22511765...
Inserting the new data...


100%|██████████| 2/2 [00:15<00:00,  7.97s/it]

Updating the data_modified_on field...



