In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the paper being processed: the PubMed ID and the short name
# that is used as the prefix of the exported files.
paper_pmid = 20691087
paper_name = 'alamgir_golshani_2010'

In [3]:
# Tab-separated, header-less list of the datasets for this paper: one row per
# dataset with its id and its name.
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset table by dataset id so rows can be looked up / reordered by id.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [26]:
# Read the raw genome-wide sheet of the supplementary workbook; the first row
# of the sheet is skipped so that the second row supplies the column headers.
original_data = pd.read_excel(
    'raw_data/1472-6769-10-6-s1.xlsx',
    sheet_name='Raw genome-wide data',
    skiprows=1,
)

In [27]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 6097 x 14


In [28]:
# Preview the first rows of the raw table to check the column layout.
original_data.head()

Unnamed: 0,Systematic Name,Standard Name,3-AT,Unnamed: 3,Unnamed: 4,Cycloheximide,Unnamed: 6,Unnamed: 7,Streptomycin,Unnamed: 9,Unnamed: 10,Neomycin,Unnamed: 12,Unnamed: 13
0,YDR161W,TCI1,18.1,-37.552,48.809291,-24.835,-7.9355,-111.29,-14.990633,59.157762,70.395611,18.373604,56.480779,37.427191
1,YCR017C,YCR017C,-6.286,-29.366,23.623414,-25.611,15.271,-8.4096,-27.312799,33.190599,28.895269,12.526707,49.755869,31.141288
2,YDR162C,NBP2,6.6342,-1.5312,23.493208,-11.695,2.4591,6.2857,-40.302128,36.033953,32.921972,-8.874513,-7.113722,-7.994117
3,YCR019W,MAK32,-0.045249,38.101,2.979396,-26.019,5.3746,20.487,-20.738572,3.551175,29.330686,22.069155,18.693827,20.381491
4,YDR163W,YDR163W,11.377,38.206,38.920663,-1.0957,13.393,46.41,-30.16587,82.581561,51.215577,-10.904799,-42.271084,-26.587941


In [29]:
# Start the 'orf' column from the systematic names, cast to strings
# (missing names become the string 'nan' and are filtered out later).
systematic_names = original_data['Systematic Name']
original_data['orf'] = systematic_names.astype(str)

In [30]:
# Normalize the ORF strings with the shared clean_orf helper
# (eliminate all white spaces & capitalize).
orf_cleaned = clean_orf(original_data['orf'])
original_data['orf'] = orf_cleaned

In [31]:
# Manual fix-up: one name is missing its hyphen in the source table.
is_ypl072wa = original_data['orf'] == 'YPL072WA'
original_data.loc[is_ypl072wa, 'orf'] = 'YPL072W-A'

In [32]:
# Translate the names to ORFs with the shared translate_sc helper.
orf_translated = translate_sc(original_data['orf'], to='orf')
original_data['orf'] = orf_translated

In [33]:
# Sanity check of the translation: `t` flags the rows whose 'orf' value looks
# like a valid ORF; print the rows that do NOT pass so they can be inspected.
t = looks_like_orf(original_data['orf'])
not_orf_rows = original_data.loc[~t, :]
print(not_orf_rows)

            Systematic Name Standard Name     3-AT  Unnamed: 3  Unnamed: 4  \
index_input                                                                  
2716                    NaN           NaN   0.0000      0.0000         0.0   
2718                    NaN           NaN   0.0000      0.0000         0.0   
2720                    NaN           NaN   0.0000      0.0000         0.0   
2722                    NaN           NaN   0.0000      0.0000         0.0   
2724                    NaN           NaN   0.0000      0.0000         0.0   
...                     ...           ...      ...         ...         ...   
6068                    NaN           NaN  -5.2861      2.6002         NaN   
6069                    NaN           NaN   0.0000      0.0000         NaN   
6070                    NaN           NaN  -1.1848     13.0710         NaN   
6071                    NaN           NaN   0.0000      0.0000         NaN   
6072                    NaN           NaN -21.7830     30.2370  

In [34]:
# Keep only the rows whose ORF passed the check above.
original_data = original_data[t]

In [35]:
# Use the ORF as the row index from here on.
original_data = original_data.set_index('orf')

In [36]:
# Drop the two leading name columns ('Systematic Name', 'Standard Name') and
# keep the 12 measurement columns that follow them (positions 2..13).
measurement_cols = slice(2, 14)
original_data = original_data.iloc[:, measurement_cols]

In [37]:
# Force every value to a number (anything unparsable becomes NaN), then flip
# the sign of the whole table.
# NOTE(review): the reason for the sign flip is not visible in this notebook —
# confirm against the paper's score definition.
numeric_data = original_data.apply(pd.to_numeric, axis=1, errors='coerce')
original_data = -numeric_data

In [38]:
# Collapse rows that share the same ORF into a single row holding their mean.
original_data = original_data.groupby(level=0).mean()

In [39]:
# Size of the table after de-duplicating ORFs (unique ORFs x measurement columns).
original_data.shape

(4645, 12)

In [43]:
# Relabel the 12 measurement columns with dataset ids. The columns come in
# four consecutive groups of three (3-AT, Cycloheximide, Streptomycin,
# Neomycin, in that order), mapped here to dataset ids 9, 10, 7 and 8.
columns_per_dataset = 3
original_data.columns = [
    dataset_id
    for dataset_id in (9, 10, 7, 8)
    for _ in range(columns_per_dataset)
]

In [44]:
# Average the columns that share a dataset id: transpose so the ids become the
# row index, take the per-id mean, and transpose back to ORFs x dataset ids.
original_data = original_data.T.groupby(level=0).mean().T

In [45]:
# Preview the processed table: one row per ORF, one column per dataset id.
original_data.head()

Unnamed: 0_level_0,7,8,9,10
orf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
YAL002W,0.361654,-10.430456,-3.522839,-5.113
YAL004W,-3.009357,-6.797788,6.142922,-22.567633
YAL005C,-3.479459,-4.360418,-1.721913,4.844467
YAL007C,9.940971,5.861502,9.731506,2.763097
YAL008W,20.969247,2.199986,-8.984067,-18.213333


# Prepare the final dataset

In [46]:
# Work on a copy so the processed table above is left untouched.
data = original_data.copy()

In [47]:
# Restrict (and order) the dataset table to the four dataset ids present in
# the data columns.
dataset_ids = [7, 8, 9, 10]
datasets = datasets.reindex(dataset_ids)

In [48]:
# Give the columns a two-level header: (dataset_id, data_type), with the
# data type set to 'value' for every dataset.
value_columns = [(dataset_id, 'value') for dataset_id in datasets.index.values]
data.columns = pd.MultiIndex.from_tuples(value_columns, names=['dataset_id','data_type'])

In [49]:
# Preview the table with its new (dataset_id, data_type) column header.
data.head()

dataset_id,7,8,9,10
data_type,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
YAL002W,0.361654,-10.430456,-3.522839,-5.113
YAL004W,-3.009357,-6.797788,6.142922,-22.567633
YAL005C,-3.479459,-4.360418,-1.721913,4.844467
YAL007C,9.940971,5.861502,9.731506,2.763097
YAL008W,20.969247,2.199986,-8.984067,-18.213333


## Subset to the genes currently in SGD

In [50]:
# Load the gene table and re-key it by systematic name (keeping 'id' as a
# regular column) so each ORF in the data can be looked up directly.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# ORFs that are absent from the gene table come back as NaN after reindexing.
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [51]:
# Attach the gene ids looked up above, drop the ORFs that have no gene id,
# and index the table by (gene_id, orf).
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: the next line assigns into the
# result, and assigning into a bare filtered selection is the pattern that
# triggers pandas' SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# The ids were floats only because of the NaNs; those rows are gone now.
data['gene_id'] = data['gene_id'].astype(int)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,7,8,9,10
Unnamed: 0_level_1,data_type,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,YAL002W,0.361654,-10.430456,-3.522839,-5.113
1863,YAL004W,-3.009357,-6.797788,6.142922,-22.567633
4,YAL005C,-3.479459,-4.360418,-1.721913,4.844467
5,YAL007C,9.940971,5.861502,9.731506,2.763097
6,YAL008W,20.969247,2.199986,-8.984067,-18.213333


# Normalize

In [52]:
# Normalize the scores with the shared helper from yp_utils.
# NOTE(review): the exact normalization and the meaning of has_tested=True are
# defined in the helper, not here — confirm there before changing this call.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [53]:
# Assign proper column names: same (dataset_id, data_type) header as the raw
# values, but with the data type set to 'valuez' for the normalized scores.
valuez_columns = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
data_norm.columns = pd.MultiIndex.from_tuples(valuez_columns, names=['dataset_id','data_type'])

In [54]:
# Blank out the normalized score wherever the raw value is missing, then put
# raw ('value') and normalized ('valuez') columns side by side.
#
# Assignment through a boolean DataFrame aligns the mask to the target by
# row AND column labels. `data` is labelled (dataset_id, 'value') while
# `data_norm` is labelled (dataset_id, 'valuez'), so using `data.isnull()`
# directly matches no column and silently masks nothing. Relabel the mask with
# data_norm's columns first (both frames list the datasets in the same order,
# taken from `datasets.index`); rows are still aligned by their index labels.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,7,8,9,10,7,8,9,10
Unnamed: 0_level_1,data_type,value,value,value,value,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2,YAL002W,0.361654,-10.430456,-3.522839,-5.113,0.021849,-0.640828,-0.244842,-0.290847
1863,YAL004W,-3.009357,-6.797788,6.142922,-22.567633,-0.20461,-0.412378,0.342729,-1.236119
4,YAL005C,-3.479459,-4.360418,-1.721913,4.844467,-0.236191,-0.259097,-0.135366,0.248408
5,YAL007C,9.940971,5.861502,9.731506,2.763097,0.665374,0.383737,0.560876,0.13569
6,YAL008W,20.969247,2.199986,-8.984067,-18.213333,1.406238,0.153472,-0.576825,-1.000308


# Print out

In [55]:
# Write one tab-separated file per data type ('value' and 'valuez'), with
# dataset names as column headers and the ORF as the only row label.
for data_type in ['value','valuez']:
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table_by_orf = table.droplevel('gene_id', axis=0)
    table_by_orf.to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [56]:
# Import only the function that is used below instead of star-importing the
# whole module, so the notebook namespace is not silently overwritten.
from IO.save_data_to_db3 import save_data_to_db

In [57]:
# Push the combined table to the database for this PMID. Per the logged
# output, this deletes the datasets already stored for the PMID before
# inserting the new data, so re-running it replaces the previous upload.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/4 [00:00<?, ?it/s]

Deleting all datasets for PMID 20691087...
Inserting the new data...


100%|██████████| 4/4 [00:29<00:00,  7.32s/it]

Updating the data_modified_on field...



