In [1]:
# Execute the shared YeastPhenome utility script in this notebook's namespace.
# The cells below use pd, np, clean_genename, translate_sc, looks_like_orf,
# path_to_genes and normalize_phenotypic_scores without importing them, so they
# are presumably all provided by this script -- TODO confirm.
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to locate the datasets list file
paper_pmid = 20973990
# Short paper identifier; used below as the prefix of the output file names
paper_name = 'mira_sa_correia_2010'

In [3]:
# Datasets contributed by this paper: tab-separated file without a header row,
# two columns (dataset id, dataset name)
datasets_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the datasets table by dataset id so rows can be selected by id later
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw table from the paper's spreadsheet; the first 4 rows are skipped so the
# next row is used as the column header
original_data = pd.read_excel(
    'raw_data/1475-2859-9-79-s1.xlsx',
    sheet_name='Sheet1',
    skiprows=4,
)

In [6]:
# Report the size of the raw table (rows x columns)
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 692 x 4


In [7]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,Gene,Function1,Susceptibility to acetic acid2,Unnamed: 3
0,,,,
1,Lipid metabolism,,,
2,,,,
3,CHO2,Phosphatidylethanolamine methyltransferase (PE...,+,
4,CRD1,"Cardiolipin synthase; produces cardiolipin, wh...",++,


In [8]:
# Work on a string copy of the 'Gene' column; the original column is left untouched.
# Note: astype(str) turns missing values into the literal string 'nan'.
original_data['gene'] = original_data['Gene'].astype(str)

In [9]:
# Normalize the gene names with the shared helper
# (removes all white space and capitalizes)
gene_names_clean = clean_genename(original_data['gene'])
original_data['gene'] = gene_names_clean

In [10]:
# Map the cleaned gene names to ORF identifiers
orf_names = translate_sc(original_data['gene'], to='orf')
original_data['orf'] = orf_names

In [11]:
# Sanity check: flag every row whose translated name looks like an ORF
# and print the rows that do not, for manual inspection
t = looks_like_orf(original_data['orf'])
untranslated_rows = original_data.loc[~t, :]
print(untranslated_rows)

                                                          Gene Function1  \
index_input                                                                
0                                                          NaN       NaN   
1                                             Lipid metabolism       NaN   
2                                                          NaN       NaN   
20                                                         NaN       NaN   
21                                          Response to stress       NaN   
29                                                         NaN       NaN   
30                                      Mitochondrial function       NaN   
31                                                         NaN       NaN   
71                                                         NaN       NaN   
72           Chromatin remodelling, nucleic acid metabolism...       NaN   
171                                                        NaN       NaN   
172         

In [12]:
# Keep only the rows whose name translated to an ORF (this drops the blank rows and
# the functional-category header rows printed above).
# .copy() makes the result an independent frame, so the column assignment in the
# next cell does not write into a slice of the raw table (SettingWithCopyWarning).
original_data = original_data.loc[t, :].copy()

In [13]:
# Convert the qualitative susceptibility marks into a negative numeric score:
# one point per character ('+' -> -1, '++' -> -2, ...). Non-string cells
# (e.g. NaN for an empty cell) score 0.
# Surrounding white space is stripped first so that a stray space in the
# spreadsheet (e.g. '+ ') does not inflate the score.
original_data['data'] = original_data['Susceptibility to acetic acid2'].apply(
    lambda x: -len(x.strip()) if isinstance(x, str) else 0
)

In [14]:
# Use the ORF identifier as the row index
original_data = original_data.set_index('orf')

In [15]:
# Keep only the numeric score column (as an independent one-column frame)
score_columns = ['data']
original_data = original_data.loc[:, score_columns].copy()

In [16]:
# Collapse ORFs that appear more than once into a single row by averaging their scores
original_data = original_data.groupby(level='orf').mean()

In [17]:
# Size of the table after averaging duplicate ORFs (rows = unique ORFs)
original_data.shape

(625, 1)

# Load & process tested strains (disabled: all cells in this section are commented out, so no tested-strain list is applied)

In [18]:
# tested = pd.read_excel('raw_data/List of strains tested.xlsx', sheet_name='Tabelle2')

In [19]:
# tested.head()

In [20]:
# tested['orf'] = tested['ORF'].astype(str)

In [21]:
# tested['orf'] = clean_orf(tested['orf'])

In [22]:
# tested['orf'] = translate_sc(tested['orf'], to='orf')

In [23]:
# # Make sure everything translated ok
# t = looks_like_orf(tested['orf'])
# print(tested.loc[~t,])

In [24]:
# tested_orfs = tested['orf'].unique()

In [25]:
# missing = [orf for orf in original_data.index.values if orf not in tested_orfs]
# missing

In [26]:
# tested_orfs = list(tested_orfs) + missing

In [27]:
# original_data = original_data.reindex(index=tested_orfs, fill_value=0)

# Prepare the final dataset

In [28]:
# Work on a copy so the processed per-ORF table stays available unchanged
data = original_data.copy()

In [29]:
# Dataset ids, in the same order as the columns of `data`
dataset_ids = [101]
# Restrict (and order) the datasets table to exactly those ids
datasets = datasets.reindex(dataset_ids)

In [30]:
# Label each column with a (dataset_id, data_type) pair; raw scores get data_type 'value'
n_datasets = datasets.shape[0]
data.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['value'] * n_datasets],
    names=['dataset_id','data_type'],
)

In [31]:
# Preview the table with its new (dataset_id, data_type) column labels
data.head()

dataset_id,101
data_type,value
orf,Unnamed: 1_level_2
YAL002W,-1.0
YAL011W,-1.0
YAL012W,-1.0
YAL013W,-1.0
YAL021C,-2.0


## Subset to the genes currently in SGD

In [32]:
# Load the gene table and re-key it by systematic name, keeping 'id' as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in `data`; ORFs absent from the gene table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [33]:
# Attach the gene ids looked up in the previous cell (they follow the row order of `data`)
data['gene_id'] = gene_ids
# Drop the ORFs for which no gene id was found
data = data.loc[data['gene_id'].notnull()]
# Cast the ids to int now that rows without an id have been removed
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,101
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,-1.0
9,YAL011W,-1.0
10,YAL012W,-1.0
11,YAL013W,-1.0
19,YAL021C,-2.0


# Normalize

In [34]:
# Compute the normalized scores with the shared helper.
# has_tested=False: presumably because no tested-strain list is loaded in this
# notebook (the cells that would load it are commented out) -- TODO confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [35]:
# Assign proper column names: normalized scores get data_type 'valuez'
n_datasets = datasets.shape[0]
data_norm.columns = pd.MultiIndex.from_arrays(
    [datasets.index.values, ['valuez'] * n_datasets],
    names=['dataset_id','data_type'],
)

In [36]:
# Propagate missing raw values into the normalized table.
# Boolean-frame assignment aligns the mask on row AND column labels. The raw table is
# labelled (dataset_id, 'value') while data_norm is labelled (dataset_id, 'valuez'), so
# using data.isnull() directly matches no column and silently masks nothing.
# Relabel the mask with data_norm's columns (same datasets, same order) so that it
# actually applies; rows are still aligned by their (gene_id, orf) labels.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put raw ('value') and normalized ('valuez') columns side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,101,101
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,-1.0,-1.71965
9,YAL011W,-1.0,-1.71965
10,YAL012W,-1.0,-1.71965
11,YAL013W,-1.0,-1.71965
19,YAL021C,-2.0,-3.4393


# Print out

In [37]:
# Write one tab-separated file per data type: rows are ORFs, columns are dataset names
for data_type in ['value','valuez']:
    out_path = paper_name + '_' + data_type + '.txt'
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Replace the dataset ids in the header with the human-readable dataset names
    table.columns = datasets['name'].values
    # Keep only the ORF as row label in the exported file
    table = table.droplevel('gene_id', axis=0)
    table.to_csv(out_path, sep='\t')

# Save to DB

In [41]:
# from IO.save_data_to_db3 import *

In [42]:
# save_data_to_db(data_all, paper_pmid)