In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed in this notebook
paper_name = 'galvan_marquez_smith_2013'
paper_pmid = 23624539

In [3]:
# Dataset IDs and names for this paper (tab-separated file without a header row)
datasets = pd.read_csv(
    'extras/YeastPhenome_%d_datasets_list.txt' % paper_pmid,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Index the dataset list by dataset_id
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw data as provided: first sheet of the Excel workbook
original_data = pd.read_excel(
    'raw_data/Chitosan Effect on GDA (raw data). Exp. 1 to 3. Imelda Galvan, 2013-1.xlsx',
    sheet_name='Sheet1',
)

In [6]:
# Report the number of rows and columns in the raw table
print('Original data dimensions: %d x %d' % tuple(original_data.shape))

Original data dimensions: 6144 x 30


In [7]:
# Preview the first five rows of the raw table
original_data.head(n=5)

Unnamed: 0,Gene,Systematic Name,Plate X,Plate Y,Exp,Plate,Row,Col,Control Area,Test Area,...,Ratio.1,Std Deviation.1,Candidate.1,Unnamed: 23,Exp.2,Control Area.2,Test Area.2,Ratio.2,Std Deviation.2,Candidate.2
0,TCI1,YDR161W,1,1,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,,3.0,0.0,0.0,0.0,0.0,0.0
1,YCR017C,YCR017C,1,2,1.0,1.0,1.0,2.0,679.0,364.0,...,0.49,-0.82,0.0,,3.0,679.0,0.0,0.0,0.0,0.0
2,NBP2,YDR162C,1,3,1.0,1.0,1.0,3.0,526.0,414.0,...,0.76,1.12,0.0,,3.0,526.0,265.0,0.5,-0.63,0.0
3,MAK32,YCR019W,1,4,1.0,1.0,1.0,4.0,688.0,566.0,...,0.88,2.07,0.0,,3.0,688.0,521.0,0.76,1.28,0.0
4,YDR163W,YDR163W,1,5,1.0,1.0,1.0,5.0,583.0,374.0,...,0.48,-0.91,0.0,,3.0,583.0,79.0,0.14,-3.4,1.0


In [8]:
# Start the 'orf' column from the systematic names, cast to strings
original_data = original_data.assign(
    orf=original_data['Systematic Name'].astype(str)
)

In [9]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_orf is not defined in this notebook (presumably it comes from the
# `%run ../yp_utils.py` cell); the description above is taken on trust -- confirm against its source.
original_data['orf'] = clean_orf(original_data['orf'])

In [15]:
# Manual correction: rewrite the identifier 'YPL072WA' as 'YPL072W'
is_ypl072wa = original_data['orf'] == 'YPL072WA'
original_data.loc[is_ypl072wa, 'orf'] = 'YPL072W'

In [16]:
# Translate to ORFs
# NOTE(review): translate_sc is not defined in this notebook (presumably it comes from the
# `%run ../yp_utils.py` cell); what it does with names it cannot translate is not visible here.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [17]:
# Sanity check: show the rows whose 'orf' value does not pass looks_like_orf.
# `t` is reused by the filtering cell further down.
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t, :])

            Gene Systematic Name  Plate X  Plate Y  Exp  Plate  Row   Col  \
index_input                                                                 
2716         NaN             NaN        2        5  1.0    8.0  2.0   5.0   
2718         NaN             NaN        2        7  1.0    8.0  2.0   7.0   
2720         NaN             NaN        2        9  1.0    8.0  2.0   9.0   
2722         NaN             NaN        2       11  1.0    8.0  2.0  11.0   
2724         NaN             NaN        2       13  1.0    8.0  2.0  13.0   
...          ...             ...      ...      ...  ...    ...  ...   ...   
6139         NaN             NaN       16       20  NaN    NaN  NaN   NaN   
6140         NaN             NaN       16       21  NaN    NaN  NaN   NaN   
6141         NaN             NaN       16       22  NaN    NaN  NaN   NaN   
6142         NaN             NaN       16       23  NaN    NaN  NaN   NaN   
6143         NaN             NaN       16       24  NaN    NaN  NaN   NaN   

In [19]:
# Keep only the rows flagged as valid ORFs by the check above.
# .copy() makes the result an independent frame: a later cell adds a 'data' column to it,
# and assigning into the un-copied result of a boolean selection triggers
# pandas' SettingWithCopyWarning.
original_data = original_data.loc[t, :].copy()

In [20]:
# Every column whose name contains 'Ratio' (one per experiment in the preview above)
data_cols = list(filter(lambda col_name: 'Ratio' in col_name, original_data.columns))

In [23]:
# Per-row average across the ratio columns
original_data['data'] = original_data.loc[:, data_cols].mean(axis='columns')

In [22]:
# Index the table by ORF
original_data = original_data.set_index('orf')

In [24]:
# Drop everything except the averaged 'data' column (kept as a one-column frame)
original_data = original_data.loc[:, ['data']].copy()

In [25]:
# Collapse rows that share the same index label by averaging them
original_data = original_data.groupby(level=0).mean()

In [26]:
# Display (rows, columns) of the aggregated table
original_data.shape

(4645, 1)

# Prepare the final dataset

In [27]:
# Work on a separate copy so that original_data stays untouched from here on
data = original_data.copy(deep=True)

In [28]:
# Dataset IDs corresponding, in order, to the columns of `data`
dataset_ids = [129]
datasets = datasets.reindex(dataset_ids, axis='index')

In [29]:
# Label each column of `data` with a (dataset_id, data_type) pair, data_type being 'value'
value_dataset_ids = datasets.index.values
data.columns = pd.MultiIndex.from_arrays(
    [value_dataset_ids, ['value'] * len(value_dataset_ids)],
    names=['dataset_id', 'data_type'],
)

In [30]:
# Preview the relabelled table
data.head(n=5)

dataset_id,129
data_type,value
orf,Unnamed: 1_level_2
YAL002W,0.29
YAL004W,0.63
YAL005C,0.576667
YAL007C,0.533333
YAL008W,0.62


## Subset to the genes currently in SGD

In [31]:
# Load the gene table keyed by systematic name, then look up the 'id' of every ORF in `data`.
# reindex yields NaN for ORFs that have no row in the gene table.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 21


In [32]:
# Attach the gene IDs and keep only the ORFs that were found in the gene table
data['gene_id'] = gene_ids
# .copy() makes the filtered frame independent, so the dtype cast on the next line does not
# assign into the un-copied result of a boolean selection (pandas' SettingWithCopyWarning).
data = data.loc[data['gene_id'].notnull()].copy()
# The NaNs are gone, so the IDs can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Final row index: (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,129
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,0.29
1863,YAL004W,0.63
4,YAL005C,0.576667
5,YAL007C,0.533333
6,YAL008W,0.62


# Normalize

In [33]:
# Compute the normalized scores from the raw values.
# NOTE(review): normalize_phenotypic_scores is not defined in this notebook (presumably it comes
# from the `%run ../yp_utils.py` cell); the meaning of has_tested=True is not visible here -- confirm.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [34]:
# Assign proper column names: (dataset_id, data_type) pairs, data_type being 'valuez'
valuez_dataset_ids = datasets.index.values
data_norm.columns = pd.MultiIndex.from_arrays(
    [valuez_dataset_ids, ['valuez'] * len(valuez_dataset_ids)],
    names=['dataset_id', 'data_type'],
)

In [35]:
# Blank out the normalized score wherever the raw value is missing.
# A boolean DataFrame mask is aligned on row AND column labels before it is applied. `data`
# is labelled (dataset_id, 'value') and `data_norm` (dataset_id, 'valuez'), so the raw
# `data.isnull()` mask matches no column of `data_norm` and the assignment silently does
# nothing. Relabelling the mask with data_norm's columns makes it apply, while the rows
# are still matched by index label.
# NOTE(review): this pairs columns by position; it assumes normalize_phenotypic_scores
# returns the columns in the same order as `data` -- confirm.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Raw and normalized values side by side, joined on the shared row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,129,129
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,0.29,-0.267124
1863,YAL004W,0.63,-0.061794
4,YAL005C,0.576667,-0.094003
5,YAL007C,0.533333,-0.120172
6,YAL008W,0.62,-0.067833


# Print out

In [36]:
# Write one tab-separated file per data type: <paper_name>_value.txt and <paper_name>_valuez.txt
for data_type in ('value', 'valuez'):
    # Columns of this data type only, relabelled with the dataset names
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    # Rows are identified by ORF alone in the exported file
    table = table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    table.to_csv(out_path, sep='\t')

# Save to DB

In [37]:
# Import only the function used below; a wildcard import would pull every public name of
# the module into the notebook namespace and could silently shadow existing variables.
from IO.save_data_to_db3 import save_data_to_db

In [38]:
# Push the combined table to the database for this PMID.
# NOTE(review): judging by the log printed below, this deletes all existing datasets for the
# PMID before inserting the new data -- re-running the cell replaces what is stored.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 23624539...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.65s/it]

Updating the data_modified_on field...



