In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed by this notebook:
# a short label used in output file names, and its PubMed ID.
paper_name = 'teng_hardwick_2013'
paper_pmid = 24211263

In [3]:
# Read the tab-separated, header-less list of datasets belonging to this paper
datasets_list_path = 'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt'
datasets = pd.read_csv(datasets_list_path, sep='\t', header=None, names=['pmid', 'name'])

In [4]:
# Use the first column ('pmid') as the row index of the dataset list
datasets = datasets.set_index('pmid')

# Load & process the data

In [5]:
# Load the '749 YKOs with low aa overgrowth' sheet of the raw Excel workbook
original_data = pd.read_excel(
    'raw_data/mmc5.xlsx',
    sheet_name='749 YKOs with low aa overgrowth',
)

In [6]:
# Report the size (rows x columns) of the freshly loaded table
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 753 x 17


In [7]:
# Give the ORF column a short name that is easier to reference in code
orf_column_renames = {'ORF name of YKOs with overgrowth phenotype': 'orfs'}
original_data = original_data.rename(columns=orf_column_renames)

In [8]:
# Force the ORF column to strings (missing entries become the text 'nan')
original_data = original_data.assign(orfs=original_data['orfs'].astype(str))

In [9]:
# Normalize the ORF identifiers: remove all white space and convert to upper case
orfs_cleaned = clean_orf(original_data['orfs'])
original_data['orfs'] = orfs_cleaned

In [10]:
# Convert every identifier to its ORF form
orfs_translated = translate_sc(original_data['orfs'], to='orf')
original_data['orfs'] = orfs_translated

In [11]:
# Sanity check of the translation: flag the entries that look like ORFs
# and show the rows that do not, so they can be inspected before removal.
t = looks_like_orf(original_data['orfs'])
not_orf_rows = original_data.loc[~t, :]
print(not_orf_rows)

            orfs Gene name Back-ground strain  plate           row  column  \
index_input                                                                  
344          NAN       NaN                NaN    NaN           NaN     NaN   
630          NAN       NaN                NaN    NaN           NaN     NaN   
751          NAN       NaN                NaN    NaN           NaN     NaN   
752          NAN       NaN                NaN    NaN  TOTAL tested     NaN   

             Number of substrains tested  Variant substrains in HR assay  \
index_input                                                                
344                                 1029                           111.0   
630                                 1709                           142.0   
751                                 1071                            76.0   
752                                 3809                             NaN   

             Variant substrains in low amino acid assay  \
index_input    

In [12]:
# Keep only the rows whose identifier looks like a valid ORF.
# The explicit copy makes the result an independent frame, so the column added
# in the next step does not trigger pandas' SettingWithCopyWarning.
original_data = original_data.loc[t, :].copy()

In [13]:
# Add a 'data' column holding the value 1 for every remaining row
original_data = original_data.assign(data=1)

In [14]:
# Index the table by ORF and label that index 'orf'
original_data = original_data.set_index('orfs').rename_axis('orf')

In [23]:
# Collapse rows that share the same ORF into a single row by averaging.
# numeric_only=True restricts the mean to numeric columns: the table still
# carries text columns (e.g. 'Gene name'), and recent pandas versions raise a
# TypeError on them instead of silently dropping them as older versions did.
original_data = original_data.groupby(original_data.index).mean(numeric_only=True)

# Prepare the final dataset

In [24]:
# Start the final dataset from an independent one-column copy of the 'data' values
data = original_data.loc[:, ['data']].copy()

In [25]:
# Restrict (and order) the dataset list to the dataset ids produced by this notebook
dataset_ids = [16393]
datasets = datasets.reindex(dataset_ids, axis='index')

In [26]:
# Label each data column with a (dataset_id, 'value') pair in a two-level column index
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [27]:
# Preview the first five rows of the relabelled table
data.head(n=5)

dataset_id,16393
data_type,value
orf,Unnamed: 1_level_2
YAL002W,1
YAL009W,1
YAL013W,1
YAL020C,1
YAL021C,1


## Subset to the genes currently in SGD

In [28]:
# Load the gene table and re-key it by systematic name, keeping 'id' as a column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the dataset; ORFs absent from the table get NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 2


In [29]:
# Attach the gene ids and drop the ORFs that could not be matched to a gene.
data['gene_id'] = gene_ids
# .copy() detaches the filtered rows from the original frame so that the
# column assignment below does not raise pandas' SettingWithCopyWarning.
data = data.loc[data['gene_id'].notnull()].copy()
# With the missing ids removed, the column can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,16393
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,1
7,YAL009W,1
11,YAL013W,1
18,YAL020C,1
19,YAL021C,1


# Normalize

In [30]:
# Compute the normalized scores from the raw values.
# NOTE(review): has_tested=False presumably means no list of tested strains
# is supplied with the data — confirm against normalize_phenotypic_scores.
data_norm = normalize_phenotypic_scores(
    data,
    has_tested=False,
)

In [31]:
# Label each normalized column with a (dataset_id, 'valuez') pair
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [32]:
# Blank out normalized scores wherever the raw value is missing.
# Boolean-frame assignment aligns the mask on row AND column labels. The raw
# columns are labelled 'value' and the normalized ones 'valuez', so a mask taken
# directly from `data` matches no column and the assignment silently does nothing.
# Relabel the mask with data_norm's columns so it is actually applied.
# NOTE(review): this assumes data_norm keeps the column order of data (both were
# labelled from the same dataset list) — confirm against normalize_phenotypic_scores.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan

# Put the raw and normalized values side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,16393,16393
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,1,2.489224
7,YAL009W,1,2.489224
11,YAL013W,1,2.489224
18,YAL020C,1,2.489224
19,YAL021C,1,2.489224


# Print out

In [33]:
# Write one tab-separated file per data type, with columns named after the
# datasets and rows keyed by ORF only (the gene_id index level is dropped).
for data_type in ('value', 'valuez'):
    out_table = data_all.xs(data_type, level='data_type', axis=1).copy()
    out_table.columns = datasets['name'].values
    out_table = out_table.droplevel('gene_id', axis=0)
    out_path = paper_name + '_' + data_type + '.txt'
    out_table.to_csv(out_path, sep='\t')

# Save to DB

In [34]:
from IO.save_data_to_db3 import *

In [35]:
# Store the combined table in the database under this paper's PMID.
# The logged output shows that existing datasets for the PMID are deleted first.
save_data_to_db(
    data_all,
    paper_pmid,
)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 24211263...
Inserting the new data...


100%|██████████| 1/1 [00:01<00:00,  1.37s/it]

Updating the data_modified_on field...



