In [1]:
# Run the shared YeastPhenome utility script inside this kernel's namespace.
# NOTE(review): later cells use pd, np, clean_orf, translate_sc, looks_like_orf,
# path_to_genes and normalize_phenotypic_scores without importing them, so they are
# presumably all defined by this script — confirm.
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# Identifiers of the publication processed by this notebook.
paper_pmid = 20333241  # used to build the datasets-list path and passed to save_data_to_db
paper_name = 'chavel_cullen_2010'  # prefix of the exported .txt files

In [3]:
# Tab-separated, header-less list of this paper's datasets: one (dataset_id, name) pair per line.
datasets_path = 'extras/YeastPhenome_%d_datasets_list.txt' % paper_pmid
datasets = pd.read_csv(
    datasets_path,
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset_id so it can be reindexed by id later on.
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Read the 'Complete Screen' sheet of the raw workbook, skipping its first two rows.
raw_workbook = 'raw_data/journal.pgen.1000883.s011.xlsx'
original_data = pd.read_excel(
    raw_workbook,
    sheet_name='Complete Screen',
    skiprows=2,
)

In [9]:
# Report the size of the raw table (rows x columns).
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 5229 x 13


In [10]:
# Preview the first rows of the raw table.
original_data.head()

Unnamed: 0,Gene,ORF,Position,Phenotype,Raw data,Normalized data,False Positives a,Comments,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,LDB16,YCL005W,16B2,Undersecreter,2.04,2.3868,,,,,,,
1,0,YCL002C,16B1,Undersecreter,2.36,2.7612,,,,,,,
2,RPS0A,YGR214W,14D9,Undersecreter,2.41,3.08721,,,,,,,
3,KIM1,YOR008C-A,70H12,Undersecreter,2.81,4.51005,,,,,,,
4,LSC2,YGR244C,26H9,Undersecreter,3.6,4.2408,,,,,,,


In [11]:
# Leave the source 'ORF' column untouched and do all clean-up on a string copy named 'orf'.
original_data['orf'] = original_data['ORF'].astype(str)

In [12]:
# Eliminate all white spaces & capitalize
# NOTE(review): the description above comes from the original comment; clean_orf is
# defined outside this notebook — confirm its exact behavior there.
original_data['orf'] = clean_orf(original_data['orf'])

In [16]:
# Manual correction of one malformed label in the source table: 'YLR287-A' -> 'YLR287C-A'.
malformed = original_data['orf'] == 'YLR287-A'
original_data.loc[malformed, 'orf'] = 'YLR287C-A'

In [17]:
# Translate to ORFs
# NOTE(review): translate_sc is defined outside this notebook; presumably it maps gene
# names to systematic ORF names when to='orf' — confirm.
original_data['orf'] = translate_sc(original_data['orf'], to='orf')

In [18]:
# Make sure everything translated ok: show the rows whose 'orf' still does not
# look like an ORF. The mask `t` is reused by the next cell to drop those rows.
t = looks_like_orf(original_data['orf'])
rejected_rows = original_data.loc[~t, :]
print(rejected_rows)

            Gene       ORF Position                        Phenotype  \
index_input                                                            
337          NaN    Blank      71G4  Not determined - Blank position   
338          NaN   Blank16     16B4  Not determined - Blank position   
339          NaN    Blank     71F10  Not determined - Blank position   
340          NaN  BlankH48     48H2  Not determined - Blank position   
341          NaN  BlankH35     35H2  Not determined - Blank position   
...          ...       ...      ...                              ...   
5224         NaN       NaN      NaN                              NaN   
5225         NaN       NaN      NaN                              NaN   
5226         NaN       NaN      NaN                              NaN   
5227         NaN       NaN      NaN                              NaN   
5228         NaN       NaN      NaN                              NaN   

             Raw data  Normalized data False Positives a Commen

In [20]:
# Keep only the rows whose 'orf' passed the looks_like_orf check (mask `t` from the previous cell).
original_data = original_data.loc[t,:]

In [23]:
# Remove false positives: keep only the rows where the 'False Positives a' column is empty.
# The explicit .copy() makes the result an independent frame, so adding the 'data'
# column in a later cell writes to a real frame instead of a slice of the previous
# one (which can raise SettingWithCopyWarning).
not_false_positive = original_data['False Positives a'].isnull()
original_data = original_data.loc[not_false_positive, :].copy()

In [24]:
# Preview the table after filtering; it now also carries the cleaned 'orf' column.
original_data.head()

Unnamed: 0_level_0,Gene,ORF,Position,Phenotype,Raw data,Normalized data,False Positives a,Comments,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,orf
index_input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,LDB16,YCL005W,16B2,Undersecreter,2.04,2.3868,,,,,,,,YCL005W
1,0,YCL002C,16B1,Undersecreter,2.36,2.7612,,,,,,,,YCL002C
2,RPS0A,YGR214W,14D9,Undersecreter,2.41,3.08721,,,,,,,,YGR214W
3,KIM1,YOR008C-A,70H12,Undersecreter,2.81,4.51005,,,,,,,,YOR008C-A
4,LSC2,YGR244C,26H9,Undersecreter,3.6,4.2408,,,,,,,,YGR244C


In [25]:
# The phenotype value is taken from the 'Normalized data' column, stored as float in a new 'data' column.
original_data['data'] = original_data['Normalized data'].astype(float)

In [26]:
# Index the rows by the cleaned ORF name.
original_data = original_data.set_index('orf')

In [27]:
# Drop every column except the phenotype value; keep the result as a one-column DataFrame.
value_columns = ['data']
original_data = original_data.loc[:, value_columns].copy()

In [28]:
# ORFs that appear in more than one row are collapsed to the mean of their values.
rows_by_orf = original_data.groupby(by=original_data.index)
original_data = rows_by_orf.mean()

In [29]:
# Number of unique ORFs (rows) left after averaging, by number of value columns.
original_data.shape

(4564, 1)

# Prepare the final dataset

In [30]:
# Build the final dataset on a copy so original_data stays unchanged.
data = original_data.copy()

In [31]:
# IDs of the datasets produced by this notebook; restrict the datasets table to them, in this order.
dataset_ids = [11]
datasets = datasets.reindex(index=dataset_ids)

In [32]:
# Label each data column with a (dataset_id, data_type) pair; these are the raw values.
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [33]:
# Preview the relabelled table.
data.head()

dataset_id,11
data_type,value
orf,Unnamed: 1_level_2
YAL002W,7.22139
YAL007C,41.42535
YAL008W,33.23169
YAL009W,33.43113
YAL010C,40.28688


## Subset to the genes currently in SGD

In [34]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column.
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene id of every ORF in the data; ORFs without a match come back as NaN.
gene_ids = genes['id'].reindex(index=data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 22


In [35]:
# Attach the gene ids looked up in the previous cell as a new column.
data['gene_id'] = gene_ids
# Drop the ORFs that had no match in the gene table (their gene_id is NaN).
data = data.loc[data['gene_id'].notnull()]
# With the NaNs gone, the ids can be stored as integers.
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf).
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,11
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
2,YAL002W,7.22139
5,YAL007C,41.42535
6,YAL008W,33.23169
7,YAL009W,33.43113
8,YAL010C,40.28688


# Normalize

In [36]:
# Compute the normalized scores; the next cell labels them 'valuez'.
# NOTE(review): normalize_phenotypic_scores is defined outside this notebook, so the
# meaning of has_tested=True cannot be seen here — confirm against its definition.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [37]:
# Assign proper column names: same dataset ids as the raw data, data_type 'valuez'.
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [38]:
# Carry missing values over from the raw data to the normalized data.
# pandas aligns a boolean DataFrame key on both index and column labels. The raw
# columns are labelled (dataset_id, 'value') and the normalized ones
# (dataset_id, 'valuez'), so masking with data.isnull() directly matches no column
# and silently leaves data_norm unchanged. Relabel the mask with data_norm's columns
# (same number and order of datasets) so it lines up; rows are still matched by index.
missing = data.isnull()
missing.columns = data_norm.columns
data_norm[missing] = np.nan

# Put the raw ('value') and normalized ('valuez') columns side by side.
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,11,11
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
2,YAL002W,7.22139,-2.640542
5,YAL007C,41.42535,1.11182
6,YAL008W,33.23169,0.212931
7,YAL009W,33.43113,0.23481
8,YAL010C,40.28688,0.986924


# Print out

In [39]:
# Write one tab-separated file per data type, named <paper_name>_<data_type>.txt,
# with rows labelled by ORF only and columns labelled by dataset name.
for data_type in ('value', 'valuez'):
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    table.columns = datasets['name'].values
    table = table.droplevel('gene_id', axis=0)
    out_path = '%s_%s.txt' % (paper_name, data_type)
    table.to_csv(out_path, sep='\t')

# Save to DB

In [40]:
from IO.save_data_to_db3 import *

In [41]:
# Store the combined value/valuez table in the database under this paper's PMID.
# Per the log printed by this cell, the existing datasets for the PMID are deleted
# before the new data are inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 20333241...
Inserting the new data...


100%|██████████| 1/1 [00:07<00:00,  7.38s/it]

Updating the data_modified_on field...



