In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID and short name of the paper; both are reused below to build file names
paper_pmid, paper_name = 17220322, 'kitagawa_akada_2007'

In [3]:
# Header-less, tab-separated list of (dataset_id, name) pairs for this paper
datasets = pd.read_csv(
    ''.join(['extras/YeastPhenome_', str(paper_pmid), '_datasets_list.txt']),
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by its id so it can be reindexed by dataset id later on
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [8]:
# Raw table from the paper, as stored in the local spreadsheet
original_data = pd.read_excel(
    io='raw_data/Table1.xlsx',
    sheet_name='Sheet1',
)

In [9]:
# Report rows x columns of the table as loaded
print('Original data dimensions: %d x %d' % (original_data.shape[0], original_data.shape[1]))

Original data dimensions: 71 x 7


In [10]:
# Preview the first rows; the top rows hold sub-headers and category labels rather than genes
original_data.head()

Unnamed: 0.1,Unnamed: 0,Genea,Function,% of wild typeb:,Unnamed: 4,DNA contentc,Cell sized
0,,,,,,,
1,,,,Viability,Growth,,
2,,DNA metabolism,,,,,
3,DDC1,DDC1,DNA damage checkpoint protein,12,67,=,++
4,HPR5,HPR5,DNA helicase,5,40,≪,++


In [12]:
# Gene names live in the first unnamed spreadsheet column; work on a string copy of it
original_data['genes'] = original_data.loc[:, 'Unnamed: 0'].astype(str)

In [14]:
# Eliminate all white spaces & capitalize
# NOTE(review): clean_genename is defined outside this notebook, so the description above
# is the author's stated intent and cannot be verified from here.
original_data['genes'] = clean_genename(original_data['genes'])

In [15]:
# Translate the cleaned gene names to ORFs with the external translate_sc helper.
# Entries that do not end up looking like an ORF are inspected and dropped in the next steps.
original_data['orf'] = translate_sc(original_data['genes'], to='orf')

In [16]:
# Sanity check on the translation: t marks the rows accepted by looks_like_orf,
# and the rejected rows are printed for manual inspection
t = looks_like_orf(original_data['orf'])
print(original_data[~t])

            Unnamed: 0                                  Genea Function  \
index_input                                                              
0                  NaN                                    NaN      NaN   
1                  NaN                                    NaN      NaN   
2                  NaN                         DNA metabolism      NaN   
28                 NaN                 Chromosome maintenance      NaN   
43                 NaN  Vesicular traffic and ion homeostasis      NaN   
55                 NaN                         RNA catabolism      NaN   
59                 NaN                          Morphogenesis      NaN   
64                 NaN                    Protein translation      NaN   
67                 NaN                      Nuclear transport      NaN   
69                 NaN                                Unknown      NaN   

            % of wild typeb: Unnamed: 4 DNA contentc Cell sized genes  orf  
index_input                       

In [17]:
# Keep only the rows whose ORF passed the check above. The explicit copy makes the result
# an independent frame, so adding columns to it afterwards does not raise
# SettingWithCopyWarning.
original_data = original_data.loc[t].copy()

In [18]:
# Every gene retained above receives the same score of -1
# NOTE(review): presumably -1 encodes a reduced phenotype relative to wild type -- confirm against the paper
original_data['data'] = -1

In [19]:
# Use the ORF as the row label from here on
original_data = original_data.set_index('orf')

In [20]:
# Drop everything except the score column (kept as a one-column frame)
original_data = original_data.loc[:, ['data']].copy()

In [21]:
# Collapse rows that share the same ORF by averaging their scores
original_data = original_data.groupby(level=0).mean()

In [22]:
# Number of unique ORFs x number of score columns after collapsing duplicates
original_data.shape

(61, 1)

# Prepare the final dataset

In [23]:
# Work on an independent copy so the processed table above stays untouched
data = original_data.copy(deep=True)

In [24]:
# Dataset id(s) matching, in order, the score column(s) of `data`
dataset_ids = [1303]
# Restrict and order the datasets table accordingly
datasets = datasets.reindex(dataset_ids)

In [25]:
# Label each score column with a (dataset_id, 'value') pair
tuples = [(dataset_id, 'value') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data.columns = idx

In [26]:
# Preview the table with its two-level (dataset_id, data_type) column header
data.head()

dataset_id,1303
data_type,value
orf,Unnamed: 1_level_2
YBR098W,-1
YCL016C,-1
YCL061C,-1
YCR009C,-1
YCR044C,-1


## Subset to the genes currently in SGD

In [27]:
# Load the genes table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data, in the same order as data.index
gene_ids = genes['id'].reindex(data.index.values).values
# ORFs absent from the genes table come back as NaN after the reindex
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [28]:
# Attach the gene ids looked up above (gene_ids is ordered like data.index)
data['gene_id'] = gene_ids
# Keep only the ORFs that were found in the genes table
data = data.loc[data['gene_id'].notnull()]
# Rows without an id are gone, so the ids can be cast to integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf), leaving only the dataset columns
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,1303
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
293,YBR098W,-1
507,YCL016C,-1
546,YCL061C,-1
565,YCR009C,-1
601,YCR044C,-1


# Normalize

In [29]:
# Compute the normalized scores with the external normalize_phenotypic_scores helper
# NOTE(review): has_tested=False presumably signals that no list of tested strains is available -- confirm
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [30]:
# Label each normalized column with a (dataset_id, 'valuez') pair
tuples = [(dataset_id, 'valuez') for dataset_id in datasets.index.values]
idx = pd.MultiIndex.from_tuples(tuples, names=['dataset_id','data_type'])
data_norm.columns = idx

In [31]:
# Blank out the normalized score wherever the raw value is missing.
# Assignment through a boolean frame aligns on row AND column labels, so the mask has to
# carry data_norm's column labels: a mask labelled with the raw ('value') columns matches
# none of the ('valuez') columns and would leave data_norm unchanged.
# Raw and normalized columns are in the same dataset order, so relabelling is positional.
missing_mask = data.isnull()
missing_mask.columns = data_norm.columns
data_norm[missing_mask] = np.nan
# Put raw and normalized scores side by side, matched on the (gene_id, orf) row index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,1303,1303
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
293,YBR098W,-1,-8.642252
507,YCL016C,-1,-8.642252
546,YCL061C,-1,-8.642252
565,YCR009C,-1,-8.642252
601,YCR044C,-1,-8.642252


# Print out

In [32]:
# Write one tab-separated file per data type: rows labelled by ORF only,
# columns labelled by dataset name
for f in ('value', 'valuez'):
    df = (
        data_all
        .xs(f, level='data_type', axis=1)
        .copy()
        .droplevel('gene_id', axis=0)
    )
    df.columns = datasets['name'].values
    df.to_csv(paper_name + '_' + f + '.txt', sep='\t')

# Save to DB

In [35]:
# from IO.save_data_to_db3 import *

In [36]:
# save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 17220322...
Inserting the new data...


100%|██████████| 1/1 [00:00<00:00,  4.04it/s]

Updating the data_modified_on field...



