In [1]:
%run ../../Utils/yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the source paper: used below to locate the dataset list and to key the DB upload
paper_pmid = 19625222
# Prefix of the tab-delimited files written in the "Print out" section
paper_name = "de_graaf_mccullough_2009"

In [3]:
# Two-column, header-less, tab-delimited list of the datasets reported by this paper
datasets = pd.read_csv(
    'extras/YeastPhenome_%s_datasets_list.txt' % str(paper_pmid),
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the dataset list by its dataset_id so it can be reindexed by id later
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [5]:
# Raw table from the paper: one row per functional category with a space-separated gene list
original_data = pd.read_excel(
    'raw_data/Table1.xlsx',
    sheet_name='Sheet1',
)

In [6]:
# Report the size of the raw table as rows x columns
n_rows, n_cols = original_data.shape
print('Original data dimensions: %d x %d' % (n_rows, n_cols))

Original data dimensions: 4 x 2


In [7]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,Functional Category,Genes/ORFs
0,Metabolism……………………………..,SFA1 ERG3 ERG6 ERG5 PSD1 ADH1
1,Cell Cycle and DNA Processing…………………………….,SPT7 CDC50 RAD55 XRS2 RAD51 RAD4 CDC26 RAD54 M...
2,Transcription…………………………..,RPN4 SNF6 DAL81 LSM1 SWI3 SNF2 MED1
3,Miscellaneous…………………………,NBP2 VID22 ARP5 NUP84 VPS9 ECM30 OPI11 NRP1 YL...


In [9]:
# Flatten the per-category gene lists into a single list of gene names.
# split() with no argument splits on any run of whitespace, so doubled or
# trailing spaces (or other whitespace characters) in a cell cannot produce
# empty-string "genes" the way split(' ') would.
genes = [g for r in original_data['Genes/ORFs'] for g in r.split()]

In [11]:
# One row per listed gene; every gene receives the same score of -1
original_data = pd.DataFrame(
    data={
        'gene': genes,
        'data': -1,
    }
)

In [12]:
# Force the gene names to plain strings before they are cleaned
original_data = original_data.assign(gene=original_data['gene'].astype(str))

In [13]:
# Strip all white space from the gene names and capitalize them
cleaned_genes = clean_genename(original_data['gene'])
original_data['gene'] = cleaned_genes

In [14]:
# Convert gene names to ORF names and store them in a new 'orf' column
orf_names = translate_sc(original_data['gene'], to='orf')
original_data['orf'] = orf_names

In [15]:
# Sanity check: list every row whose translated name does not look like an ORF
# (an empty frame means all names translated cleanly)
t = looks_like_orf(original_data['orf'])
print(original_data.loc[~t, :])

Empty DataFrame
Columns: [gene, data, orf]
Index: []


In [16]:
# Use the ORF name as the row index
original_data = original_data.set_index('orf')

In [17]:
# Keep only the score column (as an independent one-column frame)
original_data = original_data.loc[:, ['data']].copy()

In [18]:
# Collapse ORFs that appear more than once by averaging their scores
original_data = original_data.groupby(level=0).mean()

In [19]:
# Dimensions after grouping by ORF (one row per unique ORF)
original_data.shape

(44, 1)

# Prepare the final dataset

In [20]:
# Work on an independent copy so the processed table stays untouched
data = original_data.copy(deep=True)

In [21]:
# Dataset ids, in the same order as the columns of `data`
dataset_ids = [161]
# Restrict and order the dataset list to exactly those ids
datasets = datasets.reindex(dataset_ids)

In [22]:
# Label each column with a (dataset_id, 'value') pair
lst = [datasets.index.values, ['value'] * len(datasets)]
idx = pd.MultiIndex.from_arrays(lst, names=['dataset_id', 'data_type'])
data.columns = idx

In [23]:
# Preview the table with its (dataset_id, data_type) column labels
data.head()

dataset_id,161
data_type,value
orf,Unnamed: 1_level_2
YBR081C,-1
YCR094W,-1
YDL020C,-1
YDL116W,-1
YDL167C,-1


## Subset to the genes currently in SGD

In [24]:
# Load the gene table and key it by systematic name, keeping 'id' as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the id of every ORF in the data; ORFs absent from the gene table come back as NaN
gene_ids = genes.reindex(index=data.index.values)['id'].values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 0


In [25]:
# Attach the gene ids and drop ORFs that have none.
data['gene_id'] = gene_ids
# Take an explicit copy of the filtered rows: assigning into a boolean-filtered
# frame otherwise triggers SettingWithCopyWarning whenever rows were actually dropped.
data = data.loc[data['gene_id'].notnull()].copy()
# Ids become floats when NaNs were present; restore integers now that NaNs are gone
data['gene_id'] = data['gene_id'].astype(int)
# Index rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,161
Unnamed: 0_level_1,data_type,value
gene_id,orf,Unnamed: 2_level_2
276,YBR081C,-1
642,YCR094W,-1
1899,YDL020C,-1
1994,YDL116W,-1
2046,YDL167C,-1


# Normalize

In [26]:
# Compute normalized scores with the shared helper.
# NOTE(review): has_tested=False presumably means no list of tested strains is available — confirm against yp_utils
data_norm = normalize_phenotypic_scores(data, has_tested=False)

In [27]:
# Label each normalized column with a (dataset_id, 'valuez') pair
lst = [datasets.index.values, ['valuez'] * len(datasets)]
idx = pd.MultiIndex.from_arrays(lst, names=['dataset_id', 'data_type'])
data_norm.columns = idx

In [28]:
# Propagate missing raw values into the normalized scores.
# `data` and `data_norm` carry different column labels ('value' vs 'valuez'), so a
# DataFrame mask would be aligned on labels, match no column and leave data_norm
# unchanged. A positional (NumPy) mask is applied cell by cell instead.
# Assumes data_norm has the same shape and row order as data — TODO confirm
# against normalize_phenotypic_scores.
data_norm[data.isnull().values] = np.nan

# Put raw and normalized values side by side
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,161,161
Unnamed: 0_level_1,data_type,value,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2
276,YBR081C,-1,-10.174612
642,YCR094W,-1,-10.174612
1899,YDL020C,-1,-10.174612
1994,YDL116W,-1,-10.174612
2046,YDL167C,-1,-10.174612


# Print out

In [29]:
# Write one tab-delimited file per data type (raw values and normalized values)
for f in ('value', 'valuez'):
    # Select the columns of this data type and drop gene_id from the row index
    df = (
        data_all.xs(f, level='data_type', axis=1)
        .copy()
        .droplevel('gene_id', axis=0)
    )
    # Use the dataset names as column headers
    df.columns = datasets['name'].values
    df.to_csv('%s_%s.txt' % (paper_name, f), sep='\t')

# Save to DB

In [30]:
from IO.save_data_to_db3 import *

In [31]:
# Upload the combined table to the database under this paper's PMID.
# Per the recorded output, existing datasets for the PMID are deleted before the new data is inserted.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting all datasets for PMID 19625222...
Inserting the new data...


100%|██████████| 1/1 [00:00<00:00,  1.58it/s]

Updating the data_modified_on field...



