In [1]:
%run ../yp_utils.py

# Initial setup

In [2]:
# PubMed ID of the paper; used below to locate the datasets list and when saving to the DB
paper_pmid = 22751784
# Short label for the paper; used below as the prefix of the output file names
paper_name = 'singh_babak_cowen_2012'

In [3]:
# Tab-separated list of this paper's datasets; the file has no header row,
# so the two column names are supplied here.
datasets = pd.read_csv(
    'extras/YeastPhenome_' + str(paper_pmid) + '_datasets_list.txt',
    sep='\t',
    header=None,
    names=['dataset_id', 'name'],
)

In [4]:
# Key the datasets table by dataset ID
datasets = datasets.set_index('dataset_id')

# Load & process the data

In [27]:
# Raw table from the paper's spreadsheet
original_data = pd.read_excel(
    'raw_data/Singh-Babak2012_HIPHOPData.xlsx',
    sheet_name='Sheet1',
)

In [28]:
# .shape is already a (rows, columns) tuple, so it can feed the format string directly
print('Original data dimensions: %d x %d' % original_data.shape)

Original data dimensions: 5968 x 17


In [29]:
# Preview the first rows of the raw table
original_data.head()

Unnamed: 0,strain,Gene,z_score:DMSO:2:%:YPD,ratio:DMSO:2:%:YPD,z_score:CsA:10uM:YPD,ratio:CsA10:uM:YPD,z_score:Micafungin:0.045ug/ml:YPD,ratio:Micafungin:0.045ug/ml:YPD,z_score:Micafungin:0.045ug/ml:CsA:10uM:YPD,ratio:Micafungin:0.045ug/ml:CsA:10uM:YPD,Description,feature_qualifier,GO_process,GO_function,GO_component,essential_gene,zygosity
0,YDR064W::chr4_4,RPS13,3.29052,1.14217,4.84532,1.65613,-2.10947,-0.67023,0.964944,0.35143,Protein component of the small (40S) ribosomal...,Verified,translation,structural constituent of ribosome,cytosolic small ribosomal subunit (sensu Eukar...,yes,het
1,YDR230W::chr4_5,YDR230W,1.94163,0.409216,4.48081,0.883803,4.02308,0.779177,2.19615,0.428502,,Dubious,,,,no,hom
2,YBR069C::chr2_2,TAT1,3.27743,1.08031,4.46854,1.46252,2.44488,0.816327,4.09342,1.33423,"Amino acid transport protein for valine, leuci...",Verified,amino acid transport,amino acid transporter activity,plasma membrane,no,hom
3,YOR330C::chr15_4,MIP1,3.29937,0.743917,4.15444,0.932741,2.75272,0.623232,-0.285326,-0.047535,Catalytic subunit of the mitochondrial DNA pol...,Verified,DNA-dependent DNA replication*,gamma DNA-directed DNA polymerase activity,mitochondrion,no,hom
4,YPL169C::chr16_2,MEX67,2.72995,0.679498,4.12155,1.01573,-0.1917,-0.027006,1.82973,0.461834,Poly(A)RNA binding protein involved in nuclear...,Verified,mRNA export from nucleus,protein binding*,cytoplasm*,yes,het


In [30]:
# Split the raw table by the 'zygosity' column: 'hom' rows first, 'het' rows second.
# Each part is copied so it can be modified independently of the raw table.
original_data1, original_data2 = (
    original_data.loc[original_data['zygosity'] == zygosity, :].copy()
    for zygosity in ('hom', 'het')
)

original_data_list = [original_data1, original_data2]

In [32]:
# Column holding the DMSO control z-scores; every other z-score column is expressed relative to it
control_col = 'z_score:DMSO:2:%:YPD'

original_data_list2 = []
# Iterate with a dedicated name: reusing `original_data` as the loop variable would
# silently replace the raw table loaded above with a per-zygosity intermediate.
for zygosity_data in original_data_list:
    # Work on a copy so the frames held in original_data_list are not modified by this cell
    zygosity_data = zygosity_data.copy()
    # Strain labels look like 'YDR064W::chr4_4'; keep the part before the first ':'
    zygosity_data['orf'] = zygosity_data['strain'].str.split(':').str[0]
    # Eliminate all white spaces & capitalize
    zygosity_data['orf'] = clean_orf(zygosity_data['orf'])
    # Translate to ORFs
    zygosity_data['orf'] = translate_sc(zygosity_data['orf'].values, to='orf')
    # Make sure everything translated ok: print the rows flagged as not looking like an ORF
    t = looks_like_orf(zygosity_data['orf'])
    print(zygosity_data.loc[~t,])
    zygosity_data = zygosity_data.set_index('orf')
    # Keep only the z-score columns
    data_cols = [c for c in zygosity_data.columns.values if 'z_score' in c]
    zygosity_data = zygosity_data[data_cols]
    # Average the rows that share the same ORF
    zygosity_data = zygosity_data.groupby(zygosity_data.index).mean()
    # Subtract the control from every column, then drop the (now all-zero) control column
    zygosity_data = zygosity_data.sub(zygosity_data[control_col], axis=0)
    zygosity_data = zygosity_data.drop(columns=[control_col])

    original_data_list2.append(zygosity_data)

Empty DataFrame
Columns: [strain, Gene, z_score:DMSO:2:%:YPD, ratio:DMSO:2:%:YPD, z_score:CsA:10uM:YPD, ratio:CsA10:uM:YPD, z_score:Micafungin:0.045ug/ml:YPD, ratio:Micafungin:0.045ug/ml:YPD, z_score:Micafungin:0.045ug/ml:CsA:10uM:YPD, ratio:Micafungin:0.045ug/ml:CsA:10uM:YPD, Description, feature_qualifier, GO_process, GO_function, GO_component, essential_gene, zygosity, orf]
Index: []
Empty DataFrame
Columns: [strain, Gene, z_score:DMSO:2:%:YPD, ratio:DMSO:2:%:YPD, z_score:CsA:10uM:YPD, ratio:CsA10:uM:YPD, z_score:Micafungin:0.045ug/ml:YPD, ratio:Micafungin:0.045ug/ml:YPD, z_score:Micafungin:0.045ug/ml:CsA:10uM:YPD, ratio:Micafungin:0.045ug/ml:CsA:10uM:YPD, Description, feature_qualifier, GO_process, GO_function, GO_component, essential_gene, zygosity, orf]
Index: []


In [33]:
# Place the per-zygosity tables side by side (in list order), matching rows on the ORF index;
# an ORF present in only one table gets NaN in the other table's columns.
original_data = pd.concat(original_data_list2, axis='columns', join='outer')

In [36]:
# Dimensions (rows, columns) of the combined table
original_data.shape

(5854, 6)

In [38]:
# Label the row index so that it becomes a column named 'orf' when the index is reset later
original_data.index.names = ['orf']

# Prepare the final dataset

In [39]:
# Continue on an independent (deep) copy so the processed table above is left as is
data = original_data.copy(deep=True)

In [40]:
# Dataset IDs in the column order of `data`; they are assigned to the columns by position.
# Each zygosity block holds the CsA, Micafungin and Micafungin + CsA z-score columns, in that order.
# NOTE(review): the ID-to-condition mapping itself cannot be checked from this notebook; confirm
# it against the datasets list.
dataset_ids = [
    5346, 5348, 5350,  # columns from the 'hom' table
    5343, 5347, 5349,  # columns from the 'het' table
]
# Reorder the datasets table to match (an ID missing from the table yields a NaN row)
datasets = datasets.reindex(dataset_ids)

In [41]:
# Relabel the columns (by position) as (dataset_id, 'value') pairs
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'value') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data.columns = idx

In [42]:
# Preview the relabeled table
data.head()

dataset_id,5346,5348,5350,5343,5347,5349
data_type,value,value,value,value,value,value
orf,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
YAL002W,0.341656,0.569368,0.533673,,,
YAL004W,0.134467,0.526533,-0.141512,,,
YAL005C,-1.091689,-1.428365,-1.611746,,,
YAL007C,2.296222,2.180659,1.277347,,,
YAL008W,0.427628,-0.436984,0.056136,,,


## Subset to the genes currently in SGD

In [43]:
# Gene table keyed by systematic name, with the original 'id' kept as a regular column
genes = (
    pd.read_csv(path_to_genes, sep='\t', index_col='id')
    .reset_index()
    .set_index('systematic_name')
)
# Look up the gene ID of every ORF in the data; ORFs absent from the gene table come back as NaN
gene_ids = genes['id'].reindex(data.index.values).values
num_missing = np.isnan(gene_ids).sum()
print('ORFs missing from SGD: %d' % num_missing)

ORFs missing from SGD: 24


In [44]:
# Attach the gene IDs looked up above (NaN where the ORF was not found in the gene table)
data['gene_id'] = gene_ids
# Keep only the rows that have a gene ID
data = data.loc[data['gene_id'].notnull()]
# With the NaNs gone, the IDs can be stored as integers
data['gene_id'] = data['gene_id'].astype(int)
# Index the rows by (gene_id, orf)
data = data.reset_index().set_index(['gene_id','orf'])

data.head()

Unnamed: 0_level_0,dataset_id,5346,5348,5350,5343,5347,5349
Unnamed: 0_level_1,data_type,value,value,value,value,value,value
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2,YAL002W,0.341656,0.569368,0.533673,,,
1863,YAL004W,0.134467,0.526533,-0.141512,,,
4,YAL005C,-1.091689,-1.428365,-1.611746,,,
5,YAL007C,2.296222,2.180659,1.277347,,,
6,YAL008W,0.427628,-0.436984,0.056136,,,


# Normalize

In [45]:
# normalize_phenotypic_scores is defined outside this notebook (presumably in yp_utils.py,
# loaded via %run at the top; confirm). Its output is stored as 'valuez' in the cells below.
data_norm = normalize_phenotypic_scores(data, has_tested=True)

In [46]:
# Label the normalized columns (by position) as (dataset_id, 'valuez') pairs
idx = pd.MultiIndex.from_tuples(
    [(dataset_id, 'valuez') for dataset_id in datasets.index.values],
    names=['dataset_id', 'data_type'],
)
data_norm.columns = idx

In [47]:
# Make sure every value missing from the raw data is also missing from the normalized data.
# A boolean DataFrame used as a key is aligned on row/column labels before the assignment,
# and data's (dataset_id, 'value') columns never match data_norm's (dataset_id, 'valuez')
# columns, so the mask has to be relabeled (by position) with data_norm's columns first;
# otherwise nothing is selected and the assignment silently does nothing.
is_missing = data.isnull()
is_missing.columns = data_norm.columns
data_norm[is_missing] = np.nan
# Combine raw ('value') and normalized ('valuez') columns, matching rows on the index
data_all = data.join(data_norm)

data_all.head()

Unnamed: 0_level_0,dataset_id,5346,5348,5350,5343,5347,5349,5346,5348,5350,5343,5347,5349
Unnamed: 0_level_1,data_type,value,value,value,value,value,value,valuez,valuez,valuez,valuez,valuez,valuez
gene_id,orf,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2,YAL002W,0.341656,0.569368,0.533673,,,,0.459459,0.520268,0.541617,,,
1863,YAL004W,0.134467,0.526533,-0.141512,,,,0.214484,0.492815,0.024688,,,
4,YAL005C,-1.091689,-1.428365,-1.611746,,,,-1.235288,-0.760073,-1.10094,,,
5,YAL007C,2.296222,2.180659,1.277347,,,,2.770482,1.552939,1.110981,,,
6,YAL008W,0.427628,-0.436984,0.056136,,,,0.56111,-0.1247,0.176009,,,


# Print out

In [48]:
# Write one tab-separated file per data type, named <paper_name>_<data_type>.txt
for data_type in ('value', 'valuez'):
    # Select the columns of this data type (the 'data_type' column level is dropped by xs)
    table = data_all.xs(data_type, level='data_type', axis=1).copy()
    # Use the dataset names as column headers (assigned by position)
    table.columns = datasets['name'].values
    # Rows are written keyed by ORF only
    table.droplevel('gene_id', axis=0).to_csv(paper_name + '_' + data_type + '.txt', sep='\t')

# Save to DB

In [49]:
from IO.save_data_to_db3 import *

In [50]:
# Write the combined table to the database. Per the recorded output below, this first deletes
# all existing datasets for this PMID, then inserts the new data and updates data_modified_on.
save_data_to_db(data_all, paper_pmid)

  0%|          | 0/6 [00:00<?, ?it/s]

Deleting all datasets for PMID 22751784...
Inserting the new data...


100%|██████████| 6/6 [00:45<00:00,  7.65s/it]

Updating the data_modified_on field...



