In [2]:
import pandas as pd

## BMIQ Normalization

Here we run [BMIQ normalization](http://www.ncbi.nlm.nih.gov/pubmed/23175756) on all of our quantile-normalized data together, using the implementation that Steve Horvath provided along with his recent methylation-age paper. We do this following [this paper's](http://www.ncbi.nlm.nih.gov/pubmed/23422812) recommendation to run both quantile normalization and BMIQ in series on the same datasets.

In [1]:
import pandas as pd

import rpy2.robjects as robjects
from pandas.rpy.common import convert_to_r_dataframe
from pandas.rpy.common import convert_robj

from IPython.display import clear_output

Load Horvath normalization source into R namespace.

Read in cell composition adjusted, quantile-normalized beta values and cell counts from the MINFI pipeline.

In [2]:
# Column labels u'3001' through u'3010'; passed as `columns=` when the beta
# values are read from the HDF5 store.
c = list(map(u'{0}'.format, range(3001, 3011)))

In [3]:
# Read the 'betas_adj' table from the normalized-methylation store, restricted
# to the columns listed in `c`.
betas = pd.read_hdf(
    '/data_ssd/methylation_norm.h5',
    key='betas_adj',
    columns=c,
)

In [4]:
# Read the 'Hannum_gold_standard' object from the same HDF5 store; it is used
# below both as the row order for `betas` and as the BMIQ reference vector.
gold_standard_ah = pd.read_hdf(
    '/data_ssd/methylation_norm.h5',
    key='Hannum_gold_standard',
)

In [5]:
# Put the rows of `betas` in the order of the gold-standard index.
# `.reindex` replaces the deprecated `.ix` indexer (removed in pandas 1.0) and
# keeps the behaviour this step relies on: a label present in the gold standard
# but absent from `betas` yields an all-NaN row instead of raising, so the
# missing-value fill that follows can patch it.
betas = betas.reindex(gold_standard_ah.index)

In [6]:
# Patch any missing beta values with the gold standard. The frame is transposed
# so that fillna lines the gold-standard labels up against columns, then
# transposed back to the original orientation.
if betas.isnull().values.any():
    betas = betas.T.fillna(gold_standard_ah).T

In [7]:
# Attach the WGCNA package in the embedded R session.
robjects.r.library('WGCNA');
# Source the Horvath normalization script into R. Presumably this is what
# defines BMIQcalibration, which is called later through robjects.r -- confirm.
robjects.r.source("/cellar/users/agross/Data/MethylationAge/Horvath/NORMALIZATION.R");
# Wipe whatever the two R calls above printed into the cell output.
clear_output()

In [8]:
# Hand the beta values to R as a data frame and transpose them on the R side.
df_r = robjects.r.t(convert_to_r_dataframe(betas))
# Gold-standard values arranged in the same row order as `betas`.
# `.reindex` replaces the deprecated `.ix` indexer (removed in pandas 1.0)
# while keeping the same label-based lookup.
gs = list(gold_standard_ah.reindex(betas.index))
# R numeric vector holding those reference values for the BMIQ call.
gs_r = robjects.FloatVector(gs)

In [9]:
# Drop the pandas frame; its contents were already passed to R as df_r.
# Presumably done to release memory before the BMIQ step -- confirm.
del betas

In [10]:
# Run the R-side BMIQcalibration on the transposed betas against the
# gold-standard vector, convert the R result back to pandas, and transpose it.
data_n = convert_robj(robjects.r.BMIQcalibration(df_r, gs_r)).T
# Discard the output printed while the R call was running.
clear_output()

In [11]:
def _restore_column_label(label):
    """Turn every '.' into '-' and then drop a single leading 'X', if any."""
    dashed = label.replace('.', '-')
    if dashed.startswith('X'):
        return dashed[1:]
    return dashed


# Clean up the column labels that came back from the R round trip.
data_n.columns = data_n.columns.map(_restore_column_label)

In [14]:
# Open an HDFStore handle on the separate "_tmp" HDF5 file.
# NOTE(review): no store.close() appears in the visible cells, and this cell's
# execution count (14) is out of sequence -- confirm the handle is closed and
# that the notebook survives Restart & Run All.
store = pd.HDFStore('/data_ssd/methylation_norm_tmp.h5')

In [13]:
ls

Unnamed: 0,3001,3002,3003,3004,3005,3006,3007,3008,3009,3010
cg00000029,0.592240,0.510267,0.685360,0.359762,0.453208,0.378323,0.777999,0.540455,0.375063,0.437074
cg00000108,0.752802,0.911826,0.907814,0.915458,0.910821,0.708800,0.906399,0.778826,0.718821,0.797875
cg00000109,0.757381,0.868124,0.807706,0.883148,0.904858,0.713249,0.742634,0.770461,0.706041,0.797936
cg00000165,0.594512,0.338608,0.162821,0.183733,0.531514,0.172554,0.176201,0.228595,0.123471,0.436144
cg00000236,0.634071,0.750260,0.749771,0.750806,0.801952,0.585840,0.764816,0.644744,0.609347,0.664135
cg00000289,0.639025,0.637054,0.578818,0.538156,0.638393,0.498380,0.649539,0.662585,0.639450,0.530409
cg00000292,0.734110,0.795642,0.869419,0.871464,0.668047,0.496161,0.867685,0.756308,0.705505,0.705127
cg00000321,0.200406,0.339578,0.287694,0.406807,0.578005,0.477117,0.528716,0.354002,0.139486,0.411181
cg00000363,0.173539,0.306196,0.279568,0.452325,0.594190,0.405638,0.183061,0.258960,0.144735,0.413206
cg00000622,0.070141,0.074719,0.092488,0.080013,0.079370,0.081252,0.076388,0.090108,0.088746,0.064896


In [13]:
#store = pd.HDFStore('/data_ssd/methylation_norm.h5')
#store.append('quant_BMIQ_adj', data_n)
#store.create_table_index('quant_BMIQ_adj', optlevel=9, kind='full')