In [1]:
#Implement linear regression in WEE
using DataFrames, GLM, Distributions, Optim

# Fit the weighted estimating equation on one stacked case-control sample and
# return the estimated slopes (the intercept is dropped).
#
# `sample_data` must hold the predictor columns named in `namesx`, a 0/1 column
# `:D` and the response column `:y`. It gains (or has overwritten) an `:estpx`
# column. `formu.lhs` is reassigned while fitting and is left equal to `:y`.
function _wee_fit(formu, sample_data, namesx, pd_pop)
    is_case = sample_data[:D] .== 1
    is_control = sample_data[:D] .== 0
    n1 = sum(is_case)
    n0 = sum(is_control)

    # Design matrix (intercept + predictors) of this sample.
    formu.lhs = :y
    X = ModelMatrix(ModelFrame(formu, sample_data)).m

    # Logistic model for p(D | X); only its slopes are kept, the intercept is
    # re-calibrated below.
    formu.lhs = :D
    gamma = coef(glm(formu, sample_data, Binomial()))

    # Squared gap between the mean fitted probability and the target `pd_pop`,
    # as a function of the intercept alone.
    function PO(gamma0::Float64)
        gamma[1] = gamma0
        (mean(exp(X * gamma)./(1+exp(X * gamma))) - pd_pop)^2
    end

    # NOTE(review): `.minimum` is expected to hold the minimizing intercept (the
    # argmin), as in the Optim release this notebook was written against. Newer
    # Optim releases expose the argmin through `Optim.minimizer(res)` — confirm
    # against the installed version.
    gamma[1] = optimize(PO, -100, 100).minimum
    sample_data[:estpx] = exp(X * gamma)./(1+exp(X * gamma))

    # Split only after the weights are stored, so that the pseudo-observations
    # below carry the weights fitted on THIS sample.
    cases = sample_data[is_case, :]
    controls = sample_data[is_control, :]

    # Outcome regressions fitted separately within cases and within controls.
    formu.lhs = :y
    pyD1 = lm(formu, cases)
    pyD0 = lm(formu, controls)

    py1 = predict(pyD0, cases)     # responses for the pseudo controls
    py0 = predict(pyD1, controls)  # responses for the pseudo cases

    # Pseudo controls: rows of the observed cases relabelled as D = 0.
    data1 = DataFrame(D = repmat([0], n1), y = py1)
    for name in namesx
        data1[name] = cases[name]
    end
    data1[:estpx] = cases[:estpx]

    # Pseudo cases: rows of the observed controls relabelled as D = 1.
    data0 = DataFrame(D = repmat([1], n0), y = py0)
    for name in namesx
        data0[name] = controls[name]
    end
    data0[:estpx] = controls[:estpx]

    alldat = vcat(sample_data, data1, data0)

    # Rows labelled D = 0 are weighted by 1 - p(D = 1 | X).
    alldat[alldat[:D] .== 0, :estpx] = 1 - alldat[alldat[:D] .== 0, :estpx]

    fit = glm(formu, alldat, Normal(), wts = convert(Array, alldat[:,:estpx]))
    return coef(fit)[2:end]
end

"""
    WEE_cts(formula, D, data, pd_pop, boot = 0)

Weighted-estimating-equation linear regression of a continuous response on the
predictors of `formula`, using case-control data.

# Arguments
- `formula`: model formula `response ~ predictors`. It is copied internally and
  is not modified.
- `D`: 0/1 vector with one entry per row of `data` (1 = case, 0 = control).
- `data`: data frame holding the variables named in `formula`.
- `pd_pop`: target value for the mean fitted probability of `D == 1`; the
  intercept of the logistic model for `D` is re-calibrated to match it.
- `boot`: number of bootstrap replicates used for the standard errors.

# Returns
- `boot == 0`: a one-row `DataFrame` with one column of slope estimates per
  predictor.
- otherwise: a `DataFrame` with columns `Variable`, `Estimate`, `StdErr`,
  `Chisq` and `p_value`, where the standard errors come from `boot` replicates
  that resample cases and controls separately, with replacement.

# Throws
- `ArgumentError` if `boot` is negative, or if the design matrix does not have
  exactly one column per predictor variable.
"""
function WEE_cts(formula::DataFrames.Formula, D, data::DataFrames.DataFrame, pd_pop::Float64, boot::Int64 = 0)
    boot >= 0 || throw(ArgumentError("boot must be non-negative, got $boot"))

    # Work on a private copy: the left-hand side is reassigned while fitting and
    # the caller's formula must keep its original response.
    formu = deepcopy(formula)

    mf = ModelFrame(formu, data)
    y = mf.df[formu.lhs]                          # response variable
    namesx = DataFrames.allvars(formu)[1:end-1]   # predictor names (response is last)
    xx = ModelMatrix(mf)

    # Working data set: predictor columns (intercept dropped) plus D and y.
    temp_data = convert(DataFrame, xx.m[:,2:end])
    size(temp_data, 2) == length(namesx) ||
        throw(ArgumentError("WEE_cts needs exactly one design-matrix column per predictor variable"))
    # Give the columns the predictor names so the formula resolves against
    # temp_data whatever the predictors are called.
    names!(temp_data, namesx)
    temp_data[:D] = D
    temp_data[:y] = y

    n1 = sum(D.==1)
    n0 = sum(D.==0)

    # The point estimate.
    cf = _wee_fit(formu, temp_data, namesx, pd_pop)

    # One Float64 column per predictor; rows are (bootstrap) estimates.
    bootcoef = DataFrame()
    for name in namesx
        bootcoef[Symbol(name)] = Float64[]
    end

    if boot == 0
        push!(bootcoef, cf)
        return bootcoef
    end

    sample_cases = temp_data[temp_data[:D] .== 1,:]
    sample_controls = temp_data[temp_data[:D] .== 0,:]

    for iboot in 1:boot
        # Resample within cases and within controls so n1 and n0 stay fixed.
        boot_cases_sample = sample_cases[sample(1:n1, n1, replace = true), :]
        boot_controls_sample = sample_controls[sample(1:n0, n0, replace = true), :]
        bootsample = vcat(boot_cases_sample, boot_controls_sample)

        push!(bootcoef, _wee_fit(formu, bootsample, namesx, pd_pop))
    end

    # Bootstrap variance per coefficient, then a 1-df Wald chi-square test.
    var_num = var(Array(bootcoef), 1)
    chisq = (cf.^2)./squeeze(var_num, 1)
    pvalue = [ccdf(Chisq(1), m) for m in chisq]
    return DataFrame(Variable = namesx, Estimate = cf, StdErr = squeeze(sqrt(var_num), 1), Chisq = chisq, p_value = pvalue)
end




WEE_cts (generic function with 2 methods)

In [3]:
# Test function WEE_cts
using DataFrames, GLM, Distributions, Optim

# Simulated input: two predictors drawn from Binomial(2, 0.3) and a
# standard-normal response, 3000 rows each.
x = DataFrame(x1 = rand(Binomial(2, 0.3), 3000), x2 = rand(Binomial(2, 0.3), 3000))
y1 = rand(Normal(0, 1), 3000)

# Disease indicator: 1000 controls (0) followed by 2000 cases (1).
D1 = vcat(fill(0, 1000), fill(1, 2000))
y1d1 = DataFrame(y1 = y1, D1 = D1)

# Full data set: predictors side by side with the response and the indicator.
data = hcat(x, y1d1)

# Arguments of the call.
formu = y1 ~ x1 + x2
D = D1;
pd = 0.1
pd_pop = 0.1
boot = 10

# Point estimates with bootstrap standard errors from `boot` replicates.
WEE_cts(formu, D, data, pd_pop, boot)

Unnamed: 0,Variable,Estimate,StdErr,Chisq,p_value
1,x1,0.0680992265817286,0.0489149186386236,1.9382138300712568,0.1638626049436055
2,x2,0.0040673214344939,0.0336904746944316,0.0145748042467379,0.903907942653077
