# Load Data

In [209]:
using DataFrames

# To run this notebook, you need to have a data file available.
# You can either run the phenotype preprocessing scripts in ../preprocessing directory
# Or scp -r username@sherlock.stanford.edu:/scratch/PI/dpwall/DATA/phenotypes/jsonschema ../data

# Data
df = readtable("../data/all_samples_both_instruments_filtered.csv", nastrings=["None"])
samples = df[:, 1:1]
df = df[:, 2:end] # Remove identifier

# Binarize Data
[df[df[nm].> 0, nm] = 1 for nm in names(df)]
[df[df[nm].== 0, nm] = -1 for nm in names(df)]
[df[isna(df[nm]), nm] = 0 for nm in names(df)]

m, n = size(df)

(9100,185)

In [210]:
# Form sparse array
all_data = sparse(Array(df))
p = size(nonzeros(all_data), 1)

1344633

In [211]:
# Split out testing data
test_indices = sample(1:m, ceil(Integer, 0.1 * m))
train_indices = setdiff(1:m, test_indices)

println(size(train_indices, 1))
println(size(test_indices, 1))

8242
910


In [212]:
adir_indices = 1:139
ados_indices = 140:n

140:185

# Impute Instruments Individually

In [214]:
using LowRankModels

losses = LogisticLoss()
rx = ZeroReg()
ry = ZeroReg()
adir_glrm = GLRM(all_data[:, adir_indices], losses, rx, ry, 10, offset=false, scale=false);
init_svd!(adir_glrm);

adir_X,adir_Y,adir_ch = fit!(adir_glrm, verbose=true, max_iter=1000); # fit GLRM

losses = LogisticLoss()
rx = ZeroReg()
ry = ZeroReg()
ados_glrm = GLRM(all_data[:, ados_indices], losses, rx, ry, 4, offset=false, scale=false);
init_svd!(ados_glrm);

ados_X,ados_Y,ados_ch = fit!(ados_glrm, verbose=true, max_iter=1000); # fit GLRM


LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 379730.1583595497
Iteration 20: objective value = 352322.0040552299
Iteration 30: objective value = 342362.4692918017
obj went up to 338293.15832549834; reducing step size to 4.256984859939325
obj went up to 336823.84104497655; reducing step size to 2.8379899066262166
Iteration 40: objective value = 336741.7007504101
obj went up to 335617.1268399346; reducing step size to 2.2997296459607224
Iteration 50: objective value = 333783.4520189714
obj went up to 333593.8530512194; reducing step size to 2.157300371217628
Iteration 60: objective value = 331814.4094287561
obj went up to 331781.0571111383; reducing step size to 2.1248767848957675
Iteration 70: objective value = 330185.3483572492
obj went up to 330259.3085184865; reducing step size to 2.092940515483502
obj went up to 328865.568957641; reducing step size to 2.061484238751859
Iteration 80: objective value = 328781.61845377827
obj we

In [217]:
adir_approx = adir_X.'*adir_Y
adir_approx[adir_approx.>0] = 1
adir_approx[adir_approx.<=0] = -1
adir_approx = trunc(Int, adir_approx)

ados_approx = ados_X.'*ados_Y
ados_approx[ados_approx.>0] = 1
ados_approx[ados_approx.<=0] = -1
ados_approx = trunc(Int, ados_approx)

# Replace imputed values with real values if we have them
adir_approx[all_data[:, adir_indices].>0] = 1
adir_approx[all_data[:, adir_indices].<=0] = -1
ados_approx[all_data[:, ados_indices].>0] = 1
ados_approx[all_data[:, ados_indices].<=0] = -1

-1

In [226]:
ados_to_adir_train = pinv(ados_approx[train_indices, :])*adir_approx[train_indices, :]
adir_to_ados_train = pinv(adir_approx[train_indices, :])*ados_approx[train_indices, :]


139×46 Array{Float64,2}:
 -0.0189539   -0.000157585  -0.00181525   …  -0.0225713   -0.0139361 
 -0.0044814   -0.0806329    -0.0195867        0.0623851    0.0488832 
  0.0164016    0.00994412    0.0298212        0.0328272   -0.0204802 
  0.0145034   -0.00441303   -0.0339411       -0.0276456   -0.0238158 
  0.0233978    0.00751977    0.0723778        0.0781523    0.05265   
 -0.0362412   -0.0494817     0.0197502    …  -0.0151447    0.0320931 
  0.0319933    0.0206305     0.028302         0.00692504   0.0588705 
  0.0420519    0.020026     -0.0310016       -0.0120137   -0.078894  
 -0.0828549   -0.038404      0.00774121       0.0136738    0.0667537 
 -0.00714297  -0.0299695     0.0217736       -0.015326     0.054881  
  0.0292375   -0.0703138     0.0282273    …   0.0241207    0.074505  
 -0.0204517    0.0264216    -0.00285534       0.00558389   0.0235107 
  0.0195634    0.00749351    0.0295057        0.0107086    0.0679171 
  ⋮                                       ⋱                ⋮     

# Train Instrument to Instrument

In [272]:
using Plots
plotly() # Choose the Plotly.jl backend for web interactivity


F = svdfact(adir_to_ados_train)
G = svdfact(ados_to_adir_train)
plot([F[:S], G[:S]], linewidth=2,title="Singular values")


In [253]:
using LowRankModels

losses = QuadLoss()
rx = ZeroReg()
ry = ZeroReg()
adir_to_ados_glrm = GLRM(adir_to_ados_train, losses, rx, ry, 10, offset=false, scale=false);
init_svd!(adir_to_ados_glrm);

adir_to_ados_glrm_X,adir_to_ados_glrm_Y,adir_to_ados_glrm_ch = fit!(adir_to_ados_glrm, verbose=true, max_iter=1000); # fit GLRM

losses = QuadLoss()
rx = ZeroReg()
ry = ZeroReg()
ados_to_adir_glrm = GLRM(ados_to_adir_train, losses, rx, ry, 10, offset=false, scale=false);
init_svd!(ados_to_adir_glrm);

ados_to_adir_glrm_X,ados_to_adir_glrm_Y,ados_to_adir_glrm_glrm_ch = fit!(ados_to_adir_glrm, verbose=true, max_iter=1000); # fit GLRM


Fitting GLRM
Iteration 10: objective value = 6.054745155730678
Iteration 20: objective value = 3.028441518776129
Iteration 30: objective value = 1.8226347656334478
Fitting GLRM
Iteration 10: objective value = 4.2362210450769915
Iteration 20: objective value = 2.5242257819489846


## Evaluate

In [262]:
adir_to_ados_train

139×46 Array{Float64,2}:
 -0.0189539   -0.000157585  -0.00181525   …  -0.0225713   -0.0139361 
 -0.0044814   -0.0806329    -0.0195867        0.0623851    0.0488832 
  0.0164016    0.00994412    0.0298212        0.0328272   -0.0204802 
  0.0145034   -0.00441303   -0.0339411       -0.0276456   -0.0238158 
  0.0233978    0.00751977    0.0723778        0.0781523    0.05265   
 -0.0362412   -0.0494817     0.0197502    …  -0.0151447    0.0320931 
  0.0319933    0.0206305     0.028302         0.00692504   0.0588705 
  0.0420519    0.020026     -0.0310016       -0.0120137   -0.078894  
 -0.0828549   -0.038404      0.00774121       0.0136738    0.0667537 
 -0.00714297  -0.0299695     0.0217736       -0.015326     0.054881  
  0.0292375   -0.0703138     0.0282273    …   0.0241207    0.074505  
 -0.0204517    0.0264216    -0.00285534       0.00558389   0.0235107 
  0.0195634    0.00749351    0.0295057        0.0107086    0.0679171 
  ⋮                                       ⋱                ⋮     

In [261]:
ados_to_adir_approx = ados_approx * ados_to_adir_glrm_X.'*ados_to_adir_glrm_Y
ados_to_adir_approx[ados_to_adir_approx.>0] = 1
ados_to_adir_approx[ados_to_adir_approx.<=0] = -1
ados_to_adir_approx = trunc(Int, ados_to_adir_approx)

#adir_to_ados_approx = adir_approx * adir_to_ados_glrm_X.'*adir_to_ados_glrm_Y
#adir_to_ados_approx = adir_approx * F[:U][:, 1:15] * diagm(F[:S][1:15]) * F[:Vt][1:15, :]
adir_to_ados_approx = adir_approx * adir_to_ados_train
adir_to_ados_approx[adir_to_ados_approx.>0] = 1
adir_to_ados_approx[adir_to_ados_approx.<=0] = -1
adir_to_ados_approx = trunc(Int, adir_to_ados_approx)

ados_to_adir_train_confusion = zeros(Int, 3, 3)
ados_to_adir_test_confusion = zeros(Int, 3, 3)
adir_to_ados_train_confusion = zeros(Int, 3, 3)
adir_to_ados_test_confusion = zeros(Int, 3, 3)

for i=train_indices
    for j=adir_indices
        if all_data[i, j] != 0
            ados_to_adir_train_confusion[all_data[i, j]+2, ados_to_adir_approx[i, j]+2] += 1
        end
    end
    for j=ados_indices
        if all_data[i, j] != 0
            adir_to_ados_train_confusion[all_data[i, j]+2, adir_to_ados_approx[i, j-139]+2] += 1
        end
    end
end
for i=test_indices
    for j=adir_indices
        if all_data[i, j] != 0
            ados_to_adir_test_confusion[all_data[i, j]+2, ados_to_adir_approx[i, j]+2] += 1
        end
    end
    for j=ados_indices
        if all_data[i, j] != 0
            adir_to_ados_test_confusion[all_data[i, j]+2, adir_to_ados_approx[i, j-139]+2] += 1
        end
    end
end
println("ADIR ", (ados_to_adir_train_confusion[1, 3]+ados_to_adir_train_confusion[3, 1])/sum(ados_to_adir_train_confusion), " ", 
                (ados_to_adir_test_confusion[1, 3]+ados_to_adir_test_confusion[3, 1])/sum(ados_to_adir_test_confusion))

println("ADOS ", (adir_to_ados_train_confusion[1, 3]+adir_to_ados_train_confusion[3, 1])/sum(adir_to_ados_train_confusion), " ", 
                (adir_to_ados_test_confusion[1, 3]+adir_to_ados_test_confusion[3, 1])/sum(adir_to_ados_test_confusion))


ADIR 0.24191260017193802 0.2387877564502514
ADOS 0.25648614524077823 0.2588792748248867


In [243]:
println(DataFrame(ados_to_adir_train_confusion))
println(DataFrame(ados_to_adir_test_confusion))
println(DataFrame(adir_to_ados_train_confusion))
println(DataFrame(adir_to_ados_test_confusion))

3×3 DataFrames.DataFrame
│ Row │ x1     │ x2 │ x3     │
├─────┼────────┼────┼────────┤
│ 1   │ 320707 │ 0  │ 128503 │
│ 2   │ 0      │ 0  │ 0      │
│ 3   │ 106717 │ 0  │ 442107 │
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3    │
├─────┼───────┼────┼───────┤
│ 1   │ 34328 │ 0  │ 13791 │
│ 2   │ 0     │ 0  │ 0     │
│ 3   │ 11562 │ 0  │ 49307 │
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3     │
├─────┼───────┼────┼────────┤
│ 1   │ 58454 │ 0  │ 25094  │
│ 2   │ 0     │ 0  │ 0      │
│ 3   │ 33039 │ 0  │ 104384 │
3×3 DataFrames.DataFrame
│ Row │ x1   │ x2 │ x3    │
├─────┼──────┼────┼───────┤
│ 1   │ 6252 │ 0  │ 2691  │
│ 2   │ 0    │ 0  │ 0     │
│ 3   │ 3697 │ 0  │ 11630 │
