# Load Data

In [1]:
using DataFrames

# To run this notebook, you need to have a data file available.
# You can either run the phenotype preprocessing scripts in ../preprocessing directory
# Or scp -r username@sherlock.stanford.edu:/scratch/PI/dpwall/DATA/phenotypes/jsonschema ../data

# Data
df = readtable("../data/all_samples_both_instruments_filtered.csv", nastrings=["None"])
samples = df[:, 1:1]
df = df[:, 2:end] # Remove identifier

# Binarize Data
[df[df[nm].> 0, nm] = 1 for nm in names(df)]
[df[df[nm].== 0, nm] = -1 for nm in names(df)]
[df[isna(df[nm]), nm] = 0 for nm in names(df)]

m, n = size(df)




(9100,185)

In [2]:
# Form sparse array
all_data = sparse(Array(df))
p = size(nonzeros(all_data), 1)

1344633

In [6]:
# Split out testing data
all_indices = 1:p
held_out_test_indices = sample(all_indices, ceil(Integer, 0.1 * p))
other_indices = setdiff(all_indices, held_out_test_indices)
test_indices = sample(other_indices, ceil(Integer, 0.05 * p))
train_indices = setdiff(other_indices, test_indices)

train_data = copy(all_data)
nonzeros(train_data)[test_indices] = 0
dropzeros!(train_data)

test_data = copy(all_data)
nonzeros(test_data)[train_indices] = 0
dropzeros!(test_data)

println(size(held_out_test_indices))
println(size(test_indices))
println(size(train_indices))

(134464,)
(67232,)
(1151283,)


In [7]:
adir_indices = 1:139
ados_indices = 140:n

140:185

# Both instruments

## Train

In [None]:
using LowRankModels

for k=16:25
    losses = LogisticLoss()
    rx = ZeroReg()
    ry = ZeroReg()
    glrm = GLRM(train_data, losses, rx, ry, k, offset=false, scale=false);
    init_svd!(glrm);

    X,Y,ch = fit!(glrm, verbose=true, max_iter=1000); # fit GLRM
    push!(Xs, X)
    push!(Ys, Y)
end


LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 405456.1512174418
Iteration 20: objective value = 369285.6046747477
Iteration 30: objective value = 352511.0790024653
Iteration 40: objective value = 340637.8285164097
obj went up to 341357.68237954896; reducing step size to 4.927992098487262
obj went up to 339983.59112894326; reducing step size to 3.449594468941083
obj went up to 338636.6256845631; reducing step size to 2.535451934671696
Iteration 50: objective value = 336797.32739381364
obj went up to 336523.5536409644; reducing step size to 2.1573003712176275
Iteration 60: objective value = 334009.2009904441
obj went up to 333982.16361590364; reducing step size to 2.124876784895767
Iteration 70: objective value = 331909.8877313254
obj went up to 332014.84834226756; reducing step size to 2.092940515483502
obj went up to 330298.16004879447; reducing step size to 2.061484238751859
Iteration 80: objective value = 330091.6795124915
obj 

## Evaluate

In [17]:
for l=1:15
    println(l)
    approx = Xs[l].'*Ys[l]
    approx[approx.>0] = 1
    approx[approx.<0] = -1
    approx = trunc(Int, approx)

    adir_train_confusion = zeros(Int, 3, 3)
    adir_test_confusion = zeros(Int, 3, 3)
    ados_train_confusion = zeros(Int, 3, 3)
    ados_test_confusion = zeros(Int, 3, 3)

    for i=1:m
        for j=adir_indices
            if train_data[i, j] != 0
                adir_train_confusion[train_data[i, j]+2, approx[i, j]+2] += 1
            end
            if test_data[i, j] != 0
                adir_test_confusion[test_data[i, j]+2, approx[i, j]+2] += 1
            end
        end
        for j=ados_indices
            if train_data[i, j] != 0
                ados_train_confusion[train_data[i, j]+2, approx[i, j]+2] += 1
            end
            if test_data[i, j] != 0
                ados_test_confusion[test_data[i, j]+2, approx[i, j]+2] += 1
            end
        end
    end
    println("Error ", (adir_train_confusion[1, 3]+adir_train_confusion[3, 1]+ados_train_confusion[1, 3]+ados_train_confusion[3, 1])/(sum(adir_train_confusion)+sum(ados_train_confusion)), " ", 
                    (adir_test_confusion[1, 3]+adir_test_confusion[3, 1]+ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/(sum(adir_test_confusion)+sum(ados_test_confusion)))

    println("ADIR ", (adir_train_confusion[1, 3]+adir_train_confusion[3, 1])/sum(adir_train_confusion), " ", 
                    (adir_test_confusion[1, 3]+adir_test_confusion[3, 1])/sum(adir_test_confusion))

    println("ADOS ", (ados_train_confusion[1, 3]+ados_train_confusion[3, 1])/sum(ados_train_confusion), " ", 
                    (ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/sum(ados_test_confusion))
end

1
Error 0.24687904044075687 0.2482803206620119
ADIR 0.24531546274425187 0.2469402857630785
ADOS 0.2539418299751587 0.2542911453457032
2
Error 0.76132803805968165 0.21755365916731317
ADIR 0.21301397197965977 0.21609330626237327
ADOS 0.22026376621584323 0.22410417907907057
3
Error 0.1979208194776018 0.2030359451771399
ADIR 0.19499189407494114 0.20001517997760954
ADOS 0.21115097985095224 0.21658580872130961
4
Error 0.13291068231953936 0.19581587794155675
ADIR 0.18555312197459953 0.1915080675256004
ADOS 0.2095940863924924 0.21513887706755186
5
Error 0.1805927517131078 0.1878872510990432
ADIR 0.17588425235491922 0.18320967976572233
ADOS 0.20186137179133315 0.2088688399012682
6
Error 0.4933604116148943 0.18232221360227566
ADIR 0.1672799417977395 0.17671391434697634
ADOS 0.20082631796853437 0.20747865066530485
7
Error 0.16579115640834424 0.1756452029997414
ADIR 0.15958074513405807 0.1694338500850711
ADOS 0.19384401738890422 0.2035066814196953
8
Error 0.81375893294060445 0.16966640806826996
AD

In [None]:
println(DataFrame(adir_train_confusion))
println(DataFrame(adir_test_confusion))
println(DataFrame(ados_train_confusion))
println(DataFrame(ados_test_confusion))

## Output

In [None]:
imputed = copy(approx)
imputed[imputed.<0] = 0
new_df = convert(DataFrame, imputed)
names!(new_df, names(df))
new_df = hcat(samples, new_df)

writecsv("../data/impute_logloss_X$(k).csv", X)
writecsv("../data/impute_logloss_Y$(k).csv", Y)
writetable("../data/impute_logloss_Z$(k).csv", new_df, separator = ',', header = true)

# One Instrument

## Train

In [9]:
using LowRankModels

losses = LogisticLoss()
rx = ZeroReg()
ry = ZeroReg()
adir_k = 10
adir_glrm = GLRM(train_data[:, adir_indices], losses, rx, ry, adir_k, offset=false, scale=false);
init_svd!(adir_glrm);

adir_X,adir_Y,adir_ch = fit!(adir_glrm, verbose=true, max_iter=1000); # fit GLRM

LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 360112.2038498597
Iteration 20: objective value = 334810.3632918745
Iteration 30: objective value = 325426.56920161523
obj went up to 320702.56769596244; reducing step size to 4.469834102936292
Iteration 40: objective value = 319526.5349309807
obj went up to 319736.5653291172; reducing step size to 2.979889401957528
obj went up to 319043.70318705327; reducing step size to 2.299729645960723
Iteration 50: objective value = 317713.9403050886
obj went up to 317619.8104115932; reducing step size to 2.1573003712176284
Iteration 60: objective value = 316177.78639075166
obj went up to 316205.7111956746; reducing step size to 2.0236921760912074
obj went up to 314972.267280699; reducing step size to 1.9932766814128595
Iteration 70: objective value = 314937.87885173946
obj went up to 313921.5257084878; reducing step size to 1.963318322620818
Iteration 80: objective value = 313591.7913482437
obj 

In [10]:
losses = LogisticLoss()
rx = ZeroReg()
ry = ZeroReg()
ados_k = 4
ados_glrm = GLRM(train_data[:, ados_indices], losses, rx, ry, ados_k, offset=false, scale=false);
init_svd!(ados_glrm);

ados_X,ados_Y,ados_ch = fit!(ados_glrm, verbose=true, max_iter=1000); # fit GLRM

LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 82562.5832533424
obj went up to 82787.71324869443; reducing step size to 1.0859297511849613
obj went up to 77789.04929535909; reducing step size to 0.923968079738386
Iteration 20: objective value = 76113.58043898537
obj went up to 75316.501524221; reducing step size to 0.9100811128645369
Iteration 30: objective value = 74344.20534469547
obj went up to 74156.07307360631; reducing step size to 0.8964028629942123
Iteration 40: objective value = 73359.78794341543
obj went up to 73285.52201762024; reducing step size to 0.8408858982596193
Iteration 50: objective value = 72694.17986696522
obj went up to 72704.66986005065; reducing step size to 0.8696600135923916
Iteration 60: objective value = 72237.5952119492
obj went up to 72247.19610319774; reducing step size to 0.81579931512873
Iteration 70: objective value = 71846.71539802798
obj went up to 71875.46187666628; reducing step size to 0.843

## Evaluate

In [11]:
adir_approx = adir_X.'*adir_Y
adir_approx[adir_approx.>0] = 1
adir_approx[adir_approx.<=0] = -1
adir_approx = trunc(Int, adir_approx)

ados_approx = ados_X.'*ados_Y
ados_approx[ados_approx.>0] = 1
ados_approx[ados_approx.<=0] = -1
ados_approx = trunc(Int, ados_approx)

adir_train_confusion = zeros(Int, 3, 3)
adir_test_confusion = zeros(Int, 3, 3)
ados_train_confusion = zeros(Int, 3, 3)
ados_test_confusion = zeros(Int, 3, 3)

for i=1:m
    for j=adir_indices
        if train_data[i, j] != 0
            adir_train_confusion[train_data[i, j]+2, adir_approx[i, j]+2] += 1
        end
        if test_data[i, j] != 0
            adir_test_confusion[test_data[i, j]+2, adir_approx[i, j]+2] += 1
        end
    end
    for j=ados_indices
        if train_data[i, j] != 0
            ados_train_confusion[train_data[i, j]+2, ados_approx[i, j-139]+2] += 1
        end
        if test_data[i, j] != 0
            ados_test_confusion[test_data[i, j]+2, ados_approx[i, j-139]+2] += 1
        end
    end
end
println("ADIR ", (adir_train_confusion[1, 3]+adir_train_confusion[3, 1])/sum(adir_train_confusion), " ", 
                (adir_test_confusion[1, 3]+adir_test_confusion[3, 1])/sum(adir_test_confusion))

println("ADOS ", (ados_train_confusion[1, 3]+ados_train_confusion[3, 1])/sum(ados_train_confusion), " ", 
                (ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/sum(ados_test_confusion))


ADIR 0.1312128717125175 0.14551273536871534
ADOS 0.14272960943969087 0.17232672284166028


In [12]:
println(DataFrame(adir_train_confusion))
println(DataFrame(adir_test_confusion))
println(DataFrame(ados_train_confusion))
println(DataFrame(ados_test_confusion))

3×3 DataFrames.DataFrame
│ Row │ x1     │ x2 │ x3     │
├─────┼────────┼────┼────────┤
│ 1   │ 397163 │ 0  │ 73446  │
│ 2   │ 0      │ 0  │ 0      │
│ 3   │ 63984  │ 0  │ 512789 │
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3    │
├─────┼───────┼────┼───────┤
│ 1   │ 59014 │ 0  │ 12251 │
│ 2   │ 0     │ 0  │ 0     │
│ 3   │ 10755 │ 0  │ 76083 │
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3     │
├─────┼───────┼────┼────────┤
│ 1   │ 67694 │ 0  │ 19723  │
│ 2   │ 0     │ 0  │ 0      │
│ 3   │ 13372 │ 0  │ 131083 │
3×3 DataFrames.DataFrame
│ Row │ x1   │ x2 │ x3    │
├─────┼──────┼────┼───────┤
│ 1   │ 9833 │ 0  │ 3535  │
│ 2   │ 0    │ 0  │ 0     │
│ 3   │ 2539 │ 0  │ 19340 │
