# Load Data

In [2]:
using DataFrames

# To run this notebook, you need to have a data file available.
# You can either run the phenotype preprocessing scripts in ../preprocessing directory
# Or scp -r username@sherlock.stanford.edu:/scratch/PI/dpwall/DATA/phenotypes/jsonschema ../data

# Data
df = readtable("../data/all_samples_both_instruments_filtered.csv", nastrings=["None"])
samples = df[:, 1:1]
df = df[:, 2:end] # Remove identifier

# Binarize Data
[df[df[nm].> 0, nm] = 1 for nm in names(df)]
[df[df[nm].== 0, nm] = -1 for nm in names(df)]
[df[isna(df[nm]), nm] = 0 for nm in names(df)]

m, n = size(df)


(9100,123)

In [3]:
# Form sparse array
all_data = sparse(Array(df))
dropzeros!(all_data)
p = size(nonzeros(all_data), 1)

842629

In [13]:
# Split out testing data
all_indices = collect(1:p)
shuffle!(all_indices)
break1, break2 = ceil(Integer, 0.85 * p), ceil(Integer, 0.9 * p)
train_indices = view(all_indices, 1:(break1-1))
test_indices = view(all_indices, break1:(break2-1))
held_out_test_indices = view(all_indices, break2:p)

train_data = copy(all_data)
nonzeros(train_data)[union(test_indices, held_out_test_indices)] = 0
dropzeros!(train_data)

test_data = copy(all_data)
nonzeros(test_data)[union(train_indices, held_out_test_indices)] = 0
dropzeros!(test_data)

heldout_data = copy(all_data)
nonzeros(heldout_data)[union(train_indices, test_indices)] = 0
dropzeros!(heldout_data)

println(size(held_out_test_indices), " ", size(nonzeros(heldout_data)))
println(size(test_indices), " ", size(nonzeros(test_data)))
println(size(train_indices), " ", size(nonzeros(train_data)))

(84263,) (84263,)
(42132,) (42132,)
(716234,) (716234,)


In [23]:
adir_indices = 1:77
ados_indices = 78:n

78:123

# Both instruments

In [15]:
using LowRankModels

Xs = []
Ys = []
for k=1:20
    losses = LogisticLoss()
    rx = ZeroReg()
    ry = ZeroReg()
    glrm = GLRM(train_data, losses, rx, ry, k, offset=false, scale=false);
    init_svd!(glrm);

    X,Y,ch = fit!(glrm, verbose=true, max_iter=1000); # fit GLRM
    push!(Xs, X)
    push!(Ys, Y)
end


LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 374636.7819668768
Iteration 20: objective value = 342629.6754019235
Iteration 30: objective value = 340613.38996484136
LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 354266.4125830003
Iteration 20: objective value = 321074.6808168964
Iteration 30: objective value = 309742.3266525956
obj went up to 308583.35574236995; reducing step size to 3.3354590280225285
obj went up to 307271.09102265345; reducing step size to 2.4515623855965587
Iteration 40: objective value = 305349.9358514492
obj went up to 305389.43825500546; reducing step size to 1.9865929346383517
Iteration 50: objective value = 303871.00480225484
obj went up to 303984.1646299981; reducing step size to 1.9567350305828821
obj went up to 303233.5563095833; reducing step size to 1.8355484590396431
Iteration 60: objective value = 303099.6101034255
LowRankModels.SparseProxGrad

Iteration 70: objective value = 201913.9602505535
obj went up to 201361.63615353667; reducing step size to 1.8698269739245885
Iteration 80: objective value = 200879.62528041724
obj went up to 200570.31603651828; reducing step size to 1.8417240277123421
Iteration 90: objective value = 200005.19198634737
obj went up to 199827.74340899507; reducing step size to 1.814043460467146
Iteration 100: objective value = 199213.37282498216
LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 226357.57132535146
Iteration 20: objective value = 208278.68429058726
Iteration 30: objective value = 200930.00404781214
obj went up to 198408.83562346164; reducing step size to 3.8612107573145806
obj went up to 198117.0266390342; reducing step size to 2.8379899066262166
Iteration 40: objective value = 197621.28007936713
obj went up to 197086.74842475436; reducing step size to 2.190218710438783
Iteration 50: objective value = 195576.01438077158
obj went up to 19

obj went up to 137994.6414370498; reducing step size to 1.9932766814128593
Iteration 70: objective value = 137929.15021876345
obj went up to 136607.50549872624; reducing step size to 1.9633183226208175
Iteration 80: objective value = 136093.9150517302
LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 186478.06121149607
Iteration 20: objective value = 160309.4396386991
Iteration 30: objective value = 145830.78406058665
Iteration 40: objective value = 135739.45602139938
obj went up to 140711.27020663393; reducing step size to 4.927992098487262
obj went up to 136654.24840504717; reducing step size to 3.2853280656581743
obj went up to 134877.1160970825; reducing step size to 2.414716128258758
Iteration 50: objective value = 132400.08841132492
obj went up to 132345.45328149764; reducing step size to 2.1573003712176275
Iteration 60: objective value = 130035.30505031835
obj went up to 130221.50234373136; reducing step size to 2.124876784895

## Evaluate

In [45]:
error = []
adir_error = []
ados_error = []
for l=1:20
    println(l)
    approx = Xs[l].'*Ys[l]
    approx[approx.>=0] = 1
    approx[approx.<0] = -1
    approx = trunc(Int, approx)

    adir_train_confusion = zeros(Int, 3, 3)
    adir_test_confusion = zeros(Int, 3, 3)
    ados_train_confusion = zeros(Int, 3, 3)
    ados_test_confusion = zeros(Int, 3, 3)

    for i=1:m
        for j=adir_indices
            if train_data[i, j] != 0
                adir_train_confusion[train_data[i, j]+2, approx[i, j]+2] += 1
            end
            if test_data[i, j] != 0
                adir_test_confusion[test_data[i, j]+2, approx[i, j]+2] += 1
            end
        end
        for j=ados_indices
            if train_data[i, j] != 0
                ados_train_confusion[train_data[i, j]+2, approx[i, j]+2] += 1
            end
            if test_data[i, j] != 0
                ados_test_confusion[test_data[i, j]+2, approx[i, j]+2] += 1
            end
        end
    end
    push!(error, (adir_test_confusion[1, 3]+adir_test_confusion[3, 1]+ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/(sum(adir_test_confusion)+sum(ados_test_confusion)))
    println("Error ", (adir_train_confusion[1, 3]+adir_train_confusion[3, 1]+ados_train_confusion[1, 3]+ados_train_confusion[3, 1])/(sum(adir_train_confusion)+sum(ados_train_confusion)), " ", 
                    (adir_test_confusion[1, 3]+adir_test_confusion[3, 1]+ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/(sum(adir_test_confusion)+sum(ados_test_confusion)))

    push!(adir_error, (adir_test_confusion[1, 3]+adir_test_confusion[3, 1])/sum(adir_test_confusion))
    println("ADIR ", (adir_train_confusion[1, 3]+adir_train_confusion[3, 1])/sum(adir_train_confusion), " ", 
                    (adir_test_confusion[1, 3]+adir_test_confusion[3, 1])/sum(adir_test_confusion))

    push!(ados_error, (ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/sum(ados_test_confusion))
    println("ADOS ", (ados_train_confusion[1, 3]+ados_train_confusion[3, 1])/sum(ados_train_confusion), " ", 
                    (ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/sum(ados_test_confusion))
    println(DataFrame(adir_train_confusion))
    println(DataFrame(adir_test_confusion))
    println(DataFrame(ados_train_confusion))
    println(DataFrame(ados_test_confusion))
end

1
Error 0.23972193445158985 0.23988892053546
ADIR 0.2340545533152537 0.2352291736366678
ADOS 0.25362870948142 0.2512661329848064
3×3 DataFrames.DataFrame
│ Row │ x1     │ x2 │ x3     │
├─────┼────────┼────┼────────┤
│ 1   │ 158022 │ 0  │ 76533  │
│ 2   │ 0      │ 0  │ 0      │
│ 3   │ 42568  │ 0  │ 231737 │
3×3 DataFrames.DataFrame
│ Row │ x1   │ x2 │ x3    │
├─────┼──────┼────┼───────┤
│ 1   │ 9310 │ 0  │ 4534  │
│ 2   │ 0    │ 0  │ 0     │
│ 3   │ 2497 │ 0  │ 13549 │
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3     │
├─────┼───────┼────┼────────┤
│ 1   │ 44321 │ 0  │ 33888  │
│ 2   │ 0     │ 0  │ 0      │
│ 3   │ 18708 │ 0  │ 110457 │
3×3 DataFrames.DataFrame
│ Row │ x1   │ x2 │ x3   │
├─────┼──────┼────┼──────┤
│ 1   │ 2689 │ 0  │ 1949 │
│ 2   │ 0    │ 0  │ 0    │
│ 3   │ 1127 │ 0  │ 6477 │
2
Error 0.70544961562841194 0.21261748789518656
ADIR 0.20267067562787408 0.210270993643359
ADOS 0.21226865470116793 0.2183466753798399
3×3 DataFrames.DataFrame
│ Row │ x1     │ x2 │ x3     │


Error 0.12116291603023593 0.20153327636950535
ADIR 0.11921746649373109 0.1946135831381733
ADOS 0.1259367133777619 0.21842836137885965
3×3 DataFrames.DataFrame
│ Row │ x1     │ x2 │ x3     │
├─────┼────────┼────┼────────┤
│ 1   │ 201523 │ 0  │ 33032  │
│ 2   │ 0      │ 0  │ 0      │
│ 3   │ 27633  │ 0  │ 246672 │
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3    │
├─────┼───────┼────┼───────┤
│ 1   │ 10811 │ 0  │ 3033  │
│ 2   │ 0     │ 0  │ 0     │
│ 3   │ 2784  │ 0  │ 13262 │
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3     │
├─────┼───────┼────┼────────┤
│ 1   │ 62650 │ 0  │ 15559  │
│ 2   │ 0     │ 0  │ 0      │
│ 3   │ 10557 │ 0  │ 118608 │
3×3 DataFrames.DataFrame
│ Row │ x1   │ x2 │ x3   │
├─────┼──────┼────┼──────┤
│ 1   │ 3163 │ 0  │ 1475 │
│ 2   │ 0    │ 0  │ 0    │
│ 3   │ 1199 │ 0  │ 6405 │
12
Error 0.14550866616217605 0.20352701034842874
ADIR 0.11218802814133554 0.19782535965205755
ADOS 0.12020311128685371 0.21744812939062244
3×3 DataFrames.DataFrame
│ Row │ x1     │ 

In [43]:
using Plots
plotly() # Choose the Plotly.jl backend for web interactivity
labels = Array{String}(1, 3)
labels[1] = "All"
labels[2] = "ADI-R"
labels[3] = "ADOS"
plot([error, adir_error, ados_error],linewidth=2,title="Error", label=labels)


## Output

In [None]:
imputed = copy(approx)
imputed[imputed.<0] = 0
new_df = convert(DataFrame, imputed)
names!(new_df, names(df))
new_df = hcat(samples, new_df)

writecsv("../data/impute_logloss_X$(k).csv", X)
writecsv("../data/impute_logloss_Y$(k).csv", Y)
writetable("../data/impute_logloss_Z$(k).csv", new_df, separator = ',', header = true)

# ADIR One Instrument

In [46]:
using LowRankModels

adir_Xs = []
adir_Ys = []
for k=1:10
    losses = LogisticLoss()
    rx = ZeroReg()
    ry = ZeroReg()
    glrm = GLRM(train_data[:, adir_indices], losses, rx, ry, k, offset=false, scale=false);
    init_svd!(glrm);

    X,Y,ch = fit!(glrm, verbose=true, max_iter=1000); # fit GLRM
    push!(adir_Xs, X)
    push!(adir_Ys, Y)
end

LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 257603.34785319492
Iteration 20: objective value = 236794.94583655836
Iteration 30: objective value = 234416.25892768742
obj went up to 234725.79177683478; reducing step size to 3.0253596626054677
obj went up to 234486.8265468561; reducing step size to 2.0169064417369786
obj went up to 234356.9410162527; reducing step size to 1.4824262346766792
Iteration 40: objective value = 234264.68653724267
LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 223764.67114214314
Iteration 20: objective value = 208227.70658504675
Iteration 30: objective value = 204394.76743465156
obj went up to 204151.6498364244; reducing step size to 3.1766276457357416
obj went up to 203921.77410372847; reducing step size to 2.3348213196157706
Iteration 40: objective value = 203638.7105621416
obj went up to 203730.96936101894; reducing step size to 1.891993271084144

LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 148927.76473845137
Iteration 20: objective value = 134948.53840305237
Iteration 30: objective value = 129030.03161154121
obj went up to 128630.99287252646; reducing step size to 3.502231979423655
obj went up to 128126.9755057542; reducing step size to 2.4515623855965587
Iteration 40: objective value = 126907.60243999437
obj went up to 127108.64174513775; reducing step size to 1.8919932710841445
obj went up to 125622.50455336345; reducing step size to 1.6903012897811307
Iteration 50: objective value = 125137.68074760669
obj went up to 124546.53740448641; reducing step size to 1.6648965614872042
Iteration 60: objective value = 124074.26582686418
obj went up to 123706.0222029647; reducing step size to 1.6398736587433085
Iteration 70: objective value = 123170.98138032912
obj went up to 122966.74496322681; reducing step size to 1.615226842824393
Iteration 80: objective value = 122391.74941

In [49]:
error = []
for l=1:10
    println(l)
    approx = adir_Xs[l].'*adir_Ys[l]
    approx[approx.>=0] = 1
    approx[approx.<0] = -1
    approx = trunc(Int, approx)

    adir_train_confusion = zeros(Int, 3, 3)
    adir_test_confusion = zeros(Int, 3, 3)

    for i=1:m
        for j=adir_indices
            if train_data[i, j] != 0
                adir_train_confusion[train_data[i, j]+2, approx[i, j]+2] += 1
            end
            if test_data[i, j] != 0
                adir_test_confusion[test_data[i, j]+2, approx[i, j]+2] += 1
            end
        end
    end
    push!(error, (adir_test_confusion[1, 3]+adir_test_confusion[3, 1])/(sum(adir_test_confusion)))
    println("Error ", (adir_train_confusion[1, 3]+adir_train_confusion[3, 1])/(sum(adir_train_confusion)), " ", 
                    (adir_test_confusion[1, 3]+adir_test_confusion[3, 1])/(sum(adir_test_confusion)))

    println(DataFrame(adir_train_confusion))
    println(DataFrame(adir_test_confusion))
end

1
Error 0.23481507683842315 0.23566410170625626
3×3 DataFrames.DataFrame
│ Row │ x1     │ x2 │ x3     │
├─────┼────────┼────┼────────┤
│ 1   │ 151489 │ 0  │ 83066  │
│ 2   │ 0      │ 0  │ 0      │
│ 3   │ 36422  │ 0  │ 237883 │
3×3 DataFrames.DataFrame
│ Row │ x1   │ x2 │ x3    │
├─────┼──────┼────┼───────┤
│ 1   │ 8922 │ 0  │ 4922  │
│ 2   │ 0    │ 0  │ 0     │
│ 3   │ 2122 │ 0  │ 13924 │
2
Error 0.54848009275635736 0.2081967213114754
3×3 DataFrames.DataFrame
│ Row │ x1     │ x2 │ x3     │
├─────┼────────┼────┼────────┤
│ 1   │ 178869 │ 0  │ 55686  │
│ 2   │ 0      │ 0  │ 0      │
│ 3   │ 43786  │ 0  │ 230519 │
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3    │
├─────┼───────┼────┼───────┤
│ 1   │ 10324 │ 0  │ 3520  │
│ 2   │ 0     │ 0  │ 0     │
│ 3   │ 2703  │ 0  │ 13343 │
3
Error 0.17578312305938765 0.2006022080963533
3×3 DataFrames.DataFrame
│ Row │ x1     │ x2 │ x3     │
├─────┼────────┼────┼────────┤
│ 1   │ 184789 │ 0  │ 49766  │
│ 2   │ 0      │ 0  │ 0      │
│ 3   │ 39683 

In [50]:
using Plots
plotly() # Choose the Plotly.jl backend for web interactivity
plot(error,linewidth=2,title="Error", label="ADIR")

## ADOS One Instrument

In [51]:
using LowRankModels

ados_Xs = []
ados_Ys = []
for k=1:10
    losses = LogisticLoss()
    rx = ZeroReg()
    ry = ZeroReg()
    glrm = GLRM(train_data[:, ados_indices], losses, rx, ry, k, offset=false, scale=false);
    init_svd!(glrm);

    X,Y,ch = fit!(glrm, verbose=true, max_iter=1000); # fit GLRM
    push!(ados_Xs, X)
    push!(ados_Ys, Y)
end

LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
obj went up to 122569.93184076496; reducing step size to 0.8103375000000002
Iteration 10: objective value = 117489.17351385923
obj went up to 116443.79874913912; reducing step size to 0.7981583671209468
Iteration 20: objective value = 112784.14837983425
obj went up to 111955.74645020775; reducing step size to 0.8254703971560425
Iteration 30: objective value = 108854.5439168308
obj went up to 108526.6426251403; reducing step size to 0.8537170123754403
Iteration 40: objective value = 105524.53321496185
obj went up to 105426.52169833986; reducing step size to 0.8408858982596193
Iteration 50: objective value = 102668.79578254173
obj went up to 102662.03708656006; reducing step size to 0.8282476319927539
Iteration 60: objective value = 100528.6830033329
obj went up to 100591.55043253621; reducing step size to 0.81579931512873
obj went up to 99065.62433195238; reducing step size to 0.803538092784221
Iteration 70: objectiv

obj went up to 47864.38034776269; reducing step size to 1.3199543996262657
obj went up to 43417.68080342961; reducing step size to 1.0696085483071491
Iteration 20: objective value = 43280.18663350548
obj went up to 41211.26024256742; reducing step size to 1.0535326482798097
Iteration 30: objective value = 40200.82800011274
obj went up to 39482.003467725735; reducing step size to 0.9412230061439231
Iteration 40: objective value = 38483.41096392867
obj went up to 38305.916389874146; reducing step size to 0.9270767028312307
Iteration 50: objective value = 37425.19701185439
obj went up to 37359.17194401718; reducing step size to 0.8696600135923918
Iteration 60: objective value = 36627.93750509759
obj went up to 36633.22811335314; reducing step size to 0.8565892808851666
obj went up to 36043.38194241301; reducing step size to 0.803538092784221
Iteration 70: objective value = 36041.54210769016
obj went up to 35538.21413325719; reducing step size to 0.83103421063315
Iteration 80: objective va

In [53]:
error = []
for l=1:10
    println(l)
    approx = ados_Xs[l].'*ados_Ys[l]
    approx[approx.>=0] = 1
    approx[approx.<0] = -1
    approx = trunc(Int, approx)

    ados_train_confusion = zeros(Int, 3, 3)
    ados_test_confusion = zeros(Int, 3, 3)

    for i=1:m
        for j=ados_indices
            if train_data[i, j] != 0
                ados_train_confusion[train_data[i, j]+2, approx[i, j-77]+2] += 1
            end
            if test_data[i, j] != 0
                ados_test_confusion[test_data[i, j]+2, approx[i, j-77]+2] += 1
            end
        end
    end
    push!(error, (ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/(sum(ados_test_confusion)))
    println("Error ", (ados_train_confusion[1, 3]+ados_train_confusion[3, 1])/(sum(ados_train_confusion)), " ", 
                    (ados_test_confusion[1, 3]+ados_test_confusion[3, 1])/(sum(ados_test_confusion)))

    println(DataFrame(ados_train_confusion))
    println(DataFrame(ados_test_confusion))
end

1
Error 0.25868720283159896 0.26760333278876003
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3     │
├─────┼───────┼────┼────────┤
│ 1   │ 36624 │ 0  │ 41585  │
│ 2   │ 0     │ 0  │ 0      │
│ 3   │ 12060 │ 0  │ 117105 │
3×3 DataFrames.DataFrame
│ Row │ x1   │ x2 │ x3   │
├─────┼──────┼────┼──────┤
│ 1   │ 2115 │ 0  │ 2523 │
│ 2   │ 0    │ 0  │ 0    │
│ 3   │ 753  │ 0  │ 6851 │
2
Error 0.1842998640138108 0.21328214344061427
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3     │
├─────┼───────┼────┼────────┤
│ 1   │ 55772 │ 0  │ 22437  │
│ 2   │ 0     │ 0  │ 0      │
│ 3   │ 15782 │ 0  │ 113383 │
3×3 DataFrames.DataFrame
│ Row │ x1   │ x2 │ x3   │
├─────┼──────┼────┼──────┤
│ 1   │ 3156 │ 0  │ 1482 │
│ 2   │ 0    │ 0  │ 0    │
│ 3   │ 1129 │ 0  │ 6475 │
3
Error 0.82843657353380847 0.22357457931710506
3×3 DataFrames.DataFrame
│ Row │ x1    │ x2 │ x3     │
├─────┼───────┼────┼────────┤
│ 1   │ 58008 │ 0  │ 20201  │
│ 2   │ 0     │ 0  │ 0      │
│ 3   │ 12862 │ 0  │ 116303 │
3×3 DataFram

In [54]:
using Plots
plotly() # Choose the Plotly.jl backend for web interactivity
plot(error,linewidth=2,title="Error", label="ADIR")