In [2]:
using DataFrames

# To run this notebook, you need to have a data file available.
# You can either run the phenotype preprocessing scripts in ../preprocessing directory
# Or scp -r username@sherlock.stanford.edu:/scratch/PI/dpwall/DATA/phenotypes/jsonschema ../data

# Data
df = readtable("../data/all_samples_both_instruments_filtered.csv", nastrings=["None"])
samples = df[:, 1:1]
df = df[:, 2:end] # Remove identifier

m, n = size(df)

labels = readtable("../data/all_samples_both_instruments_filtered_labels.csv", nastrings=["None"])



Unnamed: 0,identifier,diagnosis,ADIR_diagnosis,ADIR_diagnosis_num_nulls,ADIR_communication,ADIR_social_interaction,ADIR_restricted_repetitive_behavior,ADOS_diagnosis,ADOS_diagnosis_num_nulls,ADOS_communication,ADOS_social_interaction,ADOS_restricted_repetitive_behavior,cpea_diagnosis,cpea_adjusted_diagnosis
1,AU000503,Autism,Autism,0,19,28,3,Autism Spectrum,0,0,5,5,Autism,Autism
2,AU000504,Autism,Autism,0,12,25,5,Autism,1,4,10,8,Autism,Autism
3,AU001203,Autism,Autism,0,20,29,4,Autism,0,5,11,5,Autism,Autism
4,AU001204,Autism,Autism,0,9,10,5,Autism Spectrum,0,1,6,0,Autism,Autism
5,AU001505,Autism,Autism,2,16,25,4,Autism,0,6,16,4,Autism,Autism
6,AU001703,Autism,Control,0,5,13,5,Autism,0,2,3,4,Aspergers,Aspergers
7,AU001704,Autism,Control,3,6,6,5,Autism,0,6,6,0,Control,Suspected Control
8,AU002105,Autism,Control,2,16,20,2,Autism,0,2,15,5,Control,Suspected Control
9,AU002106,Autism,Autism,3,11,26,4,Autism,0,6,14,5,Autism,Autism
10,AU002204,Autism,Autism,1,12,24,3,Autism,1,6,16,6,Autism,Autism


In [158]:
# Pull age and gender
more_data = readtable("../data/all_samples_both_instruments.csv", nastrings=["None"])
gender = more_data[:, :gender]
adir_age = more_data[:, :ADIR_age]
ados_age = more_data[:, :ADOS_age]
age = copy(adir_age)
age[isna(age)] = ados_age[isna(age)];


In [3]:
# Fill sparse matrix
m, n = size(df)
is = Array(Int, 0)
js = Array(Int, 0)
vs = Array(Int, 0)

for j=1:n
    for i=1:m
        if !isna(df[i,j])
            push!(is, i)
            push!(js, j)
            if(df[i, j] > 0)
                push!(vs, 1)
            else
                push!(vs, -1)
            end
        end
    end
end

p = size(is, 1)
all_data = sparse(is, js, vs, m, n);

In [4]:
# Split out testing data
test_indices = sample(1:p, ceil(Integer, 0.1 * p))
train_indices = setdiff(1:p, test_indices)

p_test = size(test_indices, 1)
p_train = size(train_indices, 1)

train_data = sparse(is[train_indices], js[train_indices], vs[train_indices], m, n);


In [175]:
using LowRankModels

losses = LogisticLoss()
#losses = QuadLoss()
k = 4
#rx = UnitOneSparseConstraint()
rx = [FixedLatentFeaturesConstraint(Array(float(age[!isna(age)]))), SimplexConstraint(),SimplexConstraint(), SimplexConstraint()]
ry = [ZeroReg(), ZeroReg(), ZeroReg(), ZeroReg()]
glrm = GLRM(train_data[!isna(age), :], losses, rx, ry, k, offset=false, scale=false);
#init_kmeanspp!(glrm);

X,Y,ch = fit!(glrm, verbose=true, max_iter=1000); # fit GLRM


LoadError: MethodError: no method matching LowRankModels.GLRM(::SparseMatrixCSC{Int64,Int64}, ::LowRankModels.LogisticLoss, ::Array{LowRankModels.Regularizer,1}, ::Array{LowRankModels.ZeroReg,1}, ::Int64; offset=false, scale=false)[0m
Closest candidates are:
  LowRankModels.GLRM(::Any, ::Any, ::Any, ::Any, ::Any, [1m[31m::Any[0m, [1m[31m::Any[0m, [1m[31m::Any[0m, [1m[31m::Any[0m) at /Users/kelley/.julia/v0.5/LowRankModels/src/glrm.jl:14[1m[31m got unsupported keyword arguments "offset", "scale"[0m
  LowRankModels.GLRM(::Any, ::LowRankModels.Loss, [1m[31m::LowRankModels.Regularizer[0m, ::Array{T,N}, ::Int64; kwargs...) at /Users/kelley/.julia/v0.5/LowRankModels/src/utilities/conveniencemethods.jl:59
  LowRankModels.GLRM(::AbstractArray{T,N}, [1m[31m::Array{Tuple{Int64,Int64},1}[0m, ::Any...; kwargs...) at deprecated.jl:49
  ...[0m

In [132]:
print(vecnorm(train_data - X.'*Y, 2))

966.8004989427108

In [133]:
approx = X.'*Y

#col_max, aindx = findmax(all_data, 1)

# Scale columns back up
#imputed = round(Int, approx.*(ones(m, 1)*scale_factors.'))
#imputed = round(Int, approx)

# Train confusion
train_ordinal_confusion = zeros(5, 5)
i_indices = is[train_indices]
j_indices = js[train_indices]

train_rmse = 0

for i=1:p_train
    i_index = i_indices[i]
    j_index = j_indices[i]
    #c = col_max[j_index]

    a_value = 0
    if all_data[i_index, j_index] > 0
        a_value = 1
    end
    i_value = 0
    if approx[i_index, j_index] > 0
        i_value = 1
    end
        
    #i_value = min(max(imputed[i_index, j_index], 0), c)
    
    train_ordinal_confusion[a_value+1, i_value+1] += 1 
    train_rmse += (a_value - i_value)^2
end

train_rmse = train_rmse/p_train

# Test confusion
test_ordinal_confusion = zeros(5, 5)
i_indices = is[test_indices]
j_indices = js[test_indices]

test_rmse = 0

for i=1:p_test
    i_index = i_indices[i]
    j_index = j_indices[i]
    #c = col_max[j_index]

    a_value = 0
    if all_data[i_index, j_index] > 0
        a_value = 1
    end
    i_value = 0
    if approx[i_index, j_index] > 0
        i_value = 1
    end
    
    test_ordinal_confusion[a_value+1, i_value+1] += 1 
    test_rmse += (a_value - i_value)^2 
end

test_rmse = test_rmse/p_test

append!(ks, k)
append!(k_train_error, train_rmse)
append!(k_test_error, test_rmse)

println(train_rmse)
println(test_rmse)

0.20487053847020112
0.2151877082341742


In [120]:
print(DataFrame(train_ordinal_confusion))

5×5 DataFrames.DataFrame
│ Row │ x1       │ x2       │ x3  │ x4  │ x5  │
├─────┼──────────┼──────────┼─────┼─────┼─────┤
│ 1   │ 336751.0 │ 193829.0 │ 0.0 │ 0.0 │ 0.0 │
│ 2   │ 109797.0 │ 576162.0 │ 0.0 │ 0.0 │ 0.0 │
│ 3   │ 0.0      │ 0.0      │ 0.0 │ 0.0 │ 0.0 │
│ 4   │ 0.0      │ 0.0      │ 0.0 │ 0.0 │ 0.0 │
│ 5   │ 0.0      │ 0.0      │ 0.0 │ 0.0 │ 0.0 │

In [121]:
print(DataFrame(test_ordinal_confusion))

5×5 DataFrames.DataFrame
│ Row │ x1      │ x2      │ x3  │ x4  │ x5  │
├─────┼─────────┼─────────┼─────┼─────┼─────┤
│ 1   │ 36813.0 │ 21973.0 │ 0.0 │ 0.0 │ 0.0 │
│ 2   │ 12141.0 │ 63537.0 │ 0.0 │ 0.0 │ 0.0 │
│ 3   │ 0.0     │ 0.0     │ 0.0 │ 0.0 │ 0.0 │
│ 4   │ 0.0     │ 0.0     │ 0.0 │ 0.0 │ 0.0 │
│ 5   │ 0.0     │ 0.0     │ 0.0 │ 0.0 │ 0.0 │

In [122]:
imputed = round(Int, approx)

for i=1:m
    for j=1:n
        if imputed[i, j] > 0
            imputed[i, j] = 1
        else
            imputed[i, j] = 0
        end
    end
end
new_df = convert(DataFrame, imputed)
names!(new_df, names(df))
new_df = hcat(samples, new_df)

Unnamed: 0,identifier,ADIR_Q11,ADIR_Q12,ADIR_Q13,ADIR_Q14,ADIR_Q15,ADIR_Q16,ADIR_Q18,ADIR_Q20,ADIR_Q21,ADIR_Q22,ADIR_Q23,ADIR_Q24,ADIR_Q25,ADIR_Q27,ADIR_Q29_1,ADIR_Q29_2,ADIR_Q30,ADIR_Q31_1,ADIR_Q31_2,ADIR_Q32_1,ADIR_Q32_2,ADIR_Q33_1,ADIR_Q33_2,ADIR_Q34_1,ADIR_Q34_2,ADIR_Q35_1,ADIR_Q35_2,ADIR_Q36_1,ADIR_Q36_2,ADIR_Q37_1,ADIR_Q37_2,ADIR_Q38_1,ADIR_Q38_2,ADIR_Q39_1,ADIR_Q39_2,ADIR_Q40_1,ADIR_Q40_2,ADIR_Q41_1,ADIR_Q41_2,ADIR_Q42_1,ADIR_Q42_2,ADIR_Q43_1,ADIR_Q43_2,ADIR_Q44_1,ADIR_Q44_2,ADIR_Q45_1,ADIR_Q45_2,ADIR_Q46_1,ADIR_Q46_2,ADIR_Q47_1,ADIR_Q47_2,ADIR_Q48_1,ADIR_Q48_2,ADIR_Q49_1,ADIR_Q49_2,ADIR_Q50_1,ADIR_Q50_2,ADIR_Q51_1,ADIR_Q51_2,ADIR_Q52_1,ADIR_Q52_2,ADIR_Q53_1,ADIR_Q53_2,ADIR_Q54_1,ADIR_Q54_2,ADIR_Q55_1,ADIR_Q55_2,ADIR_Q56_1,ADIR_Q56_2,ADIR_Q57_1,ADIR_Q57_2,ADIR_Q58_1,ADIR_Q58_2,ADIR_Q59_1,ADIR_Q59_2,ADIR_Q60_1,ADIR_Q60_2,ADIR_Q61_1,ADIR_Q61_2,ADIR_Q62_1,ADIR_Q62_2,ADIR_Q63_1,ADIR_Q63_2,ADIR_Q64_1,ADIR_Q64_2,ADIR_Q65_1,ADIR_Q65_2,ADIR_Q66_1,ADIR_Q66_2,ADIR_Q67_1,ADIR_Q67_2,ADIR_Q68_1,ADIR_Q68_2,ADIR_Q69_1,ADIR_Q69_2,ADIR_Q70_1,ADIR_Q70_2,ADIR_Q71_1,ADIR_Q71_2,ADIR_Q72_1,ADIR_Q72_2,ADIR_Q73_1,ADIR_Q73_2,ADIR_Q74_1,ADIR_Q74_2,ADIR_Q75_1,ADIR_Q75_2,ADIR_Q76_1,ADIR_Q76_2,ADIR_Q77_1,ADIR_Q77_2,ADIR_Q78_1,ADIR_Q78_2,ADIR_Q79_1,ADIR_Q79_2,ADIR_Q80_1,ADIR_Q80_2,ADIR_Q81_1,ADIR_Q81_2,ADIR_Q82_1,ADIR_Q82_2,ADIR_Q83_1,ADIR_Q83_2,ADIR_Q84_1,ADIR_Q84_2,ADIR_Q85_1,ADIR_Q85_2,ADIR_Q88_1,ADIR_Q88_2,ADIR_Q89_1,ADIR_Q89_2,ADIR_Q90_1,ADIR_Q90_2,ADIR_Q91_1,ADIR_Q91_2,ADIR_Q92_1,ADIR_Q92_2,ADIR_Q93_1,ADIR_Q93_2,ADOS_QA01,ADOS_QA02,ADOS_QA03,ADOS_QA04,ADOS_QA05,ADOS_QA06,ADOS_QA07,ADOS_QA08,ADOS_QA09,ADOS_QA10,ADOS_QA11,ADOS_QA12,ADOS_QA13,ADOS_QB01,ADOS_QB02,ADOS_QB03,ADOS_QB04,ADOS_QB05,ADOS_QB06,ADOS_QB07,ADOS_QB08,ADOS_QB09,ADOS_QB10,ADOS_QB11,ADOS_QB12,ADOS_QB13,ADOS_QB14,ADOS_QB15,ADOS_QB16,ADOS_QB17,ADOS_QB18,ADOS_QB19,ADOS_QB20,ADOS_QB21,ADOS_QB22,ADOS_QB23,ADOS_QC01,ADOS_QC02,ADOS_QD01,ADOS_QD02,ADOS_QD03,ADOS_QD04,ADOS_QD05,ADOS_QE01,ADOS_QE02,ADOS_QE03
1,AU000503,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
2,AU000504,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
3,AU001203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
4,AU001204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
5,AU001505,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
6,AU001703,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
7,AU001704,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
8,AU002105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
9,AU002106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
10,AU002204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0


In [128]:
# Evaluate clusters
print(mapslices(sum, X, 2))

[325.788; 312.305; 773.505; 562.331; 1380.78; 321.463; 2187.27; 3236.56]

In [134]:
writecsv("../data/softkmeans_quadloss_X$(k).csv", X)
writecsv("../data/softkmeans_quadloss_Y$(k).csv", Y)