In [83]:
using DataFrames

# To run this notebook, you need to have a data file available.
# You can either run the phenotype preprocessing scripts in ../preprocessing directory
# Or scp -r username@sherlock.stanford.edu:/scratch/PI/dpwall/DATA/phenotypes/jsonschema ../data

# Data
df = readtable("../data/all_samples_filtered.csv", nastrings=["None", ""])
samples = df[:, 1:1]
df = df[:, 2:end] # Remove identifier

m, n = size(df)

labels = readtable("../data/all_samples_filtered_labels.csv", nastrings=["None", ""])

println(m, " ", n)

13434 123


In [84]:
# Pull age and gender
more_data = readtable("../data/all_samples.csv", nastrings=["None"])
gender = more_data[:, :gender]
adir_age = more_data[:, :ADIR_age]
ados_age = more_data[:, :ADOS_age]
age = copy(adir_age)
age[isna(age)] = ados_age[isna(age)];


In [85]:
# Fill sparse matrix
m, n = size(df)
is = Array(Int, 0)
js = Array(Int, 0)
vs = Array(Int, 0)

for j=1:n
    for i=1:m
        if !isna(df[i,j])
            push!(is, i)
            push!(js, j)
            if(df[i, j] > 0)
                push!(vs, 1)
            else
                push!(vs, -1)
            end
        end
    end
end

p = size(is, 1)
all_data = sparse(is, js, vs, m, n);

In [86]:
# Split out testing data
test_indices = sample(1:p, ceil(Integer, 0.1 * p))
train_indices = setdiff(1:p, test_indices)

p_test = size(test_indices, 1)
p_train = size(train_indices, 1)

train_data = sparse(is[train_indices], js[train_indices], vs[train_indices], m, n);


In [87]:
using LowRankModels

losses = LogisticLoss()
#losses = QuadLoss()
k = 3
#rx = UnitOneSparseConstraint()
rx = SimplexConstraint()
ry = ZeroReg()
glrm = GLRM(train_data, losses, rx, ry, k, offset=false, scale=false);
#init_kmeanspp!(glrm);

X,Y,ch = fit!(glrm, verbose=true, max_iter=5000); # fit GLRM


LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 472994.4945717246
Iteration 20: objective value = 426271.8396160792
Iteration 30: objective value = 400857.2406556693
obj went up to 395351.57622249756; reducing step size to 3.502231979423655
obj went up to 395030.6925058587; reducing step size to 2.4515623855965587
obj went up to 394340.1545156791; reducing step size to 1.7160936699175913
Iteration 40: objective value = 393132.229863025
obj went up to 392308.586887639; reducing step size to 1.3906150542468463
Iteration 50: objective value = 390050.6638999463
obj went up to 390146.06046331045; reducing step size to 1.2423714479891312


In [88]:
print(vecnorm(train_data - X.'*Y, 2))

2196.3546730301873

In [89]:
approx = X.'*Y

#col_max, aindx = findmax(all_data, 1)

# Scale columns back up
#imputed = round(Int, approx.*(ones(m, 1)*scale_factors.'))
#imputed = round(Int, approx)

# Train confusion
train_ordinal_confusion = zeros(5, 5)
i_indices = is[train_indices]
j_indices = js[train_indices]

train_rmse = 0

for i=1:p_train
    i_index = i_indices[i]
    j_index = j_indices[i]
    #c = col_max[j_index]

    a_value = 0
    if all_data[i_index, j_index] > 0
        a_value = 1
    end
    i_value = 0
    if approx[i_index, j_index] > 0
        i_value = 1
    end
        
    #i_value = min(max(imputed[i_index, j_index], 0), c)
    
    train_ordinal_confusion[a_value+1, i_value+1] += 1 
    train_rmse += (a_value - i_value)^2
end

train_rmse = train_rmse/p_train

# Test confusion
test_ordinal_confusion = zeros(5, 5)
i_indices = is[test_indices]
j_indices = js[test_indices]

test_rmse = 0

for i=1:p_test
    i_index = i_indices[i]
    j_index = j_indices[i]
    #c = col_max[j_index]

    a_value = 0
    if all_data[i_index, j_index] > 0
        a_value = 1
    end
    i_value = 0
    if approx[i_index, j_index] > 0
        i_value = 1
    end
    
    test_ordinal_confusion[a_value+1, i_value+1] += 1 
    test_rmse += (a_value - i_value)^2 
end

test_rmse = test_rmse/p_test

println(train_rmse)
println(test_rmse)

0.1977768088265326
0.21165141647021218


In [58]:
print(DataFrame(train_ordinal_confusion))

5×5 DataFrames.DataFrame
│ Row │ x1       │ x2       │ x3  │ x4  │ x5  │
├─────┼──────────┼──────────┼─────┼─────┼─────┤
│ 1   │ 237139.0 │ 95736.0  │ 0.0 │ 0.0 │ 0.0 │
│ 2   │ 74597.0  │ 355045.0 │ 0.0 │ 0.0 │ 0.0 │
│ 3   │ 0.0      │ 0.0      │ 0.0 │ 0.0 │ 0.0 │
│ 4   │ 0.0      │ 0.0      │ 0.0 │ 0.0 │ 0.0 │
│ 5   │ 0.0      │ 0.0      │ 0.0 │ 0.0 │ 0.0 │

In [59]:
print(DataFrame(test_ordinal_confusion))

5×5 DataFrames.DataFrame
│ Row │ x1      │ x2      │ x3  │ x4  │ x5  │
├─────┼─────────┼─────────┼─────┼─────┼─────┤
│ 1   │ 25906.0 │ 11077.0 │ 0.0 │ 0.0 │ 0.0 │
│ 2   │ 8407.0  │ 38873.0 │ 0.0 │ 0.0 │ 0.0 │
│ 3   │ 0.0     │ 0.0     │ 0.0 │ 0.0 │ 0.0 │
│ 4   │ 0.0     │ 0.0     │ 0.0 │ 0.0 │ 0.0 │
│ 5   │ 0.0     │ 0.0     │ 0.0 │ 0.0 │ 0.0 │

In [54]:
imputed = round(Int, approx)

for i=1:m
    for j=1:n
        if imputed[i, j] > 0
            imputed[i, j] = 1
        else
            imputed[i, j] = 0
        end
    end
end
new_df = convert(DataFrame, imputed)
names!(new_df, names(df))
new_df = hcat(samples, new_df)

Unnamed: 0,identifier,ADIR_Q11,ADIR_Q12,ADIR_Q13,ADIR_Q14,ADIR_Q15,ADIR_Q16,ADIR_Q18,ADIR_Q20,ADIR_Q21,ADIR_Q22,ADIR_Q23,ADIR_Q24,ADIR_Q25,ADIR_Q27,ADIR_Q29_2,ADIR_Q30,ADIR_Q31_2,ADIR_Q32_2,ADIR_Q33_2,ADIR_Q34_2,ADIR_Q35_2,ADIR_Q36_2,ADIR_Q37_2,ADIR_Q38_2,ADIR_Q39_2,ADIR_Q40_2,ADIR_Q41_2,ADIR_Q42_2,ADIR_Q43_2,ADIR_Q44_2,ADIR_Q45_2,ADIR_Q46_2,ADIR_Q47_2,ADIR_Q48_2,ADIR_Q49_2,ADIR_Q50_2,ADIR_Q51_2,ADIR_Q52_2,ADIR_Q53_2,ADIR_Q54_2,ADIR_Q55_2,ADIR_Q56_2,ADIR_Q57_2,ADIR_Q58_2,ADIR_Q59_2,ADIR_Q60_2,ADIR_Q61_2,ADIR_Q62_2,ADIR_Q63_2,ADIR_Q64_2,ADIR_Q65_2,ADIR_Q66_2,ADIR_Q67_2,ADIR_Q68_2,ADIR_Q69_2,ADIR_Q70_2,ADIR_Q71_2,ADIR_Q72_2,ADIR_Q73_2,ADIR_Q74_2,ADIR_Q75_2,ADIR_Q76_2,ADIR_Q77_2,ADIR_Q78_2,ADIR_Q79_2,ADIR_Q80_2,ADIR_Q81_2,ADIR_Q82_2,ADIR_Q83_2,ADIR_Q84_2,ADIR_Q85_2,ADIR_Q88_2,ADIR_Q89_2,ADIR_Q90_2,ADIR_Q91_2,ADIR_Q92_2,ADIR_Q93_2,ADOS_QA01,ADOS_QA02,ADOS_QA03,ADOS_QA04,ADOS_QA05,ADOS_QA06,ADOS_QA07,ADOS_QA08,ADOS_QA09,ADOS_QA10,ADOS_QA11,ADOS_QA12,ADOS_QA13,ADOS_QB01,ADOS_QB02,ADOS_QB03,ADOS_QB04,ADOS_QB05,ADOS_QB06,ADOS_QB07,ADOS_QB08,ADOS_QB09,ADOS_QB10,ADOS_QB11,ADOS_QB12,ADOS_QB13,ADOS_QB14,ADOS_QB15,ADOS_QB16,ADOS_QB17,ADOS_QB18,ADOS_QB19,ADOS_QB20,ADOS_QB21,ADOS_QB22,ADOS_QB23,ADOS_QC01,ADOS_QC02,ADOS_QD01,ADOS_QD02,ADOS_QD03,ADOS_QD04,ADOS_QD05,ADOS_QE01,ADOS_QE02,ADOS_QE03
1,AU000503,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
2,AU000504,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
3,AU001203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
4,AU001204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
5,AU001505,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
6,AU001703,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
7,AU001704,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
8,AU002105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
9,AU002106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
10,AU002204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0


In [81]:
# Evaluate clusters
print(mapslices(sum, X, 2))

[2663.89; 7012.61; 3757.51]

In [90]:
writecsv("../data/softkmeans_logloss_X$(k).csv", X)
writecsv("../data/softkmeans_logloss_Y$(k).csv", Y)