In [1]:
using DataFrames

# To run this notebook, you need to have a data file available.
# You can either run the phenotype preprocessing scripts in ../preprocessing directory
# Or scp -r username@sherlock.stanford.edu:/scratch/PI/dpwall/DATA/iHART/kpaskov/PhenotypeGLRM/data ../data

# Data
df = readtable("../data/all_samples_filtered.csv", nastrings=["None", ""])
samples = df[:, 1:1]
df = df[:, 2:end] # Remove identifier

m, n = size(df)

labels = readtable("../data/all_samples_filtered_labels.csv", nastrings=["None", ""])

println(m, " ", n)

13493 123




In [2]:
# Pull age and gender
more_data = readtable("../data/all_samples.csv", nastrings=["None"])
gender = more_data[:, :gender]
adir_age = more_data[:, :ADIR_age]
ados_age = more_data[:, :ADOS_age]
age = copy(adir_age)
age[isna(age)] = ados_age[isna(age)];


In [3]:
# Fill sparse matrix
m, n = size(df)
is = Array(Int, 0)
js = Array(Int, 0)
vs = Array(Int, 0)

for j=1:n
    for i=1:m
        if !isna(df[i,j])
            push!(is, i)
            push!(js, j)
            if(df[i, j] > 0)
                push!(vs, 1)
            else
                push!(vs, -1)
            end
        end
    end
end

p = size(is, 1)
all_data = sparse(is, js, vs, m, n);

In [None]:
using LowRankModels

losses = LogisticLoss()
#losses = QuadLoss()
k = 20

# k-means
#rx = UnitOneSparseConstraint() # k-means
#ry = ZeroReg()

# soft k-means
#rx = SimplexConstraint() 
#ry = ZeroReg()

# sparse dictionary coding
rx = NonNegOneReg()
ry = OneReg()

glrm = GLRM(all_data, losses, rx, ry, k, offset=false, scale=false);
#init_kmeanspp!(glrm);

X,Y,ch = fit!(glrm, verbose=true, max_iter=5000); # fit GLRM


LowRankModels.SparseProxGradParams(1.0,100,1,1.0e-5,0.01)
Fitting GLRM
Iteration 10: objective value = 485022.96640890266
Iteration 20: objective value = 414740.7067587958
Iteration 30: objective value = 381125.34302299033
obj went up to 373024.88298977114; reducing step size to 3.502231979423655
obj went up to 372230.80011471355; reducing step size to 2.574140504876387
Iteration 40: objective value = 368771.4896175043
obj went up to 368899.16449252213; reducing step size to 2.0859225813702698
obj went up to 364500.769858033; reducing step size to 1.6903012897811311
Iteration 50: objective value = 362033.6055710165
obj went up to 360846.3397772147; reducing step size to 1.5101102598523397
Iteration 60: objective value = 357923.03122626664
obj went up to 357812.7124493263; reducing step size to 1.349128117394138
Iteration 70: objective value = 355100.2324286498
obj went up to 355314.77708534664; reducing step size to 1.3288511209419063


In [5]:
# Check cluster distribution
print(mapslices(sum, X, 2))

[8003.49; 5489.51]

In [8]:
# Output to file
X_df = convert(DataFrame, X.')
X_df = hcat(samples, X_df)
writetable("../data/softkmeans_logloss_X$(k).csv", X_df, separator = ',', header=false)
Y_df = convert(DataFrame, Y)
print(size(names(df)), size(Y_df))
names!(Y_df, names(df))
writetable("../data/softkmeans_logloss_Y$(k).csv", Y_df, separator = ',', header=true)

(123,)(2,123)