In [90]:
using LMCLUS
include("NMI.jl")
using FreqTables

### 3-D dataset with four 2-D linear manifold clusters: perturbation of 5% of the smallest SL, separation of 30% of the largest SL

In [146]:
# Read the comma-delimited dataset (no header line). The last row is taken as the
# ground-truth label of every column; the rows above it form the feature matrix.
data_original = readdlm("d:\\LMC_Datasets\\data_3_4_5_30.csv", ',', header=false);
trueLab = data_original[end, :];
data = data_original[1:(end - 1), :];

# Colour lookup used by the plots, keyed by the label rendered as a string.
cDict = Dict(
    "1" => "red",
    "2" => "blue",
    "3" => "green",
    "4" => "violet",
    "5" => "black",
    "6" => "orange",
    "7" => "magenta"
);

In [147]:
# Min-max normalise every row of `x` in place.
#
# x : 2-D numeric array. Each row is shifted by its minimum and divided by its
#     range (maximum - minimum), so a row ends up spanning [0, 1]. `x` itself is
#     overwritten.
#
# Returns `nothing`.
#
# Notes:
#  * Dotted operators are used for the vector-with-scalar arithmetic, so the
#    function works on Julia >= 1.0 as well as on older releases.
#  * A row whose entries are all equal has zero range; it is set to all zeros
#    instead of being filled with NaN by a 0/0 division.
function normalizeArr(x)
    for i in 1:size(x, 1)
        row = x[i, :]                 # copy of the row; min/max are computed once
        lo = minimum(row)
        span = maximum(row) - lo
        shifted = row .- lo
        x[i, :] = span == 0 ? shifted : shifted ./ span
    end
    return nothing
end

# Min-max scale each row of `data` in place before clustering.
normalizeArr(data);

In [81]:
# Build a per-point vector of predicted cluster labels from a list of manifolds.
#
# Ms : indexable collection of manifolds; `labels(Ms[i])` supplies the index set of
#      the points belonging to the i-th manifold.
#      NOTE(review): presumably a vector of integer point indices - confirm against
#      the LMCLUS documentation.
# l  : total number of points.
#
# Returns a length-`l` Float64 vector (it is created with `ones`) in which each
# point carries the position in `Ms` of the manifold that lists it. Points listed
# by no manifold keep the initial label 1; if several manifolds list the same
# point, the last one wins.
function genPredLab(Ms, l)
    predLab = ones(l)

    for (i, M) in enumerate(Ms)
        # `predLab[idx] = i` with a scalar right-hand side is an error on
        # Julia >= 1.0; filling a view behaves the same on old and new versions.
        fill!(view(predLab, labels(M)), i)
    end

    return predLab
end

genPredLab (generic function with 1 method)

In [150]:
# Configure and run LMCLUS on `data`.
# NOTE(review): the argument 2 is presumably the maximum manifold dimension to
# search for - confirm against the LMCLUS documentation.
params = LMCLUS.Parameters(2);
# No trailing semicolon: the notebook displays the returned manifolds.
Ms = lmclus(data,params)

5-element Array{LMCLUS.Manifold,1}:
 Manifold (dim = 1, size = 955) 
 Manifold (dim = 1, size = 964) 
 Manifold (dim = 1, size = 1000)
 Manifold (dim = 2, size = 1000)
 Manifold (dim = 0, size = 81)  

In [153]:
# Label every point with the position of the manifold that LMCLUS put it in.
predLab = genPredLab(Ms, length(trueLab));

# Print the score that getNMI derives from the true-vs-predicted contingency table
# (presumably normalised mutual information - see NMI.jl).
freqtable(trueLab, predLab) |> getNMI |> print

# Display the contingency table itself: rows are true labels, columns predicted.
freqtable(trueLab, predLab)

0.97

4×5 Named Array{Int64,2}
Dim1 ╲ Dim2 │  1.0   2.0   3.0   4.0   5.0
────────────┼─────────────────────────────
1.0         │    0     0     0  1000     0
2.0         │    0   964     0     0    36
3.0         │  955     0     0     0    45
4.0         │    0     0  1000     0     0

In [156]:
# Inspect the first column of the fourth manifold's `proj` matrix.
Ms[4].proj[:,1]

3-element Array{Float64,1}:
  0.458894
 -0.127286
 -0.879326

In [152]:
# 3-D scatter of the points as loaded (not normalised), coloured by true label.
trace1 = scatter3d(;
    x=data_original[1, :],
    y=data_original[2, :],
    z=data_original[3, :],
    mode="markers",
    marker_size=0.5,
    marker_color=[cDict[string(Int(lbl))] for lbl in trueLab]
)
layout = Layout(; title="Original Clusters");
plot(trace1, layout)
#trace2 = scatter3d(;x=data_original[1,1:2], y=[minimum(data_original[2,:]),maximum(data_original[2,:])], z=[minimum(data_original[3,:]),maximum(data_original[3,:])], mode="lines")

In [158]:
# LMCLUS result: the points coloured by predicted label, plus one line segment per
# basis direction of each manifold. Every segment is centred on the mean of the
# columns listed in the manifold's `points` and reaches `scale * direction` to
# either side.

# Clustered points
trace1 = scatter3d(;
    x=data_original[1, :],
    y=data_original[2, :],
    z=data_original[3, :],
    mode="markers",
    marker_size=0.5,
    marker_color=[cDict[string(Int(lbl))] for lbl in predLab]
)

# First cluster's manifold
C1 = mean(data_original[1:3, Ms[1].points], 2);
mu1 = Ms[1].proj;
trace2 = scatter3d(;
    x=C1[1] .+ [-5, 5] .* mu1[1],
    y=C1[2] .+ [-5, 5] .* mu1[2],
    z=C1[3] .+ [-5, 5] .* mu1[3],
    mode="lines",
    marker_color="red"
)

# Second cluster's manifold
C2 = mean(data_original[1:3, Ms[2].points], 2);
mu2 = Ms[2].proj;
trace3 = scatter3d(;
    x=C2[1] .+ [-10, 10] .* mu2[1],
    y=C2[2] .+ [-10, 10] .* mu2[2],
    z=C2[3] .+ [-10, 10] .* mu2[3],
    mode="lines",
    marker_color="blue"
)

# Third cluster's manifold
C3 = mean(data_original[1:3, Ms[3].points], 2);
mu3 = Ms[3].proj;
trace4 = scatter3d(;
    x=C3[1] .+ [-10, 10] .* mu3[1],
    y=C3[2] .+ [-10, 10] .* mu3[2],
    z=C3[3] .+ [-10, 10] .* mu3[3],
    mode="lines",
    marker_color="green"
)

# Fourth cluster's manifold: one segment for each of the two columns of `proj`
C4 = mean(data_original[1:3, Ms[4].points], 2);
mu4 = Ms[4].proj;
trace5 = scatter3d(;
    x=C4[1] .+ [-5, 5] .* mu4[1, 1],
    y=C4[2] .+ [-5, 5] .* mu4[2, 1],
    z=C4[3] .+ [-5, 5] .* mu4[3, 1],
    mode="lines",
    marker_color="violet"
)
trace6 = scatter3d(;
    x=C4[1] .+ [-5, 5] .* mu4[1, 2],
    y=C4[2] .+ [-5, 5] .* mu4[2, 2],
    z=C4[3] .+ [-5, 5] .* mu4[3, 2],
    mode="lines",
    marker_color="violet"
)

layout = Layout(; title="LMCLUS Clusters", showlegend=false);
plot([trace1, trace2, trace3, trace4, trace5, trace6], layout)

In [179]:
using Clustering

# Baseline: k-means with 4 clusters on the same `data` matrix.
R = kmeans(data, 4);
kMeansPred = Clustering.assignments(R);

# Print the getNMI score of the true-vs-k-means contingency table, then show the
# table (rows: true labels, columns: k-means assignments).
freqtable(trueLab, kMeansPred) |> getNMI |> print
freqtable(trueLab, kMeansPred)

0.98

4×4 Named Array{Int64,2}
Dim1 ╲ Dim2 │    1     2     3     4
────────────┼───────────────────────
1.0         │    0     0     0  1000
2.0         │    0     0   972    28
3.0         │ 1000     0     0     0
4.0         │    0  1000     0     0

In [177]:
# 3-D scatter of the points as loaded, coloured by k-means assignment.
trace1 = scatter3d(;
    x=data_original[1, :],
    y=data_original[2, :],
    z=data_original[3, :],
    mode="markers",
    marker_size=0.5,
    marker_color=[cDict[string(Int(lbl))] for lbl in kMeansPred]
)
layout = Layout(; title="K-Means Clusters");
plot(trace1, layout)

In [159]:
# Build a bar trace with one bar per column of `data`.
#
# M       : manifold; its `μ` and `proj` fields are handed to `distance_to_manifold`.
# data    : matrix whose columns are the points to measure.
# predLab : one label per column of `data`; each bar is coloured through the global
#           `cDict`, looked up by the label converted to an integer string.
#
# Bar heights are whatever `distance_to_manifold(data, M.μ, M.proj)` returns
# (presumably each point's distance to the manifold - see LMCLUS). The x positions
# are the column numbers 1:size(data, 2).
function plotHist(M, data, predLab)
    dists = distance_to_manifold(data, M.μ, M.proj)
    columns = 1:size(data, 2)
    colors = [cDict[string(Int(lbl))] for lbl in predLab]
    return bar(; x=columns, y=dists, marker_color=colors, line=false)
end

plotHist (generic function with 1 method)

In [160]:
# One distance bar chart per manifold, in the order LMCLUS returned them. Before
# each later manifold, the columns listed in the previous manifold's `points` are
# removed from `index`, so a chart only covers the points not yet taken.
layout = Layout(; title="Distance histogram", showlegend=false, xaxis_ticks=false);

b1 = plotHist(Ms[1], data, predLab);

index = setdiff(1:size(data, 2), Ms[1].points);
b2 = plotHist(Ms[2], data[:, index], predLab[index]);

index = setdiff(index, Ms[2].points);
b3 = plotHist(Ms[3], data[:, index], predLab[index]);

index = setdiff(index, Ms[3].points);
b4 = plotHist(Ms[4], data[:, index], predLab[index]);

# Arrange the four charts in a 2x2 grid and hide the combined legend.
p = [
    plot(b1, layout) plot(b2, layout)
    plot(b3, layout) plot(b4, layout)
];
p.plot.layout["showlegend"] = false;
p

### 3-D dataset with four 2-D linear manifold clusters: perturbation of 5% of the smallest SL, separation of 30% of the largest SL

In [161]:
# Read the second comma-delimited dataset (no header line). The last row is taken
# as the true label of every column; the rows above it form the feature matrix.
data_original2 = readdlm("d:\\LMC_Datasets\\data_3_4_5_20.csv", ',', header=false);
trueLab2 = data_original2[end, :];
data2 = data_original2[1:(end - 1), :];

# Scale the feature rows in place; `data_original2` is left as loaded.
normalizeArr(data2);

In [171]:
# 3-D scatter of the second dataset as loaded, coloured by true label.
trace1 = scatter3d(;
    x=data_original2[1, :],
    y=data_original2[2, :],
    z=data_original2[3, :],
    mode="markers",
    marker_size=0.5,
    marker_color=[cDict[string(Int(lbl))] for lbl in trueLab2]
)
layout = Layout(; title="Original Clusters");
plot(trace1, layout)

In [169]:
# Configure and run LMCLUS on `data2`.
# NOTE(review): the argument 2 is presumably the maximum manifold dimension to
# search for - confirm against the LMCLUS documentation.
params = LMCLUS.Parameters(2);
# No trailing semicolon: the notebook displays the returned manifolds.
Ms2 = lmclus(data2,params)

5-element Array{LMCLUS.Manifold,1}:
 Manifold (dim = 1, size = 1000)
 Manifold (dim = 1, size = 969) 
 Manifold (dim = 1, size = 1000)
 Manifold (dim = 2, size = 989) 
 Manifold (dim = 0, size = 42)  

In [170]:
# Label every point with the position of the manifold that LMCLUS put it in.
predLab2 = genPredLab(Ms2, length(trueLab2));

# Print the score that getNMI derives from the true-vs-predicted contingency table
# (presumably normalised mutual information - see NMI.jl).
freqtable(trueLab2, predLab2) |> getNMI |> print

# Display the contingency table itself: rows are true labels, columns predicted.
freqtable(trueLab2, predLab2)

0.97

4×5 Named Array{Int64,2}
Dim1 ╲ Dim2 │  1.0   2.0   3.0   4.0   5.0
────────────┼─────────────────────────────
1.0         │    0    11     0   989     0
2.0         │ 1000     0     0     0     0
3.0         │    0     0  1000     0     0
4.0         │    0   958     0     0    42

In [174]:
# LMCLUS result for the second dataset: the points coloured by predicted label,
# plus one line segment per basis direction of each manifold. Every segment is
# centred on the mean of the columns listed in the manifold's `points` and reaches
# `scale * direction` to either side.

# Clustered points
trace1 = scatter3d(;
    x=data_original2[1, :],
    y=data_original2[2, :],
    z=data_original2[3, :],
    mode="markers",
    marker_size=0.5,
    marker_color=[cDict[string(Int(lbl))] for lbl in predLab2]
)

# First cluster's manifold
C1 = mean(data_original2[1:3, Ms2[1].points], 2);
mu1 = Ms2[1].proj;
trace2 = scatter3d(;
    x=C1[1] .+ [-8, 8] .* mu1[1],
    y=C1[2] .+ [-8, 8] .* mu1[2],
    z=C1[3] .+ [-8, 8] .* mu1[3],
    mode="lines",
    marker_color="red"
)

# Second cluster's manifold
C2 = mean(data_original2[1:3, Ms2[2].points], 2);
mu2 = Ms2[2].proj;
trace3 = scatter3d(;
    x=C2[1] .+ [-8, 8] .* mu2[1],
    y=C2[2] .+ [-8, 8] .* mu2[2],
    z=C2[3] .+ [-8, 8] .* mu2[3],
    mode="lines",
    marker_color="blue"
)

# Third cluster's manifold
C3 = mean(data_original2[1:3, Ms2[3].points], 2);
mu3 = Ms2[3].proj;
trace4 = scatter3d(;
    x=C3[1] .+ [-10, 10] .* mu3[1],
    y=C3[2] .+ [-10, 10] .* mu3[2],
    z=C3[3] .+ [-10, 10] .* mu3[3],
    mode="lines",
    marker_color="green"
)

# Fourth cluster's manifold: one segment for each of the two columns of `proj`
C4 = mean(data_original2[1:3, Ms2[4].points], 2);
mu4 = Ms2[4].proj[:, 1];
trace5 = scatter3d(;
    x=C4[1] .+ [-8, 8] .* mu4[1],
    y=C4[2] .+ [-8, 8] .* mu4[2],
    z=C4[3] .+ [-8, 8] .* mu4[3],
    mode="lines",
    marker_color="violet"
)
mu5 = Ms2[4].proj[:, 2];
trace6 = scatter3d(;
    x=C4[1] .+ [-8, 8] .* mu5[1],
    y=C4[2] .+ [-8, 8] .* mu5[2],
    z=C4[3] .+ [-8, 8] .* mu5[3],
    mode="lines",
    marker_color="violet"
)

layout = Layout(; title="LMCLUS Clusters", showlegend=false);
plot([trace1, trace2, trace3, trace4, trace5, trace6], layout)

In [175]:
# One distance bar chart per manifold of the second run, in the order LMCLUS
# returned them. Before each later manifold, the columns listed in the previous
# manifold's `points` are removed from `index`, so a chart only covers the points
# not yet taken.
layout = Layout(; title="Distance histogram", showlegend=false, xaxis_ticks=false);

b1 = plotHist(Ms2[1], data2, predLab2);

index = setdiff(1:size(data2, 2), Ms2[1].points);
b2 = plotHist(Ms2[2], data2[:, index], predLab2[index]);

index = setdiff(index, Ms2[2].points);
b3 = plotHist(Ms2[3], data2[:, index], predLab2[index]);

index = setdiff(index, Ms2[3].points);
b4 = plotHist(Ms2[4], data2[:, index], predLab2[index]);

# Arrange the four charts in a 2x2 grid and hide the combined legend.
p = [
    plot(b1, layout) plot(b2, layout)
    plot(b3, layout) plot(b4, layout)
];
p.plot.layout["showlegend"] = false;
p

In [178]:
# Baseline for the second dataset: k-means with 4 clusters on `data2`.
R2 = kmeans(data2, 4);
kMeansPred2 = Clustering.assignments(R2);

# 3-D scatter of the points as loaded, coloured by k-means assignment.
trace1 = scatter3d(;
    x=data_original2[1, :],
    y=data_original2[2, :],
    z=data_original2[3, :],
    mode="markers",
    marker_size=0.5,
    marker_color=[cDict[string(Int(lbl))] for lbl in kMeansPred2]
)
layout = Layout(; title="K-Means Clusters");
plot(trace1, layout)

In [180]:
# Print the getNMI score of the true-vs-k-means contingency table, then show the
# table (rows: true labels, columns: k-means assignments).
freqtable(trueLab2, kMeansPred2) |> getNMI |> print
freqtable(trueLab2, kMeansPred2)

0.96

4×4 Named Array{Int64,2}
Dim1 ╲ Dim2 │    1     2     3     4
────────────┼───────────────────────
1.0         │    0     4   996     0
2.0         │    0   964    13    23
3.0         │    0     0     0  1000
4.0         │ 1000     0     0     0