**Official Documentation**: https://clusteringjl.readthedocs.io/en/latest/index.html

In [1]:
using Clustering

**Basic inputs:**
1. X, an array of dimension d x N, where each column represents a point in dimension d
2. D, the distance matrix, recording the distances between the points
3. k, number of clusters
3. (Optional) initseeds(:kmcen, X, k), initialize the initial points (seeds) before the iterations

In [2]:
using LinearAlgebra # "norm"
N = 5000
X = rand(2,N)
## D = [norm(X[:,i]-X[:,j]) for i=1:N, j = 1:N]; ## D is not used in kmeans

In [3]:
k = 50
## initseeds(:kmcen, X, k) ## initialize the points
R = kmeans(X, k) ##; maxiter = 2000, display=:iter) ## R = Result

KmeansResult{Float64}([0.368451 0.764873 … 0.0892876 0.269491; 0.424365 0.771516 … 0.203335 0.803507], [25, 29, 18, 39, 21, 19, 31, 25, 8, 17  …  27, 29, 39, 34, 46, 17, 45, 35, 24, 27], [0.000828645, 0.00329169, 0.00752864, 0.00251532, 0.00120369, 0.00244944, 0.00391675, 0.000380992, 0.0049973, 0.00537855  …  0.00377966, 0.00124255, 0.00182683, 0.00158376, 0.00127812, 0.00208668, 0.0032218, 0.00774143, 0.00354022, 0.00556056], [119, 89, 119, 95, 92, 97, 107, 103, 115, 124  …  134, 107, 80, 85, 111, 91, 112, 99, 88, 114], [119.0, 89.0, 119.0, 95.0, 92.0, 97.0, 107.0, 103.0, 115.0, 124.0  …  134.0, 107.0, 80.0, 85.0, 111.0, 91.0, 112.0, 99.0, 88.0, 114.0], 16.07636817231873, 35, true)

In [4]:
A = assignments(R) ## the assignments of the points
N = nclusters(R) ## number of clusters
C = counts(R); ## size of each cluster

In [5]:
Cluster_Dic = Dict(i=>Int[] for i=1:k)
for i=1:k
    Cluster_Dic[i] = findall(A.==i)
end
Cluster_Dic;

In [6]:
using Plots
plotly()

Plots.PlotlyBackend()

In [8]:
scatter()
for i=1:k
    scatter!(X[1,Cluster_Dic[i]], X[2,Cluster_Dic[i]], label = "Cluster $i", markersize = 2)
end
scatter!()

### Test the clustering on the Santa Travelling Data

In [3]:
using DataFrames
using CSV
Cities = CSV.read("cities.csv")
head(Cities)

Unnamed: 0_level_0,CityId,X,Y
Unnamed: 0_level_1,Int64⍰,Float64⍰,Float64⍰
1,0,316.837,2202.34
2,1,4377.41,336.602
3,2,3454.16,2820.05
4,3,4688.1,2935.9
5,4,1010.7,3236.75
6,5,2474.23,1435.51


In [4]:
X = Array{Float64,1}(Cities[:X])   ## x-coordinates of the cities
Y = Array{Float64,1}(Cities[:Y])  ## y-coordinates of the cities
Data = [X';Y']
n = size(Data,2)

197769

In [63]:
N_upper = 50 ## The upper bound for the sizes of each group
k = N_upper
## initseeds(:kmcen, X, k) ## initialize the points
@time R = kmeans(Data, k) ##; maxiter = 2000, display=:iter) ## R = Result

  5.546762 seconds (867 allocations: 1.314 GiB, 13.07% gc time)


KmeansResult{Float64}([2666.96 4483.1 … 3571.19 4478.44; 1533.58 1704.19 … 1799.77 1320.65], [32, 28, 43, 26, 20, 35, 36, 30, 38, 4  …  13, 47, 27, 28, 24, 27, 40, 45, 33, 20], [3258.33, 26448.8, 98428.9, 43328.4, 54944.2, 32592.0, 56854.4, 8153.77, 6217.61, 50956.8  …  21822.8, 43566.2, 21236.5, 8449.82, 21356.2, 10747.5, 85828.4, 7607.28, 32272.4, 15818.7], [5760, 4074, 4489, 3724, 3513, 2557, 3825, 3853, 5958, 5241  …  4518, 3812, 4193, 2132, 5946, 3054, 3051, 3143, 3144, 3672], [5760.0, 4070.0, 4489.0, 3725.0, 3513.0, 2558.0, 3821.0, 3852.0, 5958.0, 5241.0  …  4518.0, 3809.0, 4193.0, 2132.0, 5947.0, 3054.0, 3051.0, 3143.0, 3143.0, 3669.0], 8.64439454588256e9, 100, false)

In [15]:
A = assignments(R) ## the assignments of the points
N = nclusters(R) ## number of clusters
C = counts(R); ## size of each cluster

In [18]:
Cluster_Dic = Dict(i=>Int[] for i=1:k)
for i=1:k
    Cluster_Dic[i] = findall(A.==i)
end
Cluster_Dic;

In [25]:
using StatsBase
sample

sample (generic function with 14 methods)

In [33]:
N_plot = 10000
Cities_plot = sample(collect(1:n), N_plot, replace = false)
scatter()
for i=1:k
    C_i_plot = intersect(Cluster_Dic[i], Cities_plot)
    scatter!(Data[1,C_i_plot], Data[2,C_i_plot], label = "Cluster $i", markersize = 2)
end
scatter!()

400

In [45]:
TooBig = zeros(Int, 0, 2)
for i=1:k
    if C[i]>400
        TooBig = [TooBig; [C[i] i]]
    end
end
sortslices(TooBig, dims = 1)

234×2 Array{Int64,2}:
 401  113
 402  129
 402  291
 403   78
 404   51
 404  431
 404  497
 405   94
 405  250
 407  132
 409  345
 410  443
 411  128
   ⋮     
 668   55
 672  251
 681  120
 692  237
 692  275
 709  462
 718    5
 727  179
 727  313
 738   11
 742  492
 762  280

ErrorException: cannot assign variable Rounding.RoundingMode from module Main

In [62]:
round(Int, C[113]/N_upper, RoundUp)

2

In [31]:
using DataFrames
using CSV
Cities = CSV.read("sample_submission.csv")
head(Cities)
tail(Cities)

Unnamed: 0_level_0,Path
Unnamed: 0_level_1,Int64⍰
1,197764
2,197765
3,197766
4,197767
5,197768
6,0


In [37]:
using StatsBase
Z = sample(collect(1:n-1), n-1, replace = false)
Z = [0;Z;0] ## The submission must have 0 as head and tail
df = DataFrame([Z], [:Path]); ## store as dataframe type and add the column name :Path
[head(df) tail(df)]

Unnamed: 0_level_0,Path,Path_1
Unnamed: 0_level_1,Int64,Int64
1,0,118476
2,143685,39929
3,18252,105426
4,58545,20960
5,37262,83643
6,149859,0


In [38]:
CSV.write("Z.csv", df)

"Z.csv"

In [30]:
tail(df)

Unnamed: 0_level_0,Path
Unnamed: 0_level_1,Int64
1,13847
2,137047
3,16211
4,163834
5,16395
6,157835
