In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'matplotlib'

In [4]:
data = pd.read_csv('seed.txt', sep='\t', header=None)
data.head()

NameError: name 'pd' is not defined

In [7]:
from pso import ParticleSwarmOptimizedClustering
from particle import quantization_error, calc_sse
from utils import normalize
from kmeans import KMeans
from sklearn.metrics import silhouette_score

In [8]:
x = data.drop([7], axis=1)
x = x.values
x = normalize(x)

# K-Means

In [9]:
kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
kmeans.fit(x)

In [10]:
predicted_kmeans = kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans.SSE)
print('Quantization:', quantization_error(centroids=kmeans.centroid, data=x, labels=predicted_kmeans))

Silhouette: 0.4221267624201065
SSE: 22.026451226606625
Quantization: 0.30155146657476695


In [11]:
kmeans2 = KMeans(n_cluster=3, init_pp=True, seed=2018)
kmeans2.fit(x)
predicted_kmeans2 = kmeans2.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans2.SSE)
print('Quantization:', quantization_error(centroids=kmeans2.centroid, data=x, labels=predicted_kmeans2))

Silhouette: 0.4221267624201065
SSE: 22.024363075666038
Quantization: 0.30166461874754386


# PSO

In [12]:
pso = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=50)

In [13]:
hist = pso.run()

Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Iteration 0051/2000 current gbest score 0.301530435970984001
Iteration 0101/2000 current gbest score 0.301246493581098396
Iteration 0151/2000 current gbest score 0.301131063618734818
Iteration 0201/2000 current gbest score 0.301119488009422731
Iteration 0251/2000 current gbest score 0.301118592547253927
Iteration 0301/2000 current gbest score 0.301118168449150569
Iteration 0351/2000 current gbest score 0.301118003803172940
Iteration 0401/2000 current gbest score 0.301117925167816602
Iteration 0451/2000 current gbest score 0.301117903621808181
Iteration 0501/2000 current gbest score 0.301117894581790058
Iteration 0551/2000 current gbest score 0.301117884645535128
Iteration 0601/2000 current gbest score 0.301117878766865077
Iteration 0651/2000 current gbest score 0.301117874951726650
Iteration 0701/2000 current gbest score 0.301117873970024819
Iteration 0751/2000 current gbest score

In [14]:
pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)

In [15]:
pso_kmeans.centroid = pso.gbest_centroids.copy()
pso_kmeans.centroid

array([[0.76640635, 0.80106655, 0.70501811, 0.73618774, 0.77954643,
        0.34616409, 0.75643494],
       [0.12297356, 0.17411279, 0.38077529, 0.1859815 , 0.16483633,
        0.50057702, 0.27994542],
       [0.380113  , 0.41709821, 0.66651837, 0.36819959, 0.46459734,
        0.26501223, 0.3125601 ]])

In [16]:
predicted_pso = pso_kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_pso))
print('SSE:', calc_sse(centroids=pso.gbest_centroids, data=x, labels=predicted_pso))
print('Quantization:', pso.gbest_score)

Silhouette: 0.4221267624201065
SSE: 22.077653418909577
Quantization: 0.3011178731855944


# Repeated Test

### K-Means++

In [17]:
kmeanspp = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    kmean_rep = KMeans(n_cluster=3, init_pp=True)
    kmean_rep.fit(x)
    predicted_kmean_rep = kmean_rep.predict(x)
    silhouette = silhouette_score(x, predicted_kmean_rep)
    sse = kmean_rep.SSE
    quantization = quantization_error(centroids=kmean_rep.centroid, data=x, labels=predicted_kmean_rep)
    kmeanspp['silhouette'].append(silhouette)
    kmeanspp['sse'].append(sse)
    kmeanspp['quantization'].append(quantization)

### PSO 

In [18]:
%%time
pso_plain = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=False, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_plain['silhouette'].append(silhouette)
    pso_plain['sse'].append(sse)
    pso_plain['quantization'].append(quantization)

Initial global best score 0.3617105399517535
Iteration 0001/2000 current gbest score 0.361710539951753507
Finish with gbest score 0.305641519218495528
Initial global best score 0.34371607973973806
Iteration 0001/2000 current gbest score 0.343716079739738056
Finish with gbest score 0.317904786517513360
Initial global best score 0.36780708437814785
Iteration 0001/2000 current gbest score 0.367807084378147853
Finish with gbest score 0.335791407903525740
Initial global best score 0.3741865052006312
Iteration 0001/2000 current gbest score 0.357001975957334239


  dist /= len(idx)


Finish with gbest score 0.328786011365069075
Initial global best score 0.3512698552908576
Iteration 0001/2000 current gbest score 0.329959807203160871
Finish with gbest score 0.307856729085487502
Initial global best score 0.3973251113074803
Iteration 0001/2000 current gbest score 0.392086385669132176


  dist /= len(idx)


Finish with gbest score 0.334177517428262238
Initial global best score 0.40842576129906655
Iteration 0001/2000 current gbest score 0.392661749171520136
Finish with gbest score 0.339040437918887660
Initial global best score 0.35235196875053454
Iteration 0001/2000 current gbest score 0.352351968750534539
Finish with gbest score 0.312820894560843110
Initial global best score 0.352606737445243
Iteration 0001/2000 current gbest score 0.352606737445243013
Finish with gbest score 0.306303893313251530
Initial global best score 0.355009558589699
Iteration 0001/2000 current gbest score 0.355009558589699015
Finish with gbest score 0.327544951609474666
Initial global best score 0.3958329597176247
Iteration 0001/2000 current gbest score 0.393837622982138036


  dist /= len(idx)


Finish with gbest score 0.333901387263665972
Initial global best score 0.38287697626757183
Iteration 0001/2000 current gbest score 0.365614284082737495
Finish with gbest score 0.326951395565490965
Initial global best score 0.39121453329678996
Iteration 0001/2000 current gbest score 0.388213698925619355


  dist /= len(idx)


Finish with gbest score 0.329257342237316619
Initial global best score 0.3758516252811644
Iteration 0001/2000 current gbest score 0.370381675972170143


  dist /= len(idx)


Finish with gbest score 0.314776145956207343
Initial global best score 0.370465627696307
Iteration 0001/2000 current gbest score 0.370465627696306976
Finish with gbest score 0.322589157789306247
Initial global best score 0.34026650231638095
Iteration 0001/2000 current gbest score 0.331709021415022898
Finish with gbest score 0.313885373768033993
Initial global best score 0.3753768033722515
Iteration 0001/2000 current gbest score 0.375376803372251489


  dist /= len(idx)


Finish with gbest score 0.327915126711963378
Initial global best score 0.3692182472380037
Iteration 0001/2000 current gbest score 0.369218247238003727


  dist /= len(idx)


Finish with gbest score 0.319779391059682838
Initial global best score 0.3581826003567479
Iteration 0001/2000 current gbest score 0.358182600356747927
Finish with gbest score 0.319163789438223378
Initial global best score 0.3536015306771876
Iteration 0001/2000 current gbest score 0.353601530677187581


  dist /= len(idx)


Finish with gbest score 0.321375426921985397
CPU times: total: 1min 16s
Wall time: 1min 21s


### PSO Hybrid

In [30]:
%%time
pso_hybrid = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_hybrid['silhouette'].append(silhouette)
    pso_hybrid['sse'].append(sse)
    pso_hybrid['quantization'].append(quantization)

Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Finish with gbest score 2.707647699944975894
Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Finish with gbest score 2.707904781184771181
Initial global best score 2.70759080945
Iteration 0001/2000 current gbest score 2.707590809454386349
Finish with gbest score 2.707590809454386349
Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Finish with gbest score 2.707619636849610156
Initial global best score 2.70759080945
Iteration 0001/2000 current gbest score 2.707590809454386349
Finish with gbest score 2.707590809454386349
Initial global best score 2.70759080945
Iteration 0001/2000 current gbest score 2.707590809454386349
Finish with gbest score 2.707590809454386349
Initial global best score 2.70829286554
Iteration 0001/2000 current gbest score 2.708292865537892169
Finish with gbest score

# Comparison

In [33]:
benchmark = {
    'method' : ['K-Means++', 'PSO', 'PSO Hybrid'],
    'sse_mean' : [
        np.around(np.mean(kmeanspp['sse']), decimals=10),
        np.around(np.mean(pso_plain['sse']), decimals=10),
        np.around(np.mean(pso_hybrid['sse']), decimals=10),
    ],
    'sse_stdev' : [
        np.around(np.std(kmeanspp['sse']), decimals=10),
        np.around(np.std(pso_plain['sse']), decimals=10),
        np.around(np.std(pso_hybrid['sse']), decimals=10),
    ],
    'silhouette_mean' : [
        np.around(np.mean(kmeanspp['silhouette']), decimals=10),
        np.around(np.mean(pso_plain['silhouette']), decimals=10),
        np.around(np.mean(pso_hybrid['silhouette']), decimals=10),
    ],
    'silhouette_stdev' : [
        np.around(np.std(kmeanspp['silhouette']), decimals=10),
        np.around(np.std(pso_plain['silhouette']), decimals=10),
        np.around(np.std(pso_hybrid['silhouette']), decimals=10),
    ],
    'quantization_mean' : [
        np.around(np.mean(kmeanspp['quantization']), decimals=10),
        np.around(np.mean(pso_plain['quantization']), decimals=10),
        np.around(np.mean(pso_hybrid['quantization']), decimals=10),
    ],
    'quantization_stdev' : [
        np.around(np.std(kmeanspp['quantization']), decimals=10),
        np.around(np.std(pso_plain['quantization']), decimals=10),
        np.around(np.std(pso_hybrid['quantization']), decimals=10),
    ],
}

In [34]:
benchmark

{'method': ['K-Means++', 'PSO', 'PSO Hybrid'],
 'quantization_mean': [2.7082928654999998,
  2.8964455008000001,
  2.7076448866999998],
 'quantization_stdev': [0.0, 0.1117984798, 8.5579899999999997e-05],
 'silhouette_mean': [0.42210525679999999, 0.3781104596, 0.42212676240000002],
 'silhouette_stdev': [0.0, 0.063243413799999995, 0.0],
 'sse_mean': [22.024363075699998, 28.695969913300001, 22.027339630299998],
 'sse_stdev': [0.0, 5.6574814616999998, 0.0014085896]}

In [35]:
benchmark_df = pd.DataFrame.from_dict(benchmark)
benchmark_df

Unnamed: 0,method,quantization_mean,quantization_stdev,silhouette_mean,silhouette_stdev,sse_mean,sse_stdev
0,K-Means++,2.708293,0.0,0.422105,0.0,22.024363,0.0
1,PSO,2.896446,0.111798,0.37811,0.063243,28.69597,5.657481
2,PSO Hybrid,2.707645,8.6e-05,0.422127,0.0,22.02734,0.001409


In [37]:
benchmark_df.to_excel('benchmark_res.xlsx', index=False)

In [38]:
benchmark_df.to_csv('benchmark_res.csv', index=False)