In [1]:
import numpy as np
import pandas as pd
import warnings
import copy
import random
from scipy.stats import multivariate_normal
from scipy.special import logsumexp
warnings.filterwarnings("ignore")

# Read In Dataset

In [2]:
df = pd.read_csv("./codon_usage.csv")

In [3]:
df.head()

Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,100217,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.0005,0.00351,0.01203,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,100220,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,100755,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,100880,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,100887,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.0138,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13028 entries, 0 to 13027
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Kingdom      13028 non-null  object 
 1   DNAtype      13028 non-null  int64  
 2   SpeciesID    13028 non-null  int64  
 3   Ncodons      13028 non-null  int64  
 4   SpeciesName  13028 non-null  object 
 5   UUU          13028 non-null  object 
 6   UUC          13028 non-null  object 
 7   UUA          13028 non-null  float64
 8   UUG          13028 non-null  float64
 9   CUU          13028 non-null  float64
 10  CUC          13028 non-null  float64
 11  CUA          13028 non-null  float64
 12  CUG          13028 non-null  float64
 13  AUU          13028 non-null  float64
 14  AUC          13028 non-null  float64
 15  AUA          13028 non-null  float64
 16  AUG          13028 non-null  float64
 17  GUU          13028 non-null  float64
 18  GUC          13028 non-null  float64
 19  GUA 

# Drop Faulty Datapoints

In [5]:
# lines 488 and 5065 are index 486 and 5063
df = df.drop([486, 5063], axis = 0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13026 entries, 0 to 13027
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Kingdom      13026 non-null  object 
 1   DNAtype      13026 non-null  int64  
 2   SpeciesID    13026 non-null  int64  
 3   Ncodons      13026 non-null  int64  
 4   SpeciesName  13026 non-null  object 
 5   UUU          13026 non-null  object 
 6   UUC          13026 non-null  object 
 7   UUA          13026 non-null  float64
 8   UUG          13026 non-null  float64
 9   CUU          13026 non-null  float64
 10  CUC          13026 non-null  float64
 11  CUA          13026 non-null  float64
 12  CUG          13026 non-null  float64
 13  AUU          13026 non-null  float64
 14  AUC          13026 non-null  float64
 15  AUA          13026 non-null  float64
 16  AUG          13026 non-null  float64
 17  GUU          13026 non-null  float64
 18  GUC          13026 non-null  float64
 19  GUA 

# Drop unused features

In [7]:
# drop unused features
vals = [2, 3, 4, 5]
df = df.drop(df.columns[vals], axis = 1)

In [8]:
df.head()

Unnamed: 0,Kingdom,DNAtype,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,0.01203,0.0005,0.00351,0.01203,0.03208,0.001,0.0401,0.00551,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,0.01357,0.00068,0.00678,0.00407,0.02849,0.00204,0.0441,0.01153,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,0.0218,0.01357,0.01543,0.00782,0.01111,0.01028,0.01193,0.02283,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,0.02245,0.01619,0.00992,0.01567,0.01358,0.0094,0.01723,0.02402,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,0.01371,0.00767,0.03679,0.0138,0.00548,0.00473,0.02076,0.02716,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [9]:
len(df)

13026

In [10]:
y = df.iloc[:, 0]

In [11]:
x = df.iloc[:, 1:]

In [12]:
# 64 dim data points 
x

Unnamed: 0,DNAtype,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,0,0.01203,0.00050,0.00351,0.01203,0.03208,0.00100,0.04010,0.00551,0.02005,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.00050,0.00000
1,0,0.01357,0.00068,0.00678,0.00407,0.02849,0.00204,0.04410,0.01153,0.02510,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.01560,0.04410,0.00271,0.00068,0.00000
2,0,0.0218,0.01357,0.01543,0.00782,0.01111,0.01028,0.01193,0.02283,0.01604,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.00000,0.00144
3,0,0.02245,0.01619,0.00992,0.01567,0.01358,0.00940,0.01723,0.02402,0.02245,...,0.00366,0.01410,0.01671,0.03760,0.01932,0.03029,0.03446,0.00261,0.00157,0.00000
4,0,0.01371,0.00767,0.03679,0.01380,0.00548,0.00473,0.02076,0.02716,0.00867,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.00000,0.00044,0.00131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13023,0,0.03555,0.00547,0.01367,0.01276,0.02097,0.00820,0.03555,0.01459,0.03920,...,0.00820,0.01367,0.01094,0.01367,0.02279,0.02005,0.04102,0.00091,0.00091,0.00638
13024,1,0.03193,0.01984,0.00629,0.01451,0.05322,0.07644,0.01258,0.03096,0.06386,...,0.00145,0.00000,0.00048,0.00194,0.01306,0.01838,0.00677,0.00242,0.00097,0.01887
13025,1,0.03321,0.01661,0.00356,0.01127,0.05042,0.09609,0.01068,0.02728,0.06643,...,0.00000,0.00000,0.00000,0.00178,0.01661,0.02788,0.00297,0.00356,0.00119,0.02017
13026,0,0.02028,0.00767,0.01293,0.01319,0.01959,0.00715,0.03964,0.01600,0.02082,...,0.01142,0.01217,0.01196,0.02178,0.02510,0.02896,0.03959,0.00099,0.00079,0.00156


In [13]:
# ground truth for the clustering based off the kingdom
y

0        vrl
1        vrl
2        vrl
3        vrl
4        vrl
        ... 
13023    pri
13024    pri
13025    pri
13026    pri
13027    pri
Name: Kingdom, Length: 13026, dtype: object

In [14]:
x = x.to_numpy(dtype=float)

In [15]:
x

array([[0.000e+00, 1.203e-02, 5.000e-04, ..., 2.510e-03, 5.000e-04,
        0.000e+00],
       [0.000e+00, 1.357e-02, 6.800e-04, ..., 2.710e-03, 6.800e-04,
        0.000e+00],
       [0.000e+00, 2.180e-02, 1.357e-02, ..., 3.910e-03, 0.000e+00,
        1.440e-03],
       ...,
       [1.000e+00, 3.321e-02, 1.661e-02, ..., 3.560e-03, 1.190e-03,
        2.017e-02],
       [0.000e+00, 2.028e-02, 7.670e-03, ..., 9.900e-04, 7.900e-04,
        1.560e-03],
       [1.000e+00, 3.724e-02, 1.732e-02, ..., 1.560e-03, 1.140e-03,
        2.161e-02]])

In [16]:
y = y.to_numpy()

In [17]:
y

array(['vrl', 'vrl', 'vrl', ..., 'pri', 'pri', 'pri'], dtype=object)

# Get True Clusters

In [18]:
true_clusters = {}
for i in range(len(y)):
    if y[i] not in true_clusters.keys():
        true_clusters[y[i]] = []
        true_clusters[y[i]].append(x[i])
    else:
        true_clusters[y[i]].append(x[i])
    

In [19]:
for key in true_clusters.keys():
    print(len(true_clusters[key]))

2831
126
2919
220
18
2523
1345
2077
572
215
180


# Function for NMI Metric

In [20]:
def NMI(x, clusters, true_clusters):
    T = []
    for key in true_clusters.keys():
        T.append(np.array(true_clusters[key]))
    T = np.array(T)
    
    k = len(clusters)
    
    # get entropy of the predicted clustering
    summ = 0
    for i in range(k):
        pci = len(clusters[i]) / len(x) + 1e-6
        val = pci * np.log(pci)
        summ += val
    # H(C)
    pred_entropy = -1 * summ
    
    # get entropy of the true clustering
    summ = 0
    for i in range(k):
        pti = len(T[i]) / len(x) + 1e-6
        val = pti * np.log(pti)
        summ += val
    # H(T)
    true_entropy = -1 * summ
    
    # get the conditional entropy
    summ = 0
    for i in range(k):
        for j in range(k):
            # get the intersecion between the two clusters
            intersect = 0
            for a in range(len(clusters[i])):
                if clusters[i][a] in T[j]:
                    intersect += 1
            
            
            # get p_ij and p_ci
            pij = intersect / len(x) + 1e-6
            pci = len(clusters[i]) / len(x) + 1e-6
            val = pij * np.log(pij / pci)
            summ += val
    # H(T|C)
    conditional_entropy = -1 * summ
    
    # calculate I(C, T) = H(T) - H(T|C)
    mutual_information = true_entropy - conditional_entropy
    
    # Finally calculate the NMI
    nmi = (mutual_information) / np.sqrt(pred_entropy * true_entropy)
    return nmi
    
            
            

# Expectation-Maximization Algorithm

In [21]:
def EM(x, k, epsilon, true_clusters):
    t = 0
    # ----- INITIALIZE ------ # 
    # first randomly pick k points as the initial cluster centers
    means = []
    for i in range(k):
        val = np.random.randint(0, len(x)-1)
        means.append(x[val])
    means = np.array(means)
    
    # initialize k covariance matrices (d x d matrix)
    covs = []
    initial_clusters = []
    for i in range(k):
        initial_clusters.append([])
    # put all points in clusters
    for i in range(len(x)):
        point = x[i]
        norms = []
        for j in range(k):
            normal = np.linalg.norm(x[i] - means[j])
            norms.append(normal)
        index = np.argmin(norms)
        initial_clusters[index].append(point)
    # turn into numpy object
    for i in range(len(initial_clusters)):
        initial_clusters[i] = np.array(initial_clusters[i])
    initial_clusters = np.array(initial_clusters)
    
    # now for each cluster, get the cov matrix
    for i in range(k):
        if len(initial_clusters[i]) == 0:
            c = np.identity(64)
            covs.append(c)
        else:
            c = np.cov(initial_clusters[i], rowvar=False, bias=True)
            covs.append(c)
    
    
    # initialize P(Ci)
    P_Ci = []
    for i in range(k):
        val = 1 / k
        P_Ci.append(val)
    # get P(Ci) into log space
    P_Ci = np.array(P_Ci)
    P_Ci = np.log(P_Ci)
    
    # expectation step
    old_means = copy.deepcopy(means)
    
    # ------ MAIN LOOP ------ #
    while(True):
        
        old_means = copy.deepcopy(means)
        
        # pdf for all i in k
        pdf = np.zeros((len(x), k))
        for i in range(k):
            pdf[:, i] = multivariate_normal.logpdf(x, mean=means[i], cov=covs[i], allow_singular=True)
        
        # get logsumexp
        t1 = pdf + P_Ci
        lse = logsumexp(t1, axis = 1)
        
        
        # get the w matrix
        w = pdf + P_Ci
        for i in range(len(w)):
            w[i] = w[i] - lse[i]
        
        w = np.exp(w)
        
        # Maximization step
        # get new extimate for mean
        for i in range(k):
            denom = np.dot(w[:, i], np.ones((len(w),1)))
            num = np.dot(x.T, w[:, i])
            means[i] = num / denom
            
        # get new cov matrix estimates
        for i in range(k):
            # get centered points 
            x_center = x - means[i]
            # iteratively sum outer products
            new_cov = np.zeros((64, 64))
            for j in range(len(x)):
                outer_prod = np.outer(x_center[j], x_center[j])
                outer_prod = outer_prod * w[j][i]
                new_cov = new_cov + outer_prod
            # divide by the sum of weights
            denom = np.dot(w[:, i], np.ones((len(w),1)))
            
            new_cov = new_cov / denom
            # save the new matrix
            covs[i] = new_cov
        
        # get new prior probabilities
        for i in range(k):
            num = np.dot(w[:, i], np.ones((len(w),1)))
            P_Ci[i] = num / len(x)
        P_Ci = np.log(P_Ci)
        
        t += 1    
        # check break condition
        summ = 0
        for i in range(k):
            diff = means[i] - old_means[i]
            norm2 = np.linalg.norm(diff) ** 2
            summ += norm2
        
        if summ <= epsilon:
            break
    
    # ------ FINAL CLUSTERING ------ #
    clusters = []
    for i in range(k):
        clusters.append([])
    for i in range(len(x)):
        point = x[i]
        norms = []
        for j in range(k):
            normal = np.linalg.norm(x[i] - means[j])
            norms.append(normal)
        index = np.argmin(norms)
        clusters[index].append(point)
    
    for i in range(len(clusters)):
        clusters[i] = np.array(clusters[i])
    clusters = np.array(clusters)
    
    # ------ OUTPUT ------ #
    for i in range(k):
        print("Size of cluster " + str(i) + ":", len(clusters[i]))
        
    for i in range(k):
        print()
        print("Final Mean for cluster " + str(i) + ":")
        print(means[i])
        print()
        print("Final Covariance Matrix for cluster " + str(i) + ":")
        print(covs[i])
    
    nmi = NMI(x, clusters, true_clusters)
    print()
    print("NMI:", nmi)
    
    return nmi, clusters, means

# Algorithm Execution

### First run the function once to show the output of the EM algorithm

In [22]:
v1, v2, v3 = EM(x, 11, 1e-2, true_clusters)

Size of cluster 0: 1614
Size of cluster 1: 762
Size of cluster 2: 22
Size of cluster 3: 1686
Size of cluster 4: 2899
Size of cluster 5: 1614
Size of cluster 6: 0
Size of cluster 7: 862
Size of cluster 8: 3156
Size of cluster 9: 0
Size of cluster 10: 411

Final Mean for cluster 0:
[0.00115202 0.0170344  0.02088943 0.02000019 0.01709167 0.00996761
 0.01173825 0.01070414 0.02865562 0.01546125 0.02297319 0.02547781
 0.02305995 0.01076486 0.01416213 0.01624686 0.02120818 0.01012496
 0.02172163 0.00555504 0.01424819 0.00636704 0.01841778 0.00482468
 0.01218799 0.01838805 0.0095129  0.02009999 0.01015194 0.01768714
 0.00874706 0.02078201 0.00544245 0.01469675 0.00942235 0.02018932
 0.01064279 0.02488068 0.00608472 0.0234665  0.01442797 0.02177454
 0.01384418 0.03364512 0.01917154 0.01354546 0.0081861  0.01401704
 0.00853606 0.03824864 0.02697456 0.00534853 0.00337382 0.00495802
 0.00285593 0.01850659 0.01044923 0.03688288 0.01788834 0.03561201
 0.02315776 0.00123572 0.00049304 0.00059707]

Fi


NMI: 0.9504325272402895


### Run the algorithm 10 times and get the means with the best NMI score

In [23]:
scores = []
all_clusters = []
centers = []
for i in range(10):
    score, cluster, center = EM(x, 11, 1e-2, true_clusters)
    scores.append(score)
    all_clusters.append(cluster)
    centers.append(center)
    print("Execution:", i+1, "done.\n\n")


Size of cluster 0: 1307
Size of cluster 1: 1855
Size of cluster 2: 0
Size of cluster 3: 0
Size of cluster 4: 1250
Size of cluster 5: 0
Size of cluster 6: 2075
Size of cluster 7: 511
Size of cluster 8: 1592
Size of cluster 9: 3574
Size of cluster 10: 862

Final Mean for cluster 0:
[1.00000000e+00 3.95744171e-02 2.67659519e-02 3.18329786e-03
 1.95055356e-02 3.16684069e-02 7.63197501e-02 9.50332653e-03
 3.71740319e-02 5.12785364e-02 4.10743075e-02 9.42900781e-03
 8.14437937e-03 1.16495839e-02 2.04822413e-02 3.59649841e-03
 1.25795600e-02 2.98742641e-02 2.58503217e-02 1.45466362e-03
 9.83962391e-03 1.79518980e-02 3.03677818e-02 1.68320558e-03
 2.48797531e-03 7.28813439e-03 1.62007354e-02 2.57592843e-02
 6.32213886e-03 1.00051315e-02 1.96428674e-02 2.89337877e-02
 1.66960998e-03 2.64853880e-03 8.42259071e-03 1.45154753e-02
 3.18792993e-02 4.02193986e-02 2.14138481e-03 1.16375272e-02
 1.95940098e-02 2.09150151e-02 2.27920821e-03 1.18571303e-02
 2.99162130e-02 1.95093510e-03 5.35762796e-03 7.


NMI: 0.9461679745510432
Execution: 1 done.


Size of cluster 0: 7
Size of cluster 1: 862
Size of cluster 2: 1685
Size of cluster 3: 0
Size of cluster 4: 1239
Size of cluster 5: 438
Size of cluster 6: 776
Size of cluster 7: 2848
Size of cluster 8: 0
Size of cluster 9: 34
Size of cluster 10: 5137

Final Mean for cluster 0:
[0.000e+00 1.030e-02 2.535e-02 1.038e-02 8.770e-03 3.060e-03 3.380e-03
 8.900e-04 2.229e-02 5.710e-03 1.255e-02 6.920e-03 2.663e-02 4.350e-03
 1.416e-02 3.300e-03 5.077e-02 9.740e-03 4.265e-02 1.600e-04 5.630e-03
 1.210e-03 1.263e-02 0.000e+00 1.070e-02 1.987e-02 2.660e-03 4.570e-02
 1.370e-03 2.197e-02 3.620e-03 3.661e-02 8.900e-04 1.794e-02 5.870e-03
 5.801e-02 8.850e-03 6.622e-02 7.200e-04 2.631e-02 7.640e-03 7.890e-03
 1.210e-03 5.665e-02 7.720e-03 6.823e-02 2.567e-02 8.050e-03 2.090e-03
 4.562e-02 1.416e-02 4.800e-04 0.000e+00 6.400e-04 8.000e-05 1.601e-02
 5.600e-04 4.699e-02 4.750e-03 2.366e-02 3.940e-03 2.873e-02 4.430e-03
 1.450e-03]

Final Covariance Matrix 


NMI: 1.0215134404983188
Execution: 2 done.


Size of cluster 0: 0
Size of cluster 1: 1321
Size of cluster 2: 2559
Size of cluster 3: 3972
Size of cluster 4: 933
Size of cluster 5: 5
Size of cluster 6: 1796
Size of cluster 7: 862
Size of cluster 8: 1083
Size of cluster 9: 0
Size of cluster 10: 495

Final Mean for cluster 0:
[1.24427880e+00 1.15371643e-02 8.85142104e-02 1.54201994e-02
 1.53711686e-02 2.14503799e-03 1.18393688e-02 2.15376324e-03
 7.11976689e-02 9.10586079e-03 4.99958547e-02 1.72540083e-02
 2.73574635e-02 2.82912026e-03 2.18323492e-02 5.87943392e-03
 2.10238199e-02 3.84874798e-03 1.42089774e-02 2.36757010e-03
 1.61100007e-02 2.89352691e-03 1.18300089e-02 1.54129136e-03
 6.44347681e-03 2.32150556e-02 3.17790952e-03 2.10294639e-02
 7.19773741e-03 2.63411657e-02 3.72194500e-03 2.18760299e-02
 2.44999912e-03 1.62747925e-02 3.00996580e-03 2.12165602e-02
 3.76545363e-03 1.89129062e-02 1.87946249e-03 3.95097632e-02
 6.78529330e-03 1.87831447e-02 3.09018872e-03 4.70553264e-02
 8.


NMI: 0.9601225861379228
Execution: 3 done.


Size of cluster 0: 1794
Size of cluster 1: 46
Size of cluster 2: 1724
Size of cluster 3: 832
Size of cluster 4: 816
Size of cluster 5: 2113
Size of cluster 6: 2067
Size of cluster 7: 184
Size of cluster 8: 1574
Size of cluster 9: 1204
Size of cluster 10: 672

Final Mean for cluster 0:
[0.         0.0221265  0.00857583 0.01618524 0.01451347 0.01986441
 0.01122644 0.020391   0.01665154 0.02190309 0.01313958 0.02501506
 0.01622286 0.01886881 0.00984326 0.02426135 0.02008591 0.02396631
 0.02013492 0.01140902 0.01420685 0.01564936 0.0180155  0.00884749
 0.01512673 0.01597567 0.01764538 0.02063876 0.01617918 0.01244182
 0.01454506 0.01452885 0.00677235 0.01002113 0.01313271 0.01639443
 0.02083633 0.01861066 0.00917867 0.01257983 0.01965304 0.01761851
 0.01889808 0.01725795 0.02484445 0.00796604 0.01030594 0.00954468
 0.01324396 0.02309482 0.0294945  0.00557856 0.00785215 0.00581766
 0.00566116 0.0150354  0.01491185 0.02343516 0.0275766  0.0262729


NMI: 0.8924973978352796
Execution: 4 done.


Size of cluster 0: 0
Size of cluster 1: 750
Size of cluster 2: 816
Size of cluster 3: 313
Size of cluster 4: 576
Size of cluster 5: 0
Size of cluster 6: 4267
Size of cluster 7: 883
Size of cluster 8: 46
Size of cluster 9: 1260
Size of cluster 10: 4115

Final Mean for cluster 0:
[0.16323007 0.0106004  0.04241195 0.01422515 0.01666475 0.0043655
 0.01032427 0.00485063 0.04321429 0.01301729 0.02406046 0.02239471
 0.02616842 0.00639068 0.02007689 0.0083268  0.0245301  0.00725031
 0.0229017  0.00677049 0.01309572 0.00291155 0.01384146 0.00384528
 0.00960313 0.02390869 0.00848119 0.02043682 0.00637008 0.01837687
 0.00438696 0.01739426 0.00408411 0.01624097 0.00726892 0.02068511
 0.0070798  0.02229444 0.00605078 0.02989149 0.00955093 0.02920288
 0.00786466 0.04575875 0.01587821 0.00718793 0.00266906 0.01325583
 0.00480862 0.06363641 0.01767008 0.0101339  0.00375069 0.00461705
 0.00154397 0.01530902 0.00325697 0.04224416 0.01207851 0.05116546
 0.014


NMI: 1.0049950930234348
Execution: 5 done.


Size of cluster 0: 3422
Size of cluster 1: 0
Size of cluster 2: 319
Size of cluster 3: 408
Size of cluster 4: 2009
Size of cluster 5: 573
Size of cluster 6: 601
Size of cluster 7: 0
Size of cluster 8: 2580
Size of cluster 9: 862
Size of cluster 10: 2252

Final Mean for cluster 0:
[0.01211993 0.02280109 0.01016203 0.01608047 0.01621522 0.01696035
 0.00881375 0.01776354 0.01964359 0.02059282 0.01219882 0.02424565
 0.01904182 0.01685239 0.01056945 0.01959266 0.0217941  0.0202904
 0.01809989 0.00991241 0.01553292 0.01347789 0.016802   0.00840772
 0.01472295 0.01768443 0.0159586  0.02022616 0.01221582 0.01513021
 0.01427689 0.01378286 0.00802775 0.01108687 0.01273706 0.01665871
 0.0174924  0.0167549  0.00937645 0.0161643  0.0180358  0.019018
 0.0201704  0.02221374 0.02402366 0.01003874 0.01098015 0.01296611
 0.01305041 0.02599383 0.0307338  0.00905028 0.00798186 0.00660881
 0.00556182 0.01378001 0.01239603 0.02693489 0.0239658  0.02802617
 0.028


NMI: 0.9515412269309227
Execution: 6 done.


Size of cluster 0: 1384
Size of cluster 1: 47
Size of cluster 2: 923
Size of cluster 3: 0
Size of cluster 4: 998
Size of cluster 5: 612
Size of cluster 6: 0
Size of cluster 7: 649
Size of cluster 8: 3761
Size of cluster 9: 3032
Size of cluster 10: 1620

Final Mean for cluster 0:
[0.         0.02592233 0.00418623 0.01327958 0.01193475 0.02204126
 0.00439079 0.03395151 0.01579566 0.02958292 0.00416507 0.02417392
 0.01365127 0.02430988 0.00649401 0.02516093 0.01852314 0.03695071
 0.01485864 0.02450956 0.00989348 0.01611201 0.00858286 0.01766966
 0.01526209 0.01879835 0.03546066 0.01184753 0.01111307 0.00894121
 0.01533337 0.00670803 0.01336935 0.00661955 0.01652154 0.00999637
 0.02554817 0.0081725  0.01506367 0.01037935 0.02007861 0.01165425
 0.02661536 0.0124879  0.02721174 0.00389984 0.00961186 0.00891536
 0.01464492 0.01547774 0.03015986 0.01086081 0.02146557 0.00560728
 0.00917789 0.00402027 0.00499041 0.02321228 0.03307357 0.02309906
 0.0


NMI: 0.958202905811263
Execution: 7 done.


Size of cluster 0: 862
Size of cluster 1: 977
Size of cluster 2: 0
Size of cluster 3: 369
Size of cluster 4: 1132
Size of cluster 5: 756
Size of cluster 6: 312
Size of cluster 7: 799
Size of cluster 8: 2146
Size of cluster 9: 4211
Size of cluster 10: 1462

Final Mean for cluster 0:
[1.68456845e+00 1.89323693e-02 5.50929664e-02 2.21294868e-02
 2.20990831e-02 4.86424864e-03 1.50226206e-02 6.25507381e-03
 5.13739063e-02 1.37123842e-02 3.10892169e-02 1.92621789e-02
 2.29338814e-02 5.54663745e-03 2.08890785e-02 8.74436193e-03
 2.26962510e-02 6.39366484e-03 1.29849217e-02 4.26001467e-03
 1.56633729e-02 5.72569077e-03 1.13962362e-02 3.65259403e-03
 1.28460009e-02 2.04333901e-02 4.30275486e-03 2.01398043e-02
 1.08982272e-02 2.89509284e-02 9.63822609e-03 1.96896628e-02
 5.83269700e-03 1.50829124e-02 3.39237628e-03 2.01373451e-02
 6.20037765e-03 1.26973977e-02 3.61691611e-03 3.81437945e-02
 8.93824072e-03 2.18248989e-02 5.99532068e-03 3.77479779e-02
 


NMI: 0.9284049792843245
Execution: 8 done.


Size of cluster 0: 973
Size of cluster 1: 688
Size of cluster 2: 0
Size of cluster 3: 0
Size of cluster 4: 2567
Size of cluster 5: 332
Size of cluster 6: 1953
Size of cluster 7: 322
Size of cluster 8: 3531
Size of cluster 9: 862
Size of cluster 10: 1798

Final Mean for cluster 0:
[0.02500022 0.02890021 0.00050674 0.00712068 0.00505528 0.03005785
 0.00141048 0.05269565 0.00470849 0.03511419 0.00117404 0.02061836
 0.0048385  0.03583045 0.00284334 0.03463076 0.00712641 0.06049979
 0.00880683 0.04617338 0.00324818 0.01933342 0.0028521  0.03027319
 0.01484998 0.01113506 0.05482383 0.00692991 0.01487837 0.00186142
 0.01473339 0.00198135 0.01785586 0.0022736  0.01556579 0.00249454
 0.03262343 0.00278445 0.01857782 0.00521124 0.01847824 0.0045484
 0.02772956 0.00436014 0.0212276  0.00134513 0.0083148  0.00545191
 0.0176302  0.00424543 0.02513303 0.00757287 0.03707893 0.0037123
 0.02088086 0.001029   0.00319895 0.0124279  0.0480297  0.01689168
 0.04


NMI: 0.9457728483878101
Execution: 9 done.


Size of cluster 0: 862
Size of cluster 1: 8
Size of cluster 2: 2108
Size of cluster 3: 238
Size of cluster 4: 553
Size of cluster 5: 1241
Size of cluster 6: 1627
Size of cluster 7: 1799
Size of cluster 8: 3768
Size of cluster 9: 822
Size of cluster 10: 0

Final Mean for cluster 0:
[1.98180444e+00 2.13208863e-02 3.77029027e-02 2.37220521e-02
 2.41977980e-02 5.65411620e-03 1.56536024e-02 7.19354949e-03
 4.13726693e-02 1.54129040e-02 2.25452302e-02 1.98775598e-02
 1.90567283e-02 6.41173241e-03 1.96053049e-02 7.94797964e-03
 2.29532928e-02 6.72095536e-03 1.32380050e-02 4.59637258e-03
 1.45039672e-02 6.49201079e-03 1.18614941e-02 4.46900559e-03
 1.56593454e-02 1.95645176e-02 4.60941732e-03 1.93920725e-02
 9.11545363e-03 2.94980518e-02 1.21610237e-02 1.95490672e-02
 7.27320608e-03 1.49506658e-02 3.64072685e-03 1.97609880e-02
 7.03204705e-03 1.04563514e-02 4.26104047e-03 3.80084620e-02
 9.36565878e-03 2.58016592e-02 7.55002754e-03 3.72499116e-02
 


NMI: 0.938616957130216
Execution: 10 done.




In [24]:
best_index = np.argmax(scores)
print("Best NMI:", np.clip(scores[best_index], 0, 1))
print()
print("Clustzer Sizes - ")
for i in range(len(all_clusters[best_index])):
    print("Size of cluster " + str(i) + ":", len(all_clusters[best_index][i]) )
    
print("Best Cluster:\n\n", all_clusters[best_index])
print()
print("Best Cluster Centers:\n")
for i in range(len(centers[best_index])):
    print(centers[best_index][i])

Best NMI: 1.0

Clustzer Sizes - 
Size of cluster 0: 7
Size of cluster 1: 862
Size of cluster 2: 1685
Size of cluster 3: 0
Size of cluster 4: 1239
Size of cluster 5: 438
Size of cluster 6: 776
Size of cluster 7: 2848
Size of cluster 8: 0
Size of cluster 9: 34
Size of cluster 10: 5137
Best Cluster:

 [array([[0.0000e+00, 2.5100e-03, 4.5740e-02, 3.7600e-03, 5.0100e-03,
         0.0000e+00, 1.2500e-03, 0.0000e+00, 3.9470e-02, 2.5100e-03,
         5.8900e-02, 8.7700e-03, 3.7590e-02, 1.8800e-03, 4.4490e-02,
         4.3900e-03, 1.7540e-02, 1.8800e-03, 3.4460e-02, 1.2500e-03,
         1.6920e-02, 0.0000e+00, 4.5110e-02, 1.2500e-03, 2.5100e-03,
         3.3210e-02, 1.0650e-02, 6.5790e-02, 6.2700e-03, 1.6290e-02,
         6.3000e-04, 2.1300e-02, 6.3000e-04, 2.8200e-02, 6.2700e-03,
         3.8220e-02, 6.8900e-03, 5.5140e-02, 0.0000e+00, 2.6940e-02,
         2.5100e-03, 2.0680e-02, 6.3000e-04, 6.8920e-02, 5.6400e-03,
         2.1930e-02, 7.5200e-03, 3.1300e-03, 0.0000e+00, 3.5710e-02,
         8