In [1]:
import numpy as np
import pandas as pd
import warnings
import copy
import random
from scipy.stats import multivariate_normal
from scipy.special import logsumexp
warnings.filterwarnings("ignore")

# Read In Dataset

In [2]:
df = pd.read_csv("./codon_usage.csv")

In [3]:
df.head()

Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,100217,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.0005,0.00351,0.01203,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,100220,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,100755,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,100880,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,100887,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.0138,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13028 entries, 0 to 13027
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Kingdom      13028 non-null  object 
 1   DNAtype      13028 non-null  int64  
 2   SpeciesID    13028 non-null  int64  
 3   Ncodons      13028 non-null  int64  
 4   SpeciesName  13028 non-null  object 
 5   UUU          13028 non-null  object 
 6   UUC          13028 non-null  object 
 7   UUA          13028 non-null  float64
 8   UUG          13028 non-null  float64
 9   CUU          13028 non-null  float64
 10  CUC          13028 non-null  float64
 11  CUA          13028 non-null  float64
 12  CUG          13028 non-null  float64
 13  AUU          13028 non-null  float64
 14  AUC          13028 non-null  float64
 15  AUA          13028 non-null  float64
 16  AUG          13028 non-null  float64
 17  GUU          13028 non-null  float64
 18  GUC          13028 non-null  float64
 19  GUA 

# Drop Faulty Datapoints

In [5]:
# lines 488 and 5065 are index 486 and 5063
df = df.drop([486, 5063], axis = 0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13026 entries, 0 to 13027
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Kingdom      13026 non-null  object 
 1   DNAtype      13026 non-null  int64  
 2   SpeciesID    13026 non-null  int64  
 3   Ncodons      13026 non-null  int64  
 4   SpeciesName  13026 non-null  object 
 5   UUU          13026 non-null  object 
 6   UUC          13026 non-null  object 
 7   UUA          13026 non-null  float64
 8   UUG          13026 non-null  float64
 9   CUU          13026 non-null  float64
 10  CUC          13026 non-null  float64
 11  CUA          13026 non-null  float64
 12  CUG          13026 non-null  float64
 13  AUU          13026 non-null  float64
 14  AUC          13026 non-null  float64
 15  AUA          13026 non-null  float64
 16  AUG          13026 non-null  float64
 17  GUU          13026 non-null  float64
 18  GUC          13026 non-null  float64
 19  GUA 

# Drop unused features

In [7]:
# drop unused features
vals = [2, 3, 4, 5]
df = df.drop(df.columns[vals], axis = 1)

In [8]:
df.head()

Unnamed: 0,Kingdom,DNAtype,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,0.01203,0.0005,0.00351,0.01203,0.03208,0.001,0.0401,0.00551,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,0.01357,0.00068,0.00678,0.00407,0.02849,0.00204,0.0441,0.01153,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,0.0218,0.01357,0.01543,0.00782,0.01111,0.01028,0.01193,0.02283,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,0.02245,0.01619,0.00992,0.01567,0.01358,0.0094,0.01723,0.02402,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,0.01371,0.00767,0.03679,0.0138,0.00548,0.00473,0.02076,0.02716,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [9]:
len(df)

13026

In [10]:
y = df.iloc[:, 0]

In [11]:
x = df.iloc[:, 1:]

In [12]:
# 64 dim data points 
x

Unnamed: 0,DNAtype,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,0,0.01203,0.00050,0.00351,0.01203,0.03208,0.00100,0.04010,0.00551,0.02005,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.00050,0.00000
1,0,0.01357,0.00068,0.00678,0.00407,0.02849,0.00204,0.04410,0.01153,0.02510,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.01560,0.04410,0.00271,0.00068,0.00000
2,0,0.0218,0.01357,0.01543,0.00782,0.01111,0.01028,0.01193,0.02283,0.01604,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.00000,0.00144
3,0,0.02245,0.01619,0.00992,0.01567,0.01358,0.00940,0.01723,0.02402,0.02245,...,0.00366,0.01410,0.01671,0.03760,0.01932,0.03029,0.03446,0.00261,0.00157,0.00000
4,0,0.01371,0.00767,0.03679,0.01380,0.00548,0.00473,0.02076,0.02716,0.00867,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.00000,0.00044,0.00131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13023,0,0.03555,0.00547,0.01367,0.01276,0.02097,0.00820,0.03555,0.01459,0.03920,...,0.00820,0.01367,0.01094,0.01367,0.02279,0.02005,0.04102,0.00091,0.00091,0.00638
13024,1,0.03193,0.01984,0.00629,0.01451,0.05322,0.07644,0.01258,0.03096,0.06386,...,0.00145,0.00000,0.00048,0.00194,0.01306,0.01838,0.00677,0.00242,0.00097,0.01887
13025,1,0.03321,0.01661,0.00356,0.01127,0.05042,0.09609,0.01068,0.02728,0.06643,...,0.00000,0.00000,0.00000,0.00178,0.01661,0.02788,0.00297,0.00356,0.00119,0.02017
13026,0,0.02028,0.00767,0.01293,0.01319,0.01959,0.00715,0.03964,0.01600,0.02082,...,0.01142,0.01217,0.01196,0.02178,0.02510,0.02896,0.03959,0.00099,0.00079,0.00156


In [13]:
# ground truth for the clustering based off the kingdom
y

0        vrl
1        vrl
2        vrl
3        vrl
4        vrl
        ... 
13023    pri
13024    pri
13025    pri
13026    pri
13027    pri
Name: Kingdom, Length: 13026, dtype: object

In [14]:
x = x.to_numpy(dtype=float)

In [15]:
x

array([[0.000e+00, 1.203e-02, 5.000e-04, ..., 2.510e-03, 5.000e-04,
        0.000e+00],
       [0.000e+00, 1.357e-02, 6.800e-04, ..., 2.710e-03, 6.800e-04,
        0.000e+00],
       [0.000e+00, 2.180e-02, 1.357e-02, ..., 3.910e-03, 0.000e+00,
        1.440e-03],
       ...,
       [1.000e+00, 3.321e-02, 1.661e-02, ..., 3.560e-03, 1.190e-03,
        2.017e-02],
       [0.000e+00, 2.028e-02, 7.670e-03, ..., 9.900e-04, 7.900e-04,
        1.560e-03],
       [1.000e+00, 3.724e-02, 1.732e-02, ..., 1.560e-03, 1.140e-03,
        2.161e-02]])

In [16]:
y = y.to_numpy()

In [17]:
y

array(['vrl', 'vrl', 'vrl', ..., 'pri', 'pri', 'pri'], dtype=object)

# Get True Clusters

In [18]:
true_clusters = {}
for i in range(len(y)):
    if y[i] not in true_clusters.keys():
        true_clusters[y[i]] = []
        true_clusters[y[i]].append(x[i])
    else:
        true_clusters[y[i]].append(x[i])
    

In [19]:
for key in true_clusters.keys():
    print(len(true_clusters[key]))

2831
126
2919
220
18
2523
1345
2077
572
215
180


# Function for NMI Metric

In [20]:
def NMI(x, clusters, true_clusters):
    T = []
    for key in true_clusters.keys():
        T.append(np.array(true_clusters[key]))
    T = np.array(T)
    
    k = len(clusters)
    
    # get entropy of the predicted clustering
    summ = 0
    for i in range(k):
        pci = len(clusters[i]) / len(x) + 1e-6
        val = pci * np.log(pci)
        summ += val
    # H(C)
    pred_entropy = -1 * summ
    
    # get entropy of the true clustering
    summ = 0
    for i in range(k):
        pti = len(T[i]) / len(x) + 1e-6
        val = pti * np.log(pti)
        summ += val
    # H(T)
    true_entropy = -1 * summ
    
    # get the conditional entropy
    summ = 0
    for i in range(k):
        for j in range(k):
            # get the intersecion between the two clusters
            intersect = 0
            for a in range(len(clusters[i])):
                if clusters[i][a] in T[j]:
                    intersect += 1
            
            
            # get p_ij and p_ci
            pij = intersect / len(x) + 1e-6
            pci = len(clusters[i]) / len(x) + 1e-6
            val = pij * np.log(pij / pci)
            summ += val
    # H(T|C)
    conditional_entropy = -1 * summ
    
    # calculate I(C, T) = H(T) - H(T|C)
    mutual_information = true_entropy - conditional_entropy
    
    # Finally calculate the NMI
    nmi = (mutual_information) / np.sqrt(pred_entropy * true_entropy)
    return nmi
    
            
            

# Expectation-Maximization Algorithm

In [21]:
def EM(x, k, epsilon, true_clusters):
    t = 0
    # initialize
    # first randomly pick k points as the initial cluster centers
    means = []
    for i in range(k):
        val = np.random.randint(0, len(x)-1)
        means.append(x[val])
    means = np.array(means)
    
    # initialize k covariance matrices (d x d matrix)
    covs = []
    for i in range(k):
        c = np.identity(64)
        covs.append(c)
    covs = np.array(covs)
    
    # initialize P(Ci)
    P_Ci = []
    for i in range(k):
        val = 1 / k
        P_Ci.append(val)
    # get P(Ci) into log space
    P_Ci = np.array(P_Ci)
    P_Ci = np.log(P_Ci)
    
    # expectation step
    old_means = copy.deepcopy(means)
    
    while(True):
        
        old_means = copy.deepcopy(means)
        
        # pdf for all i in k
        pdf = np.zeros((len(x), k))
        for i in range(k):
            pdf[:, i] = multivariate_normal.logpdf(x, mean=means[i], cov=covs[i], allow_singular=True)
        
        # get logsumexp
        t1 = pdf + P_Ci
        lse = logsumexp(t1, axis = 1)
        
        
        # get the w matrix
        w = pdf + P_Ci
        for i in range(len(w)):
            w[i] = w[i] - lse[i]
        
        w = np.exp(w)
        
        # Maximization step
        # get new extimate for mean
        for i in range(k):
            denom = np.dot(w[:, i], np.ones((len(w),1)))
            num = np.dot(x.T, w[:, i])
            means[i] = num / denom
            
        # get new cov matrix estimates
        for i in range(k):
            # get centered points 
            x_center = x - means[i]
            # iteratively sum outer products
            new_cov = np.zeros((64, 64))
            for j in range(len(x)):
                outer_prod = np.outer(x_center[j], x_center[j])
                outer_prod = outer_prod * w[j][i]
                new_cov = new_cov + outer_prod
            # divide by the sum of weights
            denom = np.dot(w[:, i], np.ones((len(w),1)))
            
            new_cov = new_cov / denom
            # save the new matrix
            covs[i] = new_cov
        
        # get new prior probabilities
        for i in range(k):
            num = np.dot(w[:, i], np.ones((len(w),1)))
            P_Ci[i] = num / len(x)
        P_Ci = np.log(P_Ci)
        
        t += 1    
        # check break condition
        summ = 0
        for i in range(k):
            diff = means[i] - old_means[i]
            norm2 = np.linalg.norm(diff) ** 2
            summ += norm2
        
        if summ <= epsilon:
            break
    
    # put each point in a cluster
    clusters = []
    for i in range(k):
        clusters.append([])
    for i in range(len(x)):
        point = x[i]
        norms = []
        for j in range(k):
            normal = np.linalg.norm(x[i] - means[j])
            norms.append(normal)
        index = np.argmin(norms)
        clusters[index].append(point)
    
    for i in range(len(clusters)):
        clusters[i] = np.array(clusters[i])
    clusters = np.array(clusters)
    
    for i in range(k):
        print("Size of cluster " + str(i) + ":", len(clusters[i]))
        
    for i in range(k):
        print()
        print("Final Mean for cluster " + str(i) + ":")
        print(means[i])
        print()
        print("Final Covariance Matrix for cluster " + str(i) + ":")
        print(covs[i])
    
    nmi = NMI(x, clusters, true_clusters)
    print()
    print("NMI:", nmi)
    
    return nmi, clusters, means

# Algorithm Execution

### First run the function once to show the output of the EM algorithm

In [22]:
v1, v2, v3 = EM(x, 11, 1e-2, true_clusters)

Size of cluster 0: 1488
Size of cluster 1: 2899
Size of cluster 2: 1276
Size of cluster 3: 1933
Size of cluster 4: 473
Size of cluster 5: 275
Size of cluster 6: 862
Size of cluster 7: 169
Size of cluster 8: 1660
Size of cluster 9: 670
Size of cluster 10: 1321

Final Mean for cluster 0:
[0.         0.02188563 0.00977431 0.01700487 0.01525836 0.01662118
 0.00806421 0.02419259 0.02079234 0.02308432 0.01015079 0.02521716
 0.01799246 0.01758595 0.0099006  0.02248866 0.02110265 0.02374861
 0.01863142 0.0147827  0.01329835 0.01249304 0.01400627 0.01092373
 0.01408795 0.01812249 0.0206152  0.01701858 0.01241105 0.01294476
 0.01347802 0.01212558 0.00889956 0.01024153 0.01389269 0.01417759
 0.01882605 0.01453249 0.0107604  0.01508107 0.01750281 0.01721797
 0.02198386 0.01971663 0.02329742 0.007846   0.00955081 0.01157138
 0.01247524 0.02553645 0.02867492 0.00959845 0.01185998 0.0064348
 0.00686382 0.01050959 0.00943189 0.02773399 0.02514841 0.0292608
 0.0295925  0.00118252 0.0005515  0.00098737]


NMI: 0.893650654755784


(0.893650654755784,
 array([array([[0.     , 0.00656, 0.01324, ..., 0.00135, 0.0002 , 0.00036],
               [0.     , 0.01982, 0.00845, ..., 0.00029, 0.00029, 0.00117],
               [0.     , 0.01772, 0.00682, ..., 0.00136, 0.     , 0.00068],
               ...,
               [0.     , 0.02236, 0.00887, ..., 0.00149, 0.00068, 0.0015 ],
               [0.     , 0.03555, 0.00547, ..., 0.00091, 0.00091, 0.00638],
               [0.     , 0.02028, 0.00767, ..., 0.00099, 0.00079, 0.00156]]),
        array([[1.000e+00, 2.657e-02, 7.155e-02, ..., 2.350e-03, 1.771e-02,
                1.660e-03],
               [1.000e+00, 5.568e-02, 8.868e-02, ..., 3.350e-03, 2.600e-04,
                0.000e+00],
               [1.000e+00, 1.784e-02, 1.031e-01, ..., 1.980e-03, 5.700e-04,
                1.204e-02],
               ...,
               [1.000e+00, 3.193e-02, 1.984e-02, ..., 2.420e-03, 9.700e-04,
                1.887e-02],
               [1.000e+00, 3.321e-02, 1.661e-02, ..., 3.560e-03, 1

### Run the algorithm 10 times and get the means with the best NMI score

In [23]:
scores = []
all_clusters = []
centers = []
for i in range(10):
    score, cluster, center = EM(x, 11, 1e-2, true_clusters)
    scores.append(score)
    all_clusters.append(cluster)
    centers.append(center)
    print("Execution:", i+1, "done.")


Size of cluster 0: 862
Size of cluster 1: 8
Size of cluster 2: 103
Size of cluster 3: 2151
Size of cluster 4: 2361
Size of cluster 5: 645
Size of cluster 6: 18
Size of cluster 7: 1595
Size of cluster 8: 2945
Size of cluster 9: 0
Size of cluster 10: 2338

Final Mean for cluster 0:
[1.74314037e+00 1.91135148e-02 5.26256486e-02 2.07862769e-02
 2.22339016e-02 4.76462507e-03 1.50007457e-02 5.85632009e-03
 5.07665622e-02 1.39868197e-02 3.07047026e-02 1.85649057e-02
 2.07365126e-02 5.36275825e-03 2.00040187e-02 7.04014076e-03
 2.22532860e-02 6.08362109e-03 1.28885154e-02 3.88961298e-03
 1.51773256e-02 5.67977172e-03 1.16851436e-02 3.64454778e-03
 1.25783527e-02 1.90925493e-02 4.05601310e-03 2.05282667e-02
 8.87640255e-03 2.90266371e-02 1.00764074e-02 2.06741334e-02
 5.96279742e-03 1.47919652e-02 3.29912368e-03 2.01611160e-02
 6.25443665e-03 1.25817097e-02 3.43826565e-03 3.87088103e-02
 8.78107367e-03 2.36141873e-02 6.14274511e-03 4.03751993e-02
 1.03606538e-02 9.54434594e-03 2.68197997e-03 2.


NMI: 0.9599217768761427
Execution: 1 done.
Size of cluster 0: 0
Size of cluster 1: 0
Size of cluster 2: 3031
Size of cluster 3: 0
Size of cluster 4: 1066
Size of cluster 5: 862
Size of cluster 6: 1658
Size of cluster 7: 2634
Size of cluster 8: 2899
Size of cluster 9: 876
Size of cluster 10: 0

Final Mean for cluster 0:
[0.567749   0.01991085 0.03699659 0.01506335 0.02698371 0.01920588
 0.03012096 0.01234749 0.03773202 0.0214863  0.0326196  0.02070607
 0.02059711 0.00976797 0.01975832 0.01170243 0.02070239 0.02207615
 0.02156434 0.00622724 0.01465181 0.0120448  0.01644297 0.00526633
 0.01008744 0.01454596 0.01197951 0.01980727 0.01336542 0.0161122
 0.01279714 0.01590792 0.00446596 0.01060921 0.00952703 0.0191566
 0.0199893  0.02205983 0.00529768 0.02069704 0.01314334 0.02039035
 0.00953726 0.02448932 0.0165744  0.0060795  0.00462948 0.01049903
 0.00924456 0.0337711  0.01343136 0.00437865 0.00354208 0.00587468
 0.00286762 0.01133528 0.00666613 0.0199912  0.01079236 0.02826817
 0.0125774


NMI: 0.9761779028888768
Execution: 2 done.
Size of cluster 0: 1685
Size of cluster 1: 862
Size of cluster 2: 998
Size of cluster 3: 1440
Size of cluster 4: 2187
Size of cluster 5: 387
Size of cluster 6: 0
Size of cluster 7: 538
Size of cluster 8: 2030
Size of cluster 9: 2899
Size of cluster 10: 0

Final Mean for cluster 0:
[0.00779535 0.02365728 0.00808695 0.01681686 0.01599504 0.01850759
 0.00882452 0.02318685 0.01863911 0.02281028 0.01125311 0.02574673
 0.01780099 0.01804319 0.00935159 0.02375641 0.02218488 0.02250258
 0.018382   0.01052326 0.01490985 0.01369375 0.01596252 0.00851209
 0.01489435 0.01688273 0.01816021 0.0194093  0.0133997  0.0142453
 0.01492212 0.01303732 0.00751045 0.01054721 0.01376841 0.01543059
 0.01871786 0.01644405 0.00914726 0.01363889 0.01884516 0.01583206
 0.0210059  0.0184669  0.02341717 0.00924177 0.01126137 0.01105227
 0.01298385 0.02390706 0.03079537 0.00746859 0.00875935 0.00626345
 0.00615548 0.01283387 0.01219278 0.02492504 0.02567796 0.02689747
 0.03


NMI: 0.9224212295124092
Execution: 3 done.
Size of cluster 0: 1113
Size of cluster 1: 0
Size of cluster 2: 1326
Size of cluster 3: 0
Size of cluster 4: 846
Size of cluster 5: 862
Size of cluster 6: 4214
Size of cluster 7: 0
Size of cluster 8: 2420
Size of cluster 9: 1518
Size of cluster 10: 727

Final Mean for cluster 0:
[0.         0.02881755 0.00298333 0.01073627 0.0106522  0.02614821
 0.00413365 0.0327202  0.01193133 0.03042586 0.00424102 0.02321612
 0.01151169 0.02631753 0.00474863 0.0261843  0.01705727 0.03821504
 0.01175688 0.02187554 0.01028742 0.01919043 0.01029362 0.01689177
 0.01503664 0.01626525 0.03607075 0.01340226 0.01262544 0.00943016
 0.01788234 0.00652944 0.01300083 0.00596248 0.01681535 0.01004319
 0.02741747 0.00762374 0.01410359 0.00875538 0.0227985  0.01358113
 0.02849111 0.01041994 0.0297463  0.00512902 0.0133485  0.00760339
 0.0163404  0.01220665 0.03370298 0.00792216 0.019958   0.00457179
 0.00941713 0.00504781 0.00720599 0.01783629 0.03547361 0.01812044
 0.035


NMI: 0.9597544786990011
Execution: 4 done.
Size of cluster 0: 3316
Size of cluster 1: 2959
Size of cluster 2: 0
Size of cluster 3: 1707
Size of cluster 4: 23
Size of cluster 5: 862
Size of cluster 6: 1163
Size of cluster 7: 1192
Size of cluster 8: 1804
Size of cluster 9: 0
Size of cluster 10: 0

Final Mean for cluster 0:
[0.00063333 0.02292879 0.00882935 0.01503459 0.01366881 0.01797731
 0.00719748 0.02846625 0.01843455 0.024202   0.00954406 0.02468799
 0.01634177 0.01988695 0.00928152 0.02388542 0.01834188 0.02853204
 0.0173493  0.01891393 0.01145626 0.01325711 0.0130237  0.01361612
 0.0140685  0.01669364 0.02433725 0.01503916 0.01263718 0.01118519
 0.01318572 0.01133183 0.01014292 0.00934776 0.01408819 0.01248303
 0.02006955 0.01346815 0.01195177 0.01471786 0.01720617 0.01601667
 0.02283441 0.01882503 0.02261569 0.00754977 0.00960034 0.0116583
 0.0138129  0.02252337 0.02773216 0.00952809 0.01518826 0.00611435
 0.00911221 0.00988575 0.00939287 0.02560407 0.02757254 0.02689418
 0.0304


NMI: 0.96967150881765
Execution: 5 done.
Size of cluster 0: 684
Size of cluster 1: 830
Size of cluster 2: 248
Size of cluster 3: 1128
Size of cluster 4: 692
Size of cluster 5: 711
Size of cluster 6: 1384
Size of cluster 7: 1642
Size of cluster 8: 900
Size of cluster 9: 1046
Size of cluster 10: 3761

Final Mean for cluster 0:
[7.78564332e-274 2.86335897e-002 3.17094843e-003 1.07967812e-002
 1.04995057e-002 2.72106869e-002 4.15239370e-003 3.50037152e-002
 1.17834260e-002 3.00746759e-002 4.95115603e-003 2.28397460e-002
 1.15059237e-002 2.73552772e-002 5.34100028e-003 2.76116356e-002
 1.60665929e-002 4.14272251e-002 1.20816534e-002 2.41311388e-002
 9.65304586e-003 1.97041369e-002 8.95364964e-003 1.80498652e-002
 1.45848996e-002 1.50188630e-002 3.64158534e-002 1.20203359e-002
 1.29034645e-002 8.24175498e-003 1.72224490e-002 6.05870052e-003
 1.28544499e-002 5.47465400e-003 1.61965894e-002 9.26531927e-003
 2.76850637e-002 7.81934548e-003 1.45791672e-002 8.39708546e-003
 2.24344594e-002 9.458


NMI: 0.8935367194741988
Execution: 6 done.
Size of cluster 0: 1488
Size of cluster 1: 500
Size of cluster 2: 729
Size of cluster 3: 655
Size of cluster 4: 680
Size of cluster 5: 3761
Size of cluster 6: 454
Size of cluster 7: 1547
Size of cluster 8: 1153
Size of cluster 9: 516
Size of cluster 10: 1543

Final Mean for cluster 0:
[3.41976625e-58 1.91049709e-02 1.56446146e-02 1.78837273e-02
 1.67611522e-02 1.30737317e-02 9.86511196e-03 1.55681248e-02
 2.47552806e-02 1.85780211e-02 1.49031686e-02 2.47920867e-02
 2.13245513e-02 1.40361794e-02 1.27273768e-02 1.86079879e-02
 2.21300970e-02 1.65693206e-02 1.99763137e-02 9.92681295e-03
 1.45366554e-02 1.01064385e-02 1.61982433e-02 7.52071158e-03
 1.37182051e-02 1.91398460e-02 1.44260495e-02 1.88691995e-02
 1.15953449e-02 1.53680279e-02 1.11190540e-02 1.50910161e-02
 7.34150841e-03 1.24100041e-02 1.14035333e-02 1.72411512e-02
 1.44819827e-02 1.84076204e-02 9.02992714e-03 1.95106781e-02
 1.57074188e-02 2.13740273e-02 1.78815010e-02 2.64188783e-02


NMI: 0.8965261217316345
Execution: 7 done.
Size of cluster 0: 1948
Size of cluster 1: 862
Size of cluster 2: 1404
Size of cluster 3: 948
Size of cluster 4: 181
Size of cluster 5: 2208
Size of cluster 6: 1575
Size of cluster 7: 2899
Size of cluster 8: 1001
Size of cluster 9: 0
Size of cluster 10: 0

Final Mean for cluster 0:
[1.43359038e-23 2.11777863e-02 1.15231660e-02 1.72000319e-02
 1.56183500e-02 1.49983013e-02 9.08169375e-03 1.96712570e-02
 2.13586722e-02 2.04268639e-02 1.29125114e-02 2.53335592e-02
 1.91892917e-02 1.56012958e-02 1.09315550e-02 2.11518329e-02
 2.04120990e-02 1.91855055e-02 1.90127334e-02 1.12905090e-02
 1.38763442e-02 1.20446550e-02 1.64311261e-02 8.89658433e-03
 1.40150239e-02 1.71840971e-02 1.57762119e-02 1.79574069e-02
 1.27553928e-02 1.40944928e-02 1.28993134e-02 1.43686192e-02
 8.43653166e-03 1.16057036e-02 1.29939195e-02 1.52831616e-02
 1.61568556e-02 1.67745101e-02 1.02064303e-02 1.72571911e-02
 1.68930482e-02 1.92074620e-02 2.06335456e-02 2.32840782e-02
 2


NMI: 0.9229524794260684
Execution: 8 done.
Size of cluster 0: 283
Size of cluster 1: 2062
Size of cluster 2: 672
Size of cluster 3: 2221
Size of cluster 4: 1347
Size of cluster 5: 2899
Size of cluster 6: 641
Size of cluster 7: 862
Size of cluster 8: 510
Size of cluster 9: 961
Size of cluster 10: 568

Final Mean for cluster 0:
[0.         0.0185265  0.01737991 0.01552514 0.01658595 0.01114394
 0.00828092 0.01214247 0.02506692 0.01807864 0.01417014 0.0237521
 0.02225102 0.01261358 0.01499115 0.01466918 0.02611576 0.01570614
 0.02465231 0.0085303  0.01470266 0.00772434 0.01785777 0.00583456
 0.01210856 0.02395444 0.01419285 0.02546003 0.00884394 0.01643727
 0.01027068 0.01619885 0.00549902 0.01262681 0.01047917 0.02086557
 0.01476881 0.02218071 0.00738143 0.01915341 0.01536947 0.02237983
 0.01502956 0.02963104 0.02313164 0.01113828 0.0093887  0.01142311
 0.00858387 0.03784116 0.0276793  0.00806892 0.00497011 0.00487696
 0.00297459 0.01519377 0.00720386 0.03325886 0.01988426 0.03827803
 0


NMI: 0.892799510633857
Execution: 9 done.
Size of cluster 0: 1441
Size of cluster 1: 2317
Size of cluster 2: 320
Size of cluster 3: 862
Size of cluster 4: 0
Size of cluster 5: 0
Size of cluster 6: 2899
Size of cluster 7: 6
Size of cluster 8: 1760
Size of cluster 9: 2252
Size of cluster 10: 1169

Final Mean for cluster 0:
[1.03530168e-235 2.95803748e-002 2.43938722e-003 1.05088933e-002
 9.93978151e-003 2.62135636e-002 3.34846256e-003 3.49834384e-002
 1.22675138e-002 3.18489387e-002 3.63673178e-003 2.34834182e-002
 1.14513164e-002 2.75731460e-002 4.42521549e-003 2.70841232e-002
 1.73604381e-002 4.04177825e-002 1.12026407e-002 2.30733834e-002
 9.05129062e-003 1.87501667e-002 7.82243973e-003 1.73071007e-002
 1.48988458e-002 1.67299163e-002 3.85107184e-002 1.12012414e-002
 1.13082681e-002 8.58385256e-003 1.79566225e-002 5.40827181e-003
 1.37291945e-002 5.46784648e-003 1.73524101e-002 9.69523121e-003
 2.84575097e-002 7.08050571e-003 1.48391745e-002 8.25090695e-003
 2.32421914e-002 8.8440820


NMI: 0.9445588704418685
Execution: 10 done.


In [24]:
best_index = np.argmax(scores)
print("Best NMI:", scores[best_index])
print()
print("Best Cluster:\n\n", all_clusters[best_index])
print()
print("Best Cluster Centers:\n")
for i in range(len(centers[best_index])):
    print(centers[best_index][i])

Best NMI: 0.9761779028888768

Best Cluster:
 [array([], dtype=float64) array([], dtype=float64)
 array([[0.     , 0.0218 , 0.01357, ..., 0.00391, 0.     , 0.00144],
        [0.     , 0.02245, 0.01619, ..., 0.00261, 0.00157, 0.     ],
        [0.     , 0.01371, 0.00767, ..., 0.     , 0.00044, 0.00131],
        ...,
        [0.     , 0.01302, 0.01576, ..., 0.     , 0.00069, 0.00069],
        [0.     , 0.01314, 0.01729, ..., 0.     , 0.00069, 0.00069],
        [0.     , 0.02821, 0.01727, ..., 0.00058, 0.00115, 0.00115]])
 array([], dtype=float64)
 array([[0.     , 0.02064, 0.02611, ..., 0.00037, 0.00037, 0.00219],
        [0.     , 0.02203, 0.02068, ..., 0.0009 , 0.     , 0.00045],
        [0.     , 0.0079 , 0.01083, ..., 0.00217, 0.00054, 0.00011],
        ...,
        [0.     , 0.0101 , 0.00275, ..., 0.00184, 0.00184, 0.     ],
        [0.     , 0.03272, 0.01674, ..., 0.01142, 0.     , 0.     ],
        [0.     , 0.00944, 0.0181 , ..., 0.     , 0.00039, 0.00079]])
 array([[6.0000e+00, 2