In [1]:
import numpy as np
import pandas as pd
import warnings
import copy
import random
from sklearn.metrics import accuracy_score
from scipy.stats import multivariate_normal
from scipy.special import logsumexp
warnings.filterwarnings("ignore")

# Read In Dataset

In [2]:
df = pd.read_csv("./codon_usage.csv")

In [3]:
df.head()

Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,100217,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.0005,0.00351,0.01203,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,100220,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,100755,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,100880,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,100887,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.0138,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13028 entries, 0 to 13027
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Kingdom      13028 non-null  object 
 1   DNAtype      13028 non-null  int64  
 2   SpeciesID    13028 non-null  int64  
 3   Ncodons      13028 non-null  int64  
 4   SpeciesName  13028 non-null  object 
 5   UUU          13028 non-null  object 
 6   UUC          13028 non-null  object 
 7   UUA          13028 non-null  float64
 8   UUG          13028 non-null  float64
 9   CUU          13028 non-null  float64
 10  CUC          13028 non-null  float64
 11  CUA          13028 non-null  float64
 12  CUG          13028 non-null  float64
 13  AUU          13028 non-null  float64
 14  AUC          13028 non-null  float64
 15  AUA          13028 non-null  float64
 16  AUG          13028 non-null  float64
 17  GUU          13028 non-null  float64
 18  GUC          13028 non-null  float64
 19  GUA 

# Drop Faulty Datapoints

In [5]:
# lines 488 and 5065 are index 486 and 5063
df = df.drop([486, 5063], axis = 0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13026 entries, 0 to 13027
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Kingdom      13026 non-null  object 
 1   DNAtype      13026 non-null  int64  
 2   SpeciesID    13026 non-null  int64  
 3   Ncodons      13026 non-null  int64  
 4   SpeciesName  13026 non-null  object 
 5   UUU          13026 non-null  object 
 6   UUC          13026 non-null  object 
 7   UUA          13026 non-null  float64
 8   UUG          13026 non-null  float64
 9   CUU          13026 non-null  float64
 10  CUC          13026 non-null  float64
 11  CUA          13026 non-null  float64
 12  CUG          13026 non-null  float64
 13  AUU          13026 non-null  float64
 14  AUC          13026 non-null  float64
 15  AUA          13026 non-null  float64
 16  AUG          13026 non-null  float64
 17  GUU          13026 non-null  float64
 18  GUC          13026 non-null  float64
 19  GUA 

# Drop unused features

In [7]:
# drop unused features
vals = [2, 3, 4, 5]
df = df.drop(df.columns[vals], axis = 1)

In [8]:
df.head()

Unnamed: 0,Kingdom,DNAtype,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,0.01203,0.0005,0.00351,0.01203,0.03208,0.001,0.0401,0.00551,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,0.01357,0.00068,0.00678,0.00407,0.02849,0.00204,0.0441,0.01153,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,0.0218,0.01357,0.01543,0.00782,0.01111,0.01028,0.01193,0.02283,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,0.02245,0.01619,0.00992,0.01567,0.01358,0.0094,0.01723,0.02402,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,0.01371,0.00767,0.03679,0.0138,0.00548,0.00473,0.02076,0.02716,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [9]:
len(df)

13026

In [10]:
y = df.iloc[:, 0]

In [11]:
x = df.iloc[:, 1:]

In [12]:
# 64 dim data points 
x

Unnamed: 0,DNAtype,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,AUC,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,0,0.01203,0.00050,0.00351,0.01203,0.03208,0.00100,0.04010,0.00551,0.02005,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.00050,0.00000
1,0,0.01357,0.00068,0.00678,0.00407,0.02849,0.00204,0.04410,0.01153,0.02510,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.01560,0.04410,0.00271,0.00068,0.00000
2,0,0.0218,0.01357,0.01543,0.00782,0.01111,0.01028,0.01193,0.02283,0.01604,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.00000,0.00144
3,0,0.02245,0.01619,0.00992,0.01567,0.01358,0.00940,0.01723,0.02402,0.02245,...,0.00366,0.01410,0.01671,0.03760,0.01932,0.03029,0.03446,0.00261,0.00157,0.00000
4,0,0.01371,0.00767,0.03679,0.01380,0.00548,0.00473,0.02076,0.02716,0.00867,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.00000,0.00044,0.00131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13023,0,0.03555,0.00547,0.01367,0.01276,0.02097,0.00820,0.03555,0.01459,0.03920,...,0.00820,0.01367,0.01094,0.01367,0.02279,0.02005,0.04102,0.00091,0.00091,0.00638
13024,1,0.03193,0.01984,0.00629,0.01451,0.05322,0.07644,0.01258,0.03096,0.06386,...,0.00145,0.00000,0.00048,0.00194,0.01306,0.01838,0.00677,0.00242,0.00097,0.01887
13025,1,0.03321,0.01661,0.00356,0.01127,0.05042,0.09609,0.01068,0.02728,0.06643,...,0.00000,0.00000,0.00000,0.00178,0.01661,0.02788,0.00297,0.00356,0.00119,0.02017
13026,0,0.02028,0.00767,0.01293,0.01319,0.01959,0.00715,0.03964,0.01600,0.02082,...,0.01142,0.01217,0.01196,0.02178,0.02510,0.02896,0.03959,0.00099,0.00079,0.00156


In [13]:
# ground truth for the clustering based off the kingdom
y

0        vrl
1        vrl
2        vrl
3        vrl
4        vrl
        ... 
13023    pri
13024    pri
13025    pri
13026    pri
13027    pri
Name: Kingdom, Length: 13026, dtype: object

In [14]:
x = x.to_numpy(dtype=float)

In [15]:
x

array([[0.000e+00, 1.203e-02, 5.000e-04, ..., 2.510e-03, 5.000e-04,
        0.000e+00],
       [0.000e+00, 1.357e-02, 6.800e-04, ..., 2.710e-03, 6.800e-04,
        0.000e+00],
       [0.000e+00, 2.180e-02, 1.357e-02, ..., 3.910e-03, 0.000e+00,
        1.440e-03],
       ...,
       [1.000e+00, 3.321e-02, 1.661e-02, ..., 3.560e-03, 1.190e-03,
        2.017e-02],
       [0.000e+00, 2.028e-02, 7.670e-03, ..., 9.900e-04, 7.900e-04,
        1.560e-03],
       [1.000e+00, 3.724e-02, 1.732e-02, ..., 1.560e-03, 1.140e-03,
        2.161e-02]])

In [16]:
y = y.to_numpy()

In [17]:
y

array(['vrl', 'vrl', 'vrl', ..., 'pri', 'pri', 'pri'], dtype=object)

# Get True Clusters

In [30]:
true_clusters = {}
for i in range(len(y)):
    if y[i] not in true_clusters.keys():
        true_clusters[y[i]] = []
        true_clusters[y[i]].append(x[i])
    else:
        true_clusters[y[i]].append(x[i])
    

In [35]:
for key in true_clusters.keys():
    print(len(true_clusters[key]))

2831
126
2919
220
18
2523
1345
2077
572
215
180


# Function for NMI Metric

In [63]:
def NMI(x, clusters, true_clusters):
    T = []
    for key in true_clusters.keys():
        T.append(np.array(true_clusters[key]))
    T = np.array(T)
    
    k = len(clusters)
    
    # get entropy of the predicted clustering
    summ = 0
    for i in range(k):
        pci = len(clusters[i]) / len(x)
        val = pci * np.log(pci)
        summ += val
    # H(C)
    pred_entropy = -1 * summ
    
    # get entropy of the true clustering
    summ = 0
    for i in range(k):
        pti = len(T[i]) / len(x)
        val = pti * np.log(pti)
        summ += val
    # H(T)
    true_entropy = -1 * summ
    
    # get the conditional entropy
    summ = 0
    for i in range(k):
        for j in range(k):
            # get the intersecion between the two clusters
            intersect = 0
            for k in range(len(clusters[i])):
                if clusters[i][k] in T[j]:
                    intersect += 1
            print(intersect)
            
            # get p_ij and p_ci
            pij = intersect / len(x)
            pci = len(clusters[i]) / len(x)
            val = pij * np.log(pij / pci)
            summ += val
    # H(T|C)
    conditional_entropy = -1 * summ
    
    # calculate I(C, T) = H(T) - H(T|C)
    mutual_information = true_entropy - conditional_entropy
    
    # Finally calculate the NMI
    nmi = (mutual_information) / np.sqrt(pred_entropy * true_entropy)
    return nmi
    
            
            

# Expectation-Maximization Algorithm

In [64]:
def EM(x, k, epsilon, true_clusters):
    t = 0
    # initialize
    # first randomly pick k points as the initial cluster centers
    means = []
    for i in range(k):
        val = np.random.randint(0, len(x)-1)
        means.append(x[val])
    means = np.array(means)
    
    # initialize k covariance matrices (d x d matrix)
    covs = []
    for i in range(k):
        c = np.identity(64)
        covs.append(c)
    covs = np.array(covs)
    
    # initialize P(Ci)
    P_Ci = []
    for i in range(k):
        val = 1 / k
        P_Ci.append(val)
    # get P(Ci) into log space
    P_Ci = np.array(P_Ci)
    P_Ci = np.log(P_Ci)
    
    # expectation step
    old_means = copy.deepcopy(means)
    
    while(True):
        
        old_means = copy.deepcopy(means)
        
        # pdf for all i in k
        pdf = np.zeros((len(x), k))
        for i in range(k):
            pdf[:, i] = multivariate_normal.logpdf(x, mean=means[i], cov=covs[i], allow_singular=True)
        
        # get logsumexp
        t1 = pdf + P_Ci
        lse = logsumexp(t1, axis = 1)
        
        
        # get the w matrix
        w = pdf + P_Ci
        for i in range(len(w)):
            w[i] = w[i] - lse[i]
        
        w = np.exp(w)
        
        # Maximization step
        # get new extimate for mean
        for i in range(k):
            denom = np.dot(w[:, i], np.ones((len(w),1)))
            num = np.dot(x.T, w[:, i])
            means[i] = num / denom
            
        # get new cov matrix estimates
        for i in range(k):
            # get centered points 
            x_center = x - means[i]
            # iteratively sum outer products
            new_cov = np.zeros((64, 64))
            for j in range(len(x)):
                outer_prod = np.outer(x_center[j], x_center[j])
                outer_prod = outer_prod * w[j][i]
                new_cov = new_cov + outer_prod
            # divide by the sum of weights
            denom = np.dot(w[:, i], np.ones((len(w),1)))
            
            new_cov = new_cov / denom
            # save the new matrix
            covs[i] = new_cov
        
        # get new prior probabilities
        for i in range(k):
            num = np.dot(w[:, i], np.ones((len(w),1)))
            P_Ci[i] = num / len(x)
        P_Ci = np.log(P_Ci)
        
        t += 1    
        # check break condition
        summ = 0
        for i in range(k):
            diff = means[i] - old_means[i]
            norm2 = np.linalg.norm(diff) ** 2
            summ += norm2
        
        if summ <= epsilon:
            break
    
    # put each point in a cluster
    clusters = []
    for i in range(k):
        clusters.append([])
    for i in range(len(x)):
        point = x[i]
        norms = []
        for j in range(k):
            normal = np.linalg.norm(x[i] - means[j])
            norms.append(normal)
        index = np.argmin(norms)
        clusters[index].append(point)
    
    for i in range(len(clusters)):
        clusters[i] = np.array(clusters[i])
    clusters = np.array(clusters)
    
    for i in range(k):
        print("Size of cluster " + str(i) + ":", len(clusters[i]))
        
    for i in range(k):
        print()
        print("Final Mean for cluster " + str(i) + ":")
        print(means[i])
        print()
        print("Final Covariance Matrix for cluster " + str(i) + ":")
        print(covs[i])
    
    nmi = NMI(x, clusters, true_clusters)
    print()
    print("NMI:")

In [65]:
EM(x, 11, 1e-2, true_clusters)

Size of cluster 0: 2242
Size of cluster 1: 1297
Size of cluster 2: 907
Size of cluster 3: 1976
Size of cluster 4: 0
Size of cluster 5: 1132
Size of cluster 6: 2340
Size of cluster 7: 1992
Size of cluster 8: 278
Size of cluster 9: 862
Size of cluster 10: 0

Final Mean for cluster 0:
[0.01139148 0.01581781 0.02498468 0.01719943 0.01677185 0.00840761
 0.01059375 0.00935558 0.03075287 0.01501784 0.02039529 0.02382842
 0.0242536  0.010363   0.01637931 0.013809   0.02195939 0.01055963
 0.0214686  0.00679249 0.01455608 0.00695653 0.01651872 0.00540188
 0.01204369 0.02102385 0.01002958 0.01830723 0.00936751 0.01730841
 0.00845716 0.01750592 0.00589006 0.01482741 0.00906643 0.01960343
 0.01022574 0.02249742 0.00765191 0.02584089 0.01319039 0.02447042
 0.01440148 0.035067   0.01886403 0.0111498  0.00679762 0.01493263
 0.00891006 0.04209455 0.02519821 0.00930321 0.00461663 0.00548316
 0.00319236 0.01638521 0.00950186 0.03697415 0.01741071 0.03785403
 0.02083667 0.00170707 0.00055475 0.00078289]



2242


In [20]:
co = np.cov(x, rowvar=False, bias=True)

In [21]:
# multivariate_normal.logpdf(x, mean=m, cov=co).shape

In [55]:
z  =np.array([[1, 2], [3, 4], [5, 6]])
zz =np.array([[1, 2], [7, 8], [5, 6]])

In [56]:
for i in range(len(z)):
    if z[i] in zz:
        print("YES")
    else:
        print("NO")

YES
NO
YES
