# General k-means analysis
Created by Vanessa King
Initiated May 13, 2021

In [1]:
import math
import time
import random
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib.patches as mpatches
import numpy as np
from numpy import linalg
import scipy.io as sio
from sklearn.cluster import KMeans
from sklearn import metrics
from matplotlib.ticker import FormatStrFormatter


%matplotlib notebook

In [None]:
#Load data that was preprocessed in MATLAB and saved as a '.mat' MATLAB matrix file

def MATLAB_file_to_array(fileName, variableName):
    """Load one named variable from a MATLAB '.mat' file and return it.

    fileName: string with the full path and file name of the '.mat' file.
    variableName: string naming the workspace variable as it was in MATLAB.

    The shape of the loaded array is printed before it is returned.
    """
    loaded = sio.loadmat(fileName)[variableName]
    print("Shape of array: ", np.shape(loaded))
    return loaded

# Directory holding the '.mat' grids. Edit this single line to run the notebook
# from another machine or data location (trailing slash required).
DATA_DIR = '/Users/vanessa/Desktop/UBC/Lab/Iridates_Project/'

# Each call reads one workspace variable out of one '.mat' file.
data_18_77K = MATLAB_file_to_array(DATA_DIR + 'Grid006_Rh18_77K.mat', 'my_variable')
data_18_4K = MATLAB_file_to_array(DATA_DIR + 'Grid006_Rh18_4K.mat', 'dIdVN')
data_5 = MATLAB_file_to_array(DATA_DIR + 'raw_data.mat', 'raw_data')

In [None]:
def my_kmeans(file, n_clusters):
    """Run k-means on a 3-D grid of spectra and score the resulting fit.

    file: 3-D array. Its first two axes are flattened into samples and the
        third axis is kept as the features, giving (n_samples, n_features),
        ie: (75625, 81).
    n_clusters: number of clusters requested from KMeans.

    Returns (labels, centroids, score): the cluster label of every sample,
    the cluster centres, and the Calinski-Harabasz index of the fit. A high
    value signifies a good fit; the printed message labels it 'BCSS / WCSS'.
    """
    dims = np.shape(file)
    samples = np.reshape(file, (dims[0] * dims[1], dims[2]))

    # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    model = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=100,
        max_iter=100,
        tol=1e-17,
        random_state=0,
    )
    model.fit(samples)
    labels = model.labels_
    centroids = model.cluster_centers_

    # https://scikit-learn.org/stable/modules/clustering.html#calinski-harabasz-index
    score = metrics.calinski_harabasz_score(samples, labels)
    print('k-means complete. BCSS / WCSS = {:0.6e}'.format(score))

    return labels, centroids, score

# Cluster the data_5 grid into 4 groups; labels and centroids are passed to kmeans_Plot below.
labels, centroids, score = my_kmeans(data_5, 4)

In [None]:
def kmeans_Plot(grid, minV, maxV, labels, centroids):
    """Plot the centroid spectra next to the map of cluster assignments.

    grid: 3 dimensional data array; only its shape is used here
        (map rows, map columns, number of energy points).
    minV: starting voltage of the energy axis.
    maxV: final voltage of the energy axis.
    labels: labels output from kmeans, one per (row, column) pixel of grid.
    centroids: centroids output from kmeans, one spectrum per cluster.

    Any number of clusters is supported. The first four clusters keep the
    original fixed colours; further clusters take colours from matplotlib's
    'tab20' colormap. Each cluster uses the same colour in the line plot and
    in the map.
    """
    grid_shape = np.shape(grid)
    n_clusters = len(centroids)

    energy = np.linspace(minV, maxV, num=grid_shape[2])  # evenly spaced values between minV and maxV
    img_labels = np.reshape(labels, (grid_shape[0], grid_shape[1]))  # reformat labels for map layout

    # One colour per cluster: the four original colours first, then extras
    # drawn from 'tab20' when more than four clusters were fitted.
    base_colors = ['#EF0096', '#003C86', 'k', '#00DCB5']
    extra_cmap = plt.get_cmap('tab20')
    palette = base_colors[:n_clusters] + [
        colors.to_hex(extra_cmap(i % extra_cmap.N))
        for i in range(n_clusters - len(base_colors))
    ]

    # Plotting
    fig, (ax1, ax2) = plt.subplots(1, 2)
    # NOTE(review): hspace is the vertical gap between subplot rows; with a
    # single row this probably has no visible effect (wspace may have been
    # intended). Left unchanged so the figure layout stays as before.
    fig.subplots_adjust(hspace=0.3)

    for number, (centroid, color) in enumerate(zip(centroids, palette), start=1):
        ax1.plot(energy, centroid, label="Centroid {}".format(number),
                 marker='', linestyle='-', lw=2, color=color)

    ax1.set_ylabel("DOS")
    ax1.set_xlabel("Bias (eV)")
    ax1.set_title("Spectra of k-means centroids")
    ax1.legend()
    ax1.yaxis.set_ticks(np.arange(0.0, 1.51E-11, 0.5E-11))
    ax1.set_ylim(0.0, 1.50E-11)

    cmap = colors.ListedColormap(palette)
    # Pin the colour limits so label i always lands on palette[i], whatever
    # the number of clusters or the set of labels actually present.
    ax2.imshow(img_labels, cmap=cmap, vmin=-0.5, vmax=n_clusters - 0.5)
    ax2.set_title("Map of cluster assignments")
    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.show()
    
# Plot the data_5 clustering with the energy axis running from -0.8 to 0.8.
kmeans_Plot(data_5,-0.8,0.8,labels,centroids)

In [None]:
#Save the cluster assignments to file

# img_labels inside kmeans_Plot is local to that function, so the 2-D label map
# is rebuilt here from the labels of the data_5 clustering.
# NOTE(review): the file name says Rh18_4K / 3 clusters, but the labels in scope
# come from my_kmeans(data_5, 4) -- confirm the intended output name.
img_labels = np.reshape(labels, (np.shape(data_5)[0], np.shape(data_5)[1]))
np.savetxt("Rh18_4K_3cluster_labels.csv", img_labels, delimiter=',')