### All standard imports for the task
Here you can find all the modules I have used for the task. In the cells where a module is applied, its import is repeated as a comment.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import Pipeline
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_samples, silhouette_score
# Global plotting theme for the notebook.
# sns.set() installs seaborn's default theme and resets font sizes, style and
# palette, so it has to run FIRST; when it ran last it silently discarded the
# customisations made on the lines below.
sns.set()
plt.rc('font', size = 11)  # base font size for all figures
sns.set_style('darkgrid')
sns.set_palette(sns.color_palette('Paired'))
sns.palplot(sns.color_palette('Paired'))  # preview of the selected palette

### Preprocessing the raw dataset
The raw data does not hold separate label information, and each class has 3 replicates, so the class labels were assigned manually.

In [None]:
# Read the expression table from the Excel workbook.
dataset_read = pd.read_excel('pilot_experiment_TPM_WTonly.xlsx')
# Transpose the table so that the original columns become rows.
unprocessed_dataset = dataset_read.T
# The column headers are kept as a list; the class labels are derived from them.
y = dataset_read.columns.tolist()

In [None]:
# Lookup table: column name without its last two characters (sliced off with
# [:-2]; presumably the replicate suffix) -> (numeric class label 0-9, short
# name used to annotate plots).
condition_labels = {
    'Ox_Leaf1_T1': (0, 'L1T1'),
    'Ox_Leaf1_T2': (1, 'L1T2'),
    'Ox_Leaf1_T3': (2, 'L1T3'),
    'Ox_Leaf1_T4': (3, 'L1T4'),
    'Ox_Leaf3_T2': (4, 'L3T2'),
    'Ox_Leaf3_T3': (5, 'L3T3'),
    'Ox_Leaf3_T4': (6, 'L3T4'),
    'Ox_Leaf5_T3': (7, 'L5T3'),
    'Ox_Leaf5_T4': (8, 'L5T4'),
    'Ox_Leaf7_T4': (9, 'L7T4'),
}

y_categorized = []  # numeric class label per sample, in column order
sample_list = []    # short display name per sample, in column order
for column_name in y:
    condition = column_name[:-2]
    try:
        class_id, short_name = condition_labels[condition]
    except KeyError:
        # An unknown column used to be skipped silently, leaving the label
        # list shorter than the sample table; fail loudly instead.
        raise ValueError(
            'Unrecognised sample column {!r}: no class label is defined for it.'.format(column_name)
        ) from None
    y_categorized.append(class_id)
    sample_list.append(short_name)

In [None]:
# Append the numeric class label as the last column of the sample table.
unprocessed_dataset['y'] = y_categorized
dataset = unprocessed_dataset

# Split into feature matrix (all columns but the last) and label vector (last column).
feature_frame, label_series = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X = feature_frame.values
y = label_series.values

# Helpers for labelling samples in plots: per-sample short names as an array
# and the sorted list of distinct short names.
sample_arr = np.array(sample_list)
set_sample = sorted(set(sample_list))

### Visualization of the data
For the visualization I have used PCA to reduce the dimension of the raw data to two dimensions.

In [None]:
# Project the features onto the first two principal components for plotting.
pca_visualization = PCA(n_components = 2)
X_transformed = pca_visualization.fit_transform(X)

# Scatter plot of the 2-D projection, one colour per class label.
plt.figure(figsize = (14,10))
pc1_values, pc2_values = X_transformed[:,0], X_transformed[:,1]
plt.scatter(pc1_values, pc2_values, c = y, s = 50, cmap = 'tab10', alpha = 0.7)
plt.title('Each color represent each sample set')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar()
plt.show()

### Optimal number of clusters
For this task, we know how many classes we have for our data. When class labels are not available, there are methods to select the optimal number of clusters. These methods are:
- Elbow Method
- Based on Dendrogram
- Silhouette Analysis

### Elbow Method
The Elbow method plots the within-cluster sum of squares (WCSS) against the number of clusters. The "elbow", where adding more clusters stops reducing the WCSS substantially, helps to find the appropriate number of clusters in a dataset.

In [None]:
# Elbow method: fit K-means for k = 1..10 and record the within-cluster sum of
# squares (the fitted model's inertia_) for each k.
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    km_elbow = KMeans(n_clusters = n_clusters, init = 'k-means++', max_iter = 300, n_init = 100)
    km_elbow.fit(X)
    wcss.append(km_elbow.inertia_)

plt.figure(figsize = (14,10))
plt.plot(cluster_counts, wcss)
# Highlight k = 4 (index 3 of wcss).
plt.plot([4], wcss[3], marker = 'o', markersize = 5, color = 'red', label = 'Maximum Variance')
plt.title('Elbow method')
plt.xlabel('No. of clusters')
plt.ylabel('WCSS')
# The marker's label was never displayed because no legend was drawn.
plt.legend()
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/Elbow_method_RawData.png', dpi = 200)
plt.show()

### Silhouette Analysis (S.A.)
Silhouette Analysis (S.A.) is a way to measure how close each point in a cluster is to the points in its neighboring clusters. It is a way to find the optimum value of k for k-means clustering. Silhouette values lie in the range [-1, 1]: a value of +1 is ideal and -1 is the least preferred. The higher the value, the better the cluster configuration.

In [None]:
# Silhouette analysis: average silhouette score of K-means for k = 2..10.
# avg_sil_score[j] holds the score for k = j + 2.
first_k = 2
avg_sil_score = []
for n_clusters in range(first_k, 11):
    kmeans_sil = KMeans(n_clusters = n_clusters, init = 'k-means++', max_iter = 300, n_init = 100)
    y_kmeans_sil = kmeans_sil.fit_predict(X)

    # Only the mean score is needed; the unused cluster centres and per-sample
    # silhouette values are no longer computed.
    score = silhouette_score(X, y_kmeans_sil)
    # The messages below used to end with an unbalanced ')'.
    print("For n_clusters = {}, silhouette score is {}".format(n_clusters, score))
    avg_sil_score.append(score)

# Repeat the two candidate cluster counts discussed in the text.
for n_clusters in (2, 4):
    print("For {}, silhouette score is {}".format('{} Clusters'.format(n_clusters), avg_sil_score[n_clusters - first_k]))

### Dendrogram (for Hierarchical clustering)
Hierarchical clustering keeps a record of how the clusters were merged, and that record is stored in a dendrogram.
The height of the line connecting two observations or two clusters is the Euclidean distance between them and also represents the computed dissimilarity between them.
We can set a dissimilarity (distance) threshold, which tells us that the samples within the resulting clusters are less dissimilar than this threshold. The optimal number of clusters is chosen by selecting the longest vertical line that does not intersect any extended horizontal line; a threshold drawn through that line cuts one vertical line per cluster.

In [None]:
# Build the Ward linkage of the samples, then draw it as a dendrogram whose
# leaves carry the short sample names.
plt.figure(figsize = (14,10))
ward_linkage = sch.linkage(X, method = 'ward')
dendrogram = sch.dendrogram(ward_linkage, labels = sample_list)
plt.xlabel('Samples')
plt.ylabel('Euclidean distances')
plt.title('Dendrogram')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/Dendogram_RawData.png', dpi = 200)
plt.show()

### Choosing clustering algorithm
We have used the unsupervised clustering approaches where the number of clusters can be predefined as we know the class label.
- KMeans clustering
- Agglomerative clustering
- Spectral Clustering

### Clustering with the optimal number of clusters(K) found from the above mentioned methods
If we don't have the label information, 2 or 4 clusters can be the optimal choice. But for this task we know the label information, so we will work with 10 clusters.

In [None]:
# K-means with 4 clusters, drawn on the 2-D projection stored in X_transformed.
km_opc = KMeans(n_clusters = 4, init = 'k-means++', max_iter = 300, n_init = 100)
y_km_opc = km_opc.fit_predict(X)

plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
# plt.get_cmap (as used by the other cells) replaces plt.cm.get_cmap, which
# was deprecated in Matplotlib 3.7 and removed in 3.9.
four_colour_cmap = plt.get_cmap('viridis', 4)
plt.scatter(X_transformed[:,0], X_transformed[:,1], c = y_km_opc, s = 200, cmap = four_colour_cmap, alpha = 0.7)
# Write each sample's short name next to its point.
for txt, (pc1, pc2) in zip(sample_list, X_transformed[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.colorbar(ticks = range(4), label = 'clusters')
plt.clim(0, 3)  # pin the colour range to the cluster ids 0..3
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/Four_Clusters_RawData.png', dpi = 200)
plt.show()

### KMeans
k-means algorithm searches for a pre-determined number of clusters within an unlabeled multidimensional dataset. It accomplishes this using a simple conception of what the optimal clustering looks like:
- The "cluster center" is the arithmetic mean of all the points belonging to the cluster.
- Each point is closer to its own cluster center than to other cluster centers.

These two assumptions are the basis of the k-means model. There are 6 steps:
1) Randomly select k cluster centers.
2) Calculate the distance between each data point and cluster centers.
3) Assign the data point to the cluster center whose distance from the cluster center is minimum of all the cluster centers.
4) Recalculate the new cluster center.
5) Recalculate the distance between each data point and new obtained cluster centers.
6) If no data point was reassigned then stop, otherwise repeat from step 3).

In [None]:
# Fit K-means with 10 clusters on the features and predict a cluster per sample.
Kmeans = KMeans(n_clusters = 10, init = 'k-means++', max_iter = 300, n_init = 100)
y_kmeans = Kmeans.fit_predict(X)

# Colour the 2-D projection (X_transformed) by assigned cluster and annotate
# each point with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
ten_colour_cmap = plt.get_cmap('tab10', 10)
plt.scatter(X_transformed[:,0], X_transformed[:,1], c = y_kmeans, s = 200, cmap = ten_colour_cmap, alpha = 0.8)
for txt, (pc1, pc2) in zip(sample_list, X_transformed[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.colorbar(ticks = range(10), label = 'clusters')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/Rawdata_Kmeans.png', dpi = 200)
plt.show()

### Clustering performance evaluation
Evaluating the performance of a clustering algorithm is not as trivial as counting the number of errors or computing the precision and recall of a supervised classification algorithm. In particular, an evaluation metric should not take the absolute values of the cluster labels into account. Instead, it should measure whether the clustering defines separations of the data similar to some ground-truth set of classes, or whether it satisfies the assumption that members belonging to the same class are more similar than members of different classes according to some similarity metric.

#### Evaluating the learned Kmeans model by comparing the clustered labels with our actual label y from the confusion matrix.

In [None]:
# Relabel every K-means cluster with the most frequent true class among its
# members, so the cluster ids become comparable with the true labels.
labels = np.zeros_like(y_kmeans)
for cluster_id in range(10):
    members = y_kmeans == cluster_id
    labels[members] = mode(y[members])[0]

# Accuracy of the relabelled clustering against the true labels.
acc_score = accuracy_score(y, labels)
print('The accuracy score for optimized K-means algorithm {}.'.format(acc_score))

# Confusion matrix heat map; it is transposed so that true labels run along
# the x axis and predicted labels along the y axis.
mat = confusion_matrix(y, labels)
sns.set(rc={'figure.figsize':(12,8)})
heatmap_options = dict(square = False, annot = True, fmt = 'd', cbar = False,
                       xticklabels = set_sample, yticklabels = set_sample)
sns.heatmap(mat.T, **heatmap_options)
plt.xlabel('True label')
plt.ylabel('Predicted label')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Kmeans/ConfusionMatrix_RawData.png', dpi = 200)

#### The confusion matrix does not necessarily give the right accuracy measure.
If you look at the L3_T2 samples, 2 samples were clustered in one cluster and 1 sample was clustered as a separate cluster. But in the confusion matrix, L3_T2 is shown as if all 3 samples were correctly clustered. In addition, 2 of the L7_T4 samples were clustered in the L3_T2 cluster, and L3_T3 and L5_T4 were also clustered in the same cluster.
So for an unsupervised method based on unlabeled data, the evaluation metrics do not always perform well. As our data has few samples (30), it is possible to evaluate the clustering performance manually by visualizing the clustering result in reduced dimension and labeling each sample with its assigned cluster.

In [None]:
from sklearn import metrics

# Label-based clustering metrics comparing the true labels y with the K-means
# assignment y_kmeans. All of them ignore how the cluster ids are numbered.

# Adjusted Rand index: similarity of the two assignments, normalized against
# chance. 1.0 is a perfect match; bad labelings score negative or close to 0.0.
adjusted_rand_score = metrics.adjusted_rand_score(y, y_kmeans)

# Mutual information measures the agreement of the two assignments. Two
# normalized variants are computed: Adjusted Mutual Information (AMI), which is
# normalized against chance, and Normalized Mutual Information (NMI).
adjusted_mutual_info = metrics.adjusted_mutual_info_score(y, y_kmeans)
normalized_mutual_info = metrics.normalized_mutual_info_score(y, y_kmeans)

# Homogeneity: each cluster contains only members of a single class.
# Completeness: all members of a given class are assigned to the same cluster.
# V-measure is returned together with them by the same call.
homogeneity_score, completeness_score, V_score = metrics.homogeneity_completeness_v_measure(y, y_kmeans)

print('Adjusted rand score {}.'.format(adjusted_rand_score))
print('Adjusted mutual info score {}.'.format(adjusted_mutual_info))
print('Normalized mutual info score {}.'.format(normalized_mutual_info))
print('AMI and NMI values close to zero indicate two label assignments that are largely independent, while values close to one indicate significant agreement. Further, an AMI of exactly 1 indicates that the two label assignments are equal ')
print('Homogeneity score {}.'.format(homogeneity_score))
print('Completeness info score {}.'.format(completeness_score))
print('V-measure score {}.'.format(V_score))



###  For smaller sample sizes or larger number of clusters it is safer to use an adjusted index such as the Adjusted Rand Index (ARI).
For the following tasks we will use Adjusted_rand_score, Homogeneity and completeness score as the performance evaluation.

### Hierarchical Clustering
The AgglomerativeClustering object performs a hierarchical clustering using a bottom up approach: each observation starts in its own cluster, and clusters are successively merged together. The linkage criteria determines the metric used for the merge strategy:
- Ward: minimizes the sum of squared differences within all clusters. It is a variance-minimizing approach and in this sense is similar to the k-means objective function but tackled with an agglomerative hierarchical approach.

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative clustering into 10 clusters.
# The distance argument is omitted on purpose: Euclidean is the default and
# the only distance ward linkage accepts, and the `affinity` keyword used
# before was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to
# `metric`), so passing it raises a TypeError on current versions.
hc = AgglomerativeClustering(n_clusters = 10, linkage = 'ward')
y_hc = hc.fit_predict(X)

# Colour the 2-D projection (X_transformed) by assigned cluster and annotate
# each point with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
plt.scatter(X_transformed[:,0], X_transformed[:,1], c = y_hc, s = 200, cmap = plt.get_cmap('tab10', 10), alpha = 0.8)
for txt, (pc1, pc2) in zip(sample_list, X_transformed[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.colorbar(ticks = range(10), label = 'clusters')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/Rawdata_Heiarchical_Clustering.png', dpi = 200)
plt.show()

### Spectral Clustering
It uses the graph of nearest neighbors to compute a higher-dimensional representation of the data, and then assigns labels using a k-means algorithm.
In practice Spectral Clustering is very useful when the structure of the individual clusters is highly non-convex or more generally when a measure of the center and spread of the cluster is not a suitable description of the complete cluster. 

In [None]:
# Spectral clustering into 10 clusters with an RBF affinity; labels are
# assigned with the 'discretize' strategy.
SCluster = SpectralClustering(n_clusters = 10, n_init = 300, affinity = 'rbf', assign_labels = 'discretize')
y_SC = SCluster.fit_predict(X)

# Colour the 2-D projection (X_transformed) by assigned cluster and annotate
# each point with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
ten_colour_cmap = plt.get_cmap('tab10', 10)
plt.scatter(X_transformed[:,0], X_transformed[:,1], c = y_SC, s = 200, cmap = ten_colour_cmap, alpha = 0.8)
for txt, (pc1, pc2) in zip(sample_list, X_transformed[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.colorbar(ticks = range(10), label = 'clusters')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/Rawdata_Spectral_Clustering.png', dpi = 200)
plt.show()

KMeans and Heirarchical Clustering produce the same result and both of the clustering approaches almost work in the same way. As for spectral clustering, the output is not desirable. As our task for unsupervised approach is to find if there is any pattern in the data, for the following task we will be using KMeans clustering algorithm.

### Applying KMeans in scaled features(Normalized)
The dataset contains features that vary highly in magnitude and range. Since most machine learning algorithms use the Euclidean distance between two data points in their computations, this is a problem: features with high magnitudes weigh much more in the distance calculations than features with low magnitudes.
To suppress this effect, we need to bring all features to the same level of magnitude. This can be achieved by scaling.
Standardization can be used for algorithms that assume zero-centered data, such as Principal Component Analysis (PCA).

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the features, then standardize those same features.
sc = StandardScaler()
sc.fit(X)
sc_X_train = sc.transform(X)

### Visualization of the normalized data

In [None]:
# PCA on the standardized features, keeping as many components as are needed
# to retain 99.99% of the variance.
pca = PCA(n_components = .9999)
X_trans = pca.fit_transform(sc_X_train)
num_components = pca.n_components_
print('{} components retain 99.99% variance of the data and shape {}.'.format(num_components, X_trans.shape))

# Scatter plot of the first two components, one colour per class label.
plt.figure(figsize = (14,10))
first_component, second_component = X_trans[:,0], X_trans[:,1]
plt.scatter(first_component, second_component, c = y, s = 50, cmap = 'tab10', alpha = 0.7)
plt.title('Normalized raw data Visualization')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar()
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/rawData_Normalized.png', dpi = 200)
plt.show()

### KMeans on normalized data

In [None]:
# K-means with 10 clusters on the standardized features.
km_nr = KMeans(n_clusters = 10, init = 'k-means++', max_iter = 300, n_init = 100)
y_km_nr = km_nr.fit_predict(sc_X_train)

# Colour the first two components of X_trans by assigned cluster and annotate
# each point with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
ten_colour_cmap = plt.get_cmap('tab10', 10)
plt.scatter(X_trans[:,0], X_trans[:,1], c = y_km_nr, s = 200, cmap = ten_colour_cmap, alpha = 0.8)
for txt, (pc1, pc2) in zip(sample_list, X_trans[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.colorbar(ticks = range(10), label = 'clusters')
plt.title('KMeans on the normalized data')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/Kmeans_normalized_Rawdata.png', dpi = 200)
plt.show()

### Evaluation of the Kmeans on normalized data
- Adjusted random score
- Homogeneity score
- Completeness score

In [None]:
# Compare the true labels with the K-means assignment on standardized data.
adjusted_rand_score = metrics.adjusted_rand_score(y, y_km_nr)
homogeneity_score, completeness_score, V_score = metrics.homogeneity_completeness_v_measure(y, y_km_nr)

# Report the scores (the V-measure is computed but not printed).
print('Adjusted rand score {}.'.format(adjusted_rand_score))
print('Homogeneity score {}.'.format(homogeneity_score))
print('Completeness info score {}.'.format(completeness_score))


As we can see, feature scaling (standardization) improves the accuracy of the KMeans clustering output. So for the following tasks feature normalization is preferred.

### Applying PCA to reduce the dimensionality
- Linear dimensionality reduction using PCA
- Non-linear dimensionality reduction using KernelPCA

### Applying PCA on RawData

In [None]:
# PCA on the unscaled features, keeping as many components as are needed to
# retain 99.99% of the variance.
pca_rawdata = PCA(n_components = .9999)
X_transform_rd = pca_rawdata.fit_transform(X)
num_components = pca_rawdata.n_components_
print('{} components retain 99.99% variance of the data and shape {}.'.format(num_components, X_transform_rd.shape))

# K-means with 10 clusters in the reduced space.
km_pca_rd = KMeans(n_clusters = 10, init = 'k-means++', max_iter = 300, n_init = 100)
y_pca_rd = km_pca_rd.fit_predict(X_transform_rd)

# Colour the first two components by assigned cluster and annotate each point
# with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
ten_colour_cmap = plt.get_cmap('tab10', 10)
plt.scatter(X_transform_rd[:,0], X_transform_rd[:,1], c = y_pca_rd, s = 200, cmap = ten_colour_cmap, alpha = 0.7)
for txt, (pc1, pc2) in zip(sample_list, X_transform_rd[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.title('KMeans with reduced dimension of raw data')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar()
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/rawData_PCA.png', dpi = 200)
plt.show()

Reducing the dimension of raw data using PCA does not change the output of the cluster and does not improve the performance. So linear PCA does not improve the accuracy.

### KernelPCA: Non-linear dimensionality reduction through the use of kernels
kernel trick, a method to project original data into higher dimension without sacrificing too much computational time. (Non-linear feature mapping). The basic idea to deal with linearly inseparable data is to project it onto a higher dimensional space where it becomes linearly separable.
The “classic” PCA approach is a linear projection technique that works well if the data is linearly separable. However, in the case of linearly inseparable data, a nonlinear technique is required if the task is to reduce the dimensionality of a dataset.

### KernelPCA on rawdata

In [None]:
# RBF KernelPCA on the unscaled features, keeping 29 components.
# Note: this overwrites X_transform_rd.
kpca_rd = KernelPCA(n_components = 29, kernel = 'rbf')
X_transform_rd = kpca_rd.fit_transform(X)

# K-means with 10 clusters in the KernelPCA space.
km_kpca_rd = KMeans(n_clusters = 10, init = 'k-means++', max_iter = 300, n_init = 100)
y_kpca_rd = km_kpca_rd.fit_predict(X_transform_rd)

# Colour the first two components by assigned cluster and annotate each point
# with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
ten_colour_cmap = plt.get_cmap('tab10', 10)
plt.scatter(X_transform_rd[:,0], X_transform_rd[:,1], c = y_kpca_rd, s = 200, cmap = ten_colour_cmap, alpha = 0.7)
for txt, (pc1, pc2) in zip(sample_list, X_transform_rd[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.title('KMeans with reduced dimension of raw data with KPCA')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar(label = 'Clusters')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/rawData_KPCA.png', dpi = 200)
plt.show()

We can come to the conclusion that either linear or kernel PCA do not perform well on the raw data.

### Applying linear PCA on normalized data using StandardScaler (0 mean, 1 std.)

In [None]:
# PCA on the standardized features, keeping as many components as are needed
# to retain 99.99% of the variance.
pca_nd = PCA(.9999)
X_transform_nd = pca_nd.fit_transform(sc_X_train)
num_components = pca_nd.n_components_
print('{} components retain 99.99% variance of the data and shape {}.'.format(num_components, X_transform_nd.shape))

# K-means with 10 clusters in the reduced space.
km_pca_nd = KMeans(n_clusters = 10, init = 'k-means++', max_iter = 300, n_init = 100)
y_pca_nd = km_pca_nd.fit_predict(X_transform_nd)

# Colour the first two components by assigned cluster and annotate each point
# with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
plt.scatter(X_transform_nd[:,0], X_transform_nd[:,1], c = y_pca_nd, s = 200, cmap = plt.get_cmap('tab10', 10), alpha = 0.7)
for txt, (pc1, pc2) in zip(sample_list, X_transform_nd[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar()
# The title used to say "raw data" (copied from the raw-data cell) although
# this figure is built from the standardized features.
plt.title('KMeans with reduced dimension of normalized data')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/normalizedData_PCA.png', dpi = 200)
plt.show()

### Applying non-linear KernelPCA on normalized data using StandardScaler (0 mean, 1 std.)

In [None]:
# RBF KernelPCA on the standardized features, keeping 5 components.
# Note: this overwrites X_transform_nd.
kpca_nd = KernelPCA(n_components = 5, kernel = 'rbf')
X_transform_nd = kpca_nd.fit_transform(sc_X_train)

# K-means with 10 clusters in the KernelPCA space.
km_kpca_nd = KMeans(n_clusters = 10, init = 'k-means++', max_iter = 300, n_init = 100)
y_kpca_nd = km_kpca_nd.fit_predict(X_transform_nd)

# Colour the first two components by assigned cluster and annotate each point
# with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
plt.scatter(X_transform_nd[:,0], X_transform_nd[:,1], c = y_kpca_nd, s = 200, cmap = plt.get_cmap('tab10', 10), alpha = 0.7)
for txt, (pc1, pc2) in zip(sample_list, X_transform_nd[:, :2]):
    ax.annotate(txt, (pc1, pc2))
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar(label = 'Clusters')
# The title used to say "raw data" (copied from the raw-data cell) although
# this figure is built from the standardized features.
plt.title('KMeans with reduced dimension of normalized data with KPCA')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/normalisedData_KPCA.png', dpi = 200)
plt.show()

In [None]:
# RBF KernelPCA on the standardized features without limiting n_components.
# Note: this overwrites X_transformed.
kpca = KernelPCA(kernel = 'rbf')
X_transformed = kpca.fit_transform(sc_X_train)

# Share of the total variance carried by each component, and its running sum.
explained_variance = X_transformed.var(axis = 0)
explained_variance_ratio = explained_variance / explained_variance.sum()
pca_components_var = explained_variance_ratio.cumsum()

# Count the leading components whose cumulative share stays within 99.99%.
num_components = 0
for cumulative_share in pca_components_var:
    if not cumulative_share <= 0.9999:
        break
    num_components += 1
print('By using KernelPCA only 40% variance of the data kept(5 KernelPCA components) that gives better KMeans accuracy. ')

Only 5 KernelPCA components were kept from 29 components. 5 components hold 40% variance of the data, to cluster the samples of each class in their own cluster. When using all the components to keep 99.99% variance, KMeans cannot cluster the samples accurately.

In [None]:
# Compare the true labels with the K-means assignment in the KernelPCA space.
adjusted_rand_score = metrics.adjusted_rand_score(y, y_kpca_nd)
homogeneity_score, completeness_score, V_score = metrics.homogeneity_completeness_v_measure(y, y_kpca_nd)

# Report the scores (the V-measure is computed but not printed).
print('Adjusted rand score for KMeans using KernelPCA {}.'.format(adjusted_rand_score))
print('Homogeneity score for KMeans using KernelPCA {}.'.format(homogeneity_score))
print('Completeness info score for KMeans using KernelPCA {}.'.format(completeness_score))

### Findings
Applying KernelPCA improves the accuracy of the KMeans clustering method. From this finding we can say:
- The data is linearly inseparable
- Feature Scaling has profound effect on the output and accuracy
- By reducing the dimension by non-linear PCA gives better accuracy
- Only 41% variance of the data gives KMeans better clustering accuracy

### Validating the result by using Supervised dimensionality reduction technique - LDA


The general LDA approach is very similar to a Principal Component Analysis, but in addition to finding the component axes that maximize the variance of our data (PCA), we are additionally interested in the axes that maximize the separation between multiple classes (LDA).
The goal of an LDA is to project a feature space (a dataset n-dimensional samples) onto a smaller subspace k (where k≤n−1) while maintaining the class-discriminatory information. 

### LDA on normalized data to reduce the dimensionality and applying KMeans

In [None]:
# Imported here so the cell runs on its own: LDA was otherwise only imported
# by a cell further down, giving a NameError when run top to bottom.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Supervised projection of the standardized features onto 8 LDA components.
lda = LDA(n_components = 8)
X_transform_lda = lda.fit_transform(sc_X_train, y)

# K-means with 10 clusters in the LDA space.
# Before, K-means was fitted on X_transformed (a projection left over from
# another cell) while the plot showed the LDA coordinates, so the colours did
# not correspond to a clustering of the plotted data. The stray
# `lda.fit(X_transformed, y)` re-fit on that unrelated data was removed too.
Km_lda = KMeans(n_clusters = 10, init = 'k-means++', max_iter = 300, n_init = 100)
y_lda = Km_lda.fit_predict(X_transform_lda)

# Colour the first two LDA components by assigned cluster and annotate each
# point with its short sample name.
plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
plt.scatter(X_transform_lda[:,0], X_transform_lda[:,1], c = y_lda, s = 200, cmap = plt.get_cmap('tab10', 10), alpha = 0.7)
for txt, (lda_c1, lda_c2) in zip(sample_list, X_transform_lda[:, :2]):
    ax.annotate(txt, (lda_c1, lda_c2))
plt.xlabel('LDA C1')
plt.ylabel('LDA C2')
plt.colorbar()
plt.title('KMeans with reduced dimension of normalized data by LDA')
plt.savefig('C:/Users/Tamal/Documents/Thesis Files/Images/Report/normalizedData_LDA.png', dpi = 200)
plt.show()

### As we can see, even taking the class or label information, linear separation of the data does not help to improve accuracy

### LDA as a supervised learning (as Classifier)
This was done just to see how the LDA works as a classifier and showing the output with different n_components.

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# For 1..9 LDA components: project the standardized features, re-fit the same
# LDA object on that projection, predict the projected samples, and record the
# accuracy against the true labels.
# Note: this overwrites X_transformed on every iteration.
acc1 = []
for n_lda_components in range(1, 10):
    lda = LDA(n_components = n_lda_components)
    X_transformed = lda.fit_transform(sc_X_train, y)
    y_pred = lda.fit(X_transformed, y).predict(X_transformed)
    component_accuracy = accuracy_score(y, y_pred)
    acc1.append(component_accuracy)
    print('Accuracy score {} for {} LDA components.'.format(component_accuracy, n_lda_components))

### Making a pipeline for predicting new samples with: StandardScaler + KernelPCA + KMeans
The following script is for test sample

In [None]:
def predict_new_label(new_sample_list):
    """Cluster the training data and assign new sample(s) to those clusters.

    A fresh StandardScaler -> KernelPCA (5 RBF components) -> KMeans
    (10 clusters) pipeline is fitted on the module-level feature matrix ``X``
    every time the function is called.

    Parameters
    ----------
    new_sample_list
        Passed unchanged to ``pipeline.predict``; presumably array-like with
        the same feature columns as ``X`` (TODO confirm).

    Returns
    -------
    tuple
        ``(y_train, y_new_sample)``: the cluster ids of the rows of ``X`` and
        the cluster ids predicted for ``new_sample_list``.
    """
    steps = [
        ('Scalar', StandardScaler()),
        ('KernelPCA', KernelPCA(n_components = 5, kernel = 'rbf')),
        ('Kmeans', KMeans(n_clusters = 10, n_init = 100, max_iter = 300, init = 'k-means++')),
    ]
    pipeline = Pipeline(steps)
    y_train = pipeline.fit_predict(X)
    return y_train, pipeline.predict(new_sample_list)

In [None]:
# Names used to annotate the new sample(s) in the plot, one per row of
# test_sample_.
test_sample_label = ['test_Sample_1', 'test_Sample_2']

# Put the new sample or list of samples here (one row per sample, with the
# same feature columns as X). The bare `test_sample_ = ` placeholder was a
# syntax error; None keeps the cell parseable and gives a clear message until
# real data is supplied.
test_sample_ = None
if test_sample_ is None:
    raise ValueError('Assign the new sample(s) to test_sample_ before running this cell.')
test_sample_ = np.atleast_2d(test_sample_)  # accept a single sample given as a 1-D sequence

# Fit the scaler and KernelPCA on the training data.
sc_ = StandardScaler()
sc_X = sc_.fit_transform(X)
kpca_ = KernelPCA(n_components = 5, kernel = 'rbf')
X_transform_ = kpca_.fit_transform(sc_X)
# New samples must go through the SAME fitted scaler before KernelPCA; the
# scaling step was previously skipped.
test_sample_trans_ = kpca_.transform(sc_.transform(test_sample_))

km_kpca_ = KMeans(n_clusters = 10, init = 'k-means++', max_iter = 300, n_init = 100)
y_train_ = km_kpca_.fit_predict(X_transform_)
# Predict in the KernelPCA space K-means was fitted in (previously the
# untransformed samples were passed).
y_test_ = km_kpca_.predict(test_sample_trans_)

plt.rc('font', size = 10)
fig, ax = plt.subplots(figsize = (14,10))
cluster_cmap = plt.get_cmap('tab10', 10)
# vmin/vmax pin both scatters to the same cluster-id -> colour mapping.
plt.scatter(X_transform_[:,0], X_transform_[:,1], c = y_train_, s = 200, cmap = cluster_cmap, vmin = 0, vmax = 9, alpha = 0.7)
# Plot the new samples at their own coordinates (previously the training
# coordinates were re-plotted with the test colours) and mark them with stars.
plt.scatter(test_sample_trans_[:,0], test_sample_trans_[:,1], c = y_test_, s = 200, cmap = cluster_cmap, vmin = 0, vmax = 9, marker = '*', edgecolors = 'black')

for txt, (c1, c2) in zip(sample_list, X_transform_[:, :2]):
    ax.annotate(txt, (c1, c2))

for txt, (c1, c2) in zip(test_sample_label, test_sample_trans_[:, :2]):
    ax.annotate(txt, (c1, c2))

# The axes show KernelPCA components, not LDA components.
plt.xlabel('KPCA C1')
plt.ylabel('KPCA C2')
plt.colorbar(label = 'clusters')
plt.title('KMeans with reduced dimension of normalized data by KPCA')
plt.show()