# In [2]:
import numpy as np #numpy and pandas: These libraries are essential for handling and processing data. 
import pandas as pd #numpy is used for numerical operations, and pandas is used for handling data in a tabular format.
from sklearn.model_selection import train_test_split #train_test_split: To divide our dataset into a training set (used to train the model) and a test set (used to evaluate the model).
from sklearn.svm import SVC #SVC and classification_report: To classify the data and evaluate the classification performance.
from sklearn.metrics import classification_report, accuracy_score


# Load the dataset.
# Thyloid.csv holds the gene expression measurements together with the stage
# information of each patient; it is read into a pandas DataFrame and its
# (rows, columns) shape is printed as a quick sanity check.
data = pd.read_csv('Thyloid.csv')
print(data.shape)

# Hold out 20% of the rows for testing so the model can be evaluated on data
# it has not seen; random_state pins the shuffle so the split is reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# The last column carries the labels (stage information); keep it apart from
# the feature columns for both subsets.
train_label, test_label = (subset.iloc[:, -1] for subset in (train_data, test_data))

# Everything except the last column is a feature (gene expression).
train_X, test_X = (subset.iloc[:, :-1] for subset in (train_data, test_data))

# The kernel routines below index rows of plain numpy arrays, so convert the
# feature frames once here.
train, test = train_X.to_numpy(), test_X.to_numpy()


#Kernel functions: These functions transform the data into a higher-dimensional space to make it easier to find patterns. 
#Different kernels are used based on the data characteristics:

#RBF Kernel: Useful for non-linear relationships.
# Computes the RBF kernel between two data points x and y.
def rbf_kernel(x, y, gamma=1.0):
    """Return the Radial Basis Function (RBF) kernel value for two points.

    Computes ``exp(-gamma * ||x - y||^2)``, where ``||.||`` is the norm
    returned by ``np.linalg.norm``.

    Parameters
    ----------
    x, y : array-like supporting subtraction
        The two data points to compare.
    gamma : float, default 1.0
        Width parameter; larger values make the similarity decay faster.

    Returns
    -------
    float
        The kernel value; exactly 1.0 when ``x`` and ``y`` coincide.
    """
    distance = np.linalg.norm(x - y)
    return np.exp(-gamma * distance ** 2)

#Polynomial Kernel: Captures polynomial relationships between data points.
#Computes the Polynomial kernel between two data points x and y.
def poly_kernel(x, y, degree=3):
    """Return the homogeneous polynomial kernel value ``(x . y) ** degree``.

    Parameters
    ----------
    x, y : array-like
        The two data points; combined with ``np.dot``.
    degree : int or float, default 3
        Exponent applied to the dot product.

    Returns
    -------
    The dot product of ``x`` and ``y`` raised to ``degree``.
    """
    inner_product = np.dot(x, y)
    return inner_product ** degree

#Linear Kernel: Useful for linear relationships.
# Computes the Linear kernel between two data points x and y.
def linear_kernel(x, y):
    """Return the linear kernel value, i.e. the plain dot product of x and y.

    Parameters
    ----------
    x, y : array-like
        The two data points; combined with ``np.dot``.

    Returns
    -------
    The dot product ``x . y``.
    """
    similarity = np.dot(x, y)
    return similarity


#Kernel matrix: This matrix contains the similarity measures between all pairs of data points in the dataset using the chosen kernel. 
#It’s the first step in Kernel PCA, which transforms the data into a space where it’s easier to find patterns.
#Computes the kernel matrix for the given data using the specified kernel type (rbf, poly, or linear) and kernel parameter (e.g., gamma for RBF, degree for Polynomial).
def compute_kernel_matrix(data, kernel_type, kernel_param):
    """Build the square kernel (Gram) matrix of ``data`` against itself.

    Entry ``[i, j]`` is the kernel value between rows ``i`` and ``j`` and
    follows the same formulas as ``rbf_kernel``, ``poly_kernel`` and
    ``linear_kernel``, evaluated for all pairs at once with numpy instead of
    a Python double loop.

    Parameters
    ----------
    data : array-like
        Rows are samples. A 1-D input is treated as samples with a single
        feature each.
    kernel_type : str
        ``'rbf'`` (``'gaussian'`` is accepted as an alias), ``'poly'`` or
        ``'linear'``.
    kernel_param : float or int
        ``gamma`` for the RBF kernel, ``degree`` for the polynomial kernel;
        ignored for the linear kernel.

    Returns
    -------
    kernel_matrix : numpy.ndarray
        Array of shape ``(n_samples, n_samples)``.
    n_samples : int
        Number of rows in ``data``.

    Raises
    ------
    ValueError
        If ``kernel_type`` is not one of the supported names. An unknown
        name used to yield an all-zero matrix silently.
    """
    supported = ('rbf', 'gaussian', 'poly', 'linear')
    if kernel_type not in supported:
        raise ValueError(
            f"Unknown kernel_type {kernel_type!r}; expected one of {supported}"
        )

    data = np.asarray(data, dtype=float)
    if data.ndim == 1:
        data = data[:, np.newaxis]
    n_samples = data.shape[0]

    # All pairwise dot products; every supported kernel is derived from them.
    gram = data @ data.T

    if kernel_type == 'linear':
        kernel_matrix = gram
    elif kernel_type == 'poly':
        kernel_matrix = gram ** kernel_param
    else:
        # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y
        squared_norms = np.sum(data * data, axis=1)
        squared_dists = squared_norms[:, np.newaxis] + squared_norms[np.newaxis, :] - 2.0 * gram
        # Round-off can push tiny distances below zero and leave the diagonal
        # slightly off zero; repair both so that K[i, i] is exactly 1.
        np.maximum(squared_dists, 0.0, out=squared_dists)
        np.fill_diagonal(squared_dists, 0.0)
        kernel_matrix = np.exp(-kernel_param * squared_dists)

    return kernel_matrix, n_samples


#Implements Kernel PCA from scratch.
#Kernel PCA: This is an advanced form of PCA that uses kernel functions to handle non-linear data.
def my_kpca(data, alpha, kernel_type='rbf', kernel_param=1.0):
    """Kernel PCA from scratch: fit on ``data`` and project it.

    Parameters
    ----------
    data : numpy.ndarray
        Training samples, one per row; passed to ``compute_kernel_matrix``.
    alpha : float
        Fraction of the total variance (sum of eigenvalues) the retained
        components must explain, e.g. ``0.99``.
    kernel_type : str, default 'rbf'
        Kernel name forwarded to ``compute_kernel_matrix``.
    kernel_param : float, default 1.0
        Kernel parameter forwarded to ``compute_kernel_matrix``.

    Returns
    -------
    projected_data : numpy.ndarray, shape (n_samples, k)
        The centered kernel matrix multiplied by the retained eigenvectors.
    reduced_eigen_vectors : numpy.ndarray, shape (n_samples, k)
        Eigenvectors of the centered kernel matrix for the ``k`` largest
        eigenvalues, as columns.
    mean2 : numpy.ndarray, shape (n_samples, n_samples)
        The term ``1_n K 1_n`` used for centering; every entry equals the
        grand mean of the kernel matrix.
    k : int
        Number of components retained.
    """
    kernel_matrix, n_samples = compute_kernel_matrix(data, kernel_type, kernel_param)

    # Symmetrize: removes any round-off asymmetry before the symmetric
    # eigen-solver (eigh) is applied.
    kernel_matrix = (kernel_matrix + kernel_matrix.T) / 2

    # Double-center the kernel matrix so the data has zero mean in the
    # implicit feature space: K_c = K - 1_n K - K 1_n + 1_n K 1_n.
    one_n = np.ones((n_samples, n_samples)) / n_samples
    mean2 = one_n.dot(kernel_matrix).dot(one_n)
    centered_kernel_matrix = (
        kernel_matrix
        - one_n.dot(kernel_matrix)
        - kernel_matrix.dot(one_n)
        + mean2
    )

    # eigh already returns real eigenvalues/eigenvectors for a real symmetric
    # matrix, so no np.real() conversion is needed.
    eigen_values, eigen_vectors = np.linalg.eigh(centered_kernel_matrix)

    # eigh returns ascending eigenvalues; reorder to descending so the
    # leading columns are the principal components.
    order = np.argsort(eigen_values)[::-1]
    eigen_values = eigen_values[order]
    eigen_vectors = eigen_vectors[:, order]

    # Explained-variance ratio per component and its running total.
    var_ratio = eigen_values / np.sum(eigen_values)
    cumulative_var_ratio = np.cumsum(var_ratio)

    # Smallest k whose cumulative ratio reaches alpha. np.argmax returns 0
    # when no entry is True (e.g. alpha=1.0 with the running sum topping out
    # just below 1 due to round-off), which used to yield k=1 silently; keep
    # every component in that case instead.
    reached = cumulative_var_ratio >= alpha
    if reached.any():
        k = int(np.argmax(reached)) + 1
    else:
        k = n_samples
    reduced_eigen_vectors = eigen_vectors[:, :k]

    # NOTE: the eigenvectors are used as returned by eigh (unit length, not
    # rescaled by their eigenvalues).
    projected_data = np.dot(centered_kernel_matrix, reduced_eigen_vectors)

    return projected_data, reduced_eigen_vectors, mean2, k

#Apply KPCA to new data: 
#After training the KPCA model, we need to project new data (test data) into the same transformed space. 
#This ensures consistency in how both training and test data are transformed.

def kPCA_NewData(Y, X, eigVector, type='gaussian', para=1):
    """Project new samples into the space of an already-fitted Kernel PCA.

    Parameters
    ----------
    Y : numpy.ndarray
        New (e.g. test) samples, one per row.
    X : numpy.ndarray
        The training samples the eigenvectors were computed from.
    eigVector : numpy.ndarray, shape (n_train, k)
        Retained eigenvectors, as returned by ``my_kpca``.
    type : str, default 'gaussian'
        Kernel name. ``'gaussian'`` means the RBF kernel and is translated to
        ``'rbf'``; any other name is forwarded unchanged. (The parameter name
        shadows the builtin ``type``; it is kept for caller compatibility.)
    para : float, default 1
        Kernel parameter (gamma for RBF, degree for polynomial).

    Returns
    -------
    numpy.ndarray, shape (n_new, k)
        The centered cross-kernel between ``Y`` and ``X`` multiplied by
        ``eigVector``.
    """
    # Normalise the alias so the kernel is the same one named 'rbf' at fit
    # time; an unrecognised name would otherwise not select the RBF kernel.
    kernel_name = 'rbf' if type == 'gaussian' else type

    n_new = Y.shape[0]
    K_combined, _ = compute_kernel_matrix(np.vstack([Y, X]), kernel_name, para)

    # Rows = new samples, columns = training samples.
    K_new = K_combined[:n_new, n_new:]
    # Training-vs-training block, needed for the centering statistics.
    K_train = K_combined[n_new:, n_new:]

    # Center the cross-kernel with the *training* statistics, mirroring the
    # double-centering my_kpca applies before projecting the training data:
    #   K_new_c = K_new - 1'_n K_train - K_new 1_n + 1'_n K_train 1_n
    train_column_means = K_train.mean(axis=0, keepdims=True)
    new_row_means = K_new.mean(axis=1, keepdims=True)
    K_new_centered = K_new - train_column_means - new_row_means + K_train.mean()

    return np.dot(K_new_centered, eigVector)

# Apply KPCA to the training data.
# Reduces the training data to the principal components that together explain
# 99% of the variance, using an RBF kernel with gamma=0.5.
# NOTE: the third value returned by my_kpca is its `mean2` centering term, not
# the centered kernel matrix, despite the name it is bound to here.
projected_train_kpca, reduced_eigen_vectors, K_centered_train, k = my_kpca(
    train, alpha=0.99, kernel_type='rbf', kernel_param=0.5
)

# Apply KPCA to the test data.
# Projects the test data into the same reduced space as the training data.
# The kernel name and parameter must match the training call exactly so both
# sets are compared with the identical kernel.
test_reduced = kPCA_NewData(test, train, reduced_eigen_vectors, type='rbf', para=0.5)

# Sanity check: expect (number of test rows, k).
print(test_reduced.shape)

# Output of the cell above:
# (574, 1882)
# (115, 454)
