In [57]:
# import pandas as pd

# Load the dataset to inspect its columns
# file_path = 'Thyloid.csv'
# data = pd.read_csv(file_path)
# print(data.head())
# print(data.columns)

In [71]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

In [76]:
def load_and_impute_data(file_path):
    # Load the dataset
    data = pd.read_csv(file_path)
    
    # Handle missing values by imputing with the mean
    imputer = SimpleImputer(strategy='mean')
    data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
    
    return data_imputed


In [29]:
def normalize_data(data):
    #Normalize the data by subtracting the mean and
    # dividing by the standard deviation
    return (data - data.mean()) / data.std()

In [30]:
def compute_covariance_matrix(data):
    # Compute the covariance matrix
    covariance_matrix = np.cov(data.T)
    return covariance_matrix

In [31]:
def compute_eigenvalues_eigenvectors(covariance_matrix):
    #Compute eigenvalues and Eigenvectors
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    return eigenvalues, eigenvectors

In [32]:
def sort_eigenvalues_eigenvectors(eigenvalues, eigenvectors):
    #Sort the eigenvalues and eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    return eigenvalues, eigenvectors

In [48]:
def project_data (data, eigenvectors, num_components):
    #Project the data onto the selected eigenvectors
    return np.dot(data, eigenvectors[:, :num_components])

In [77]:
def pca_from_scratch(file_path, variance_threshold=0.95):
    # Load and impute the data
    data = load_and_impute_data(file_path)
    labels = data['Label']  
    features = data.drop(columns=['Label'])
    normalized_data = normalize_data(features)

    # Compute the covariance matrix
    covariance_matrix = compute_covariance_matrix(normalized_data)

    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = compute_eigenvalues_eigenvectors(covariance_matrix)

    # Sort the eigenvalues and eigenvectors
    eigenvalues, eigenvectors = sort_eigenvalues_eigenvectors(eigenvalues, eigenvectors)

    # Determine the number of principal components to retain the desired variance
    total_variance = sum(eigenvalues)
    variance_explained = 0
    num_components = 0
    for eigenvalue in eigenvalues:  # Correct the typo here
        variance_explained += eigenvalue
        num_components += 1
        if variance_explained / total_variance >= variance_threshold:
            break

    # Project the data onto the principal components
    reduced_data = project_data(normalized_data, eigenvectors, num_components)

    return reduced_data, num_components

# Usage
file_path = 'Thyloid.csv'
reduced_data, num_components = pca_from_scratch(file_path)
print(f'Reduced data shape: {reduced_data.shape}')
print(f'Number of components retained: {num_components}')

Reduced data shape: (574, 1881)
Number of components retained: 1881


In [78]:
# PCA using scikit-learn
data = load_and_impute_data(file_path)
labels = data['Label']
features = data.drop(columns=['Label'])

# Normalize the data
normalized_data = normalize_data(features)

# Apply PCA using scikit-learn
pca = PCA(n_components=0.95)  # Retain 95% of variance
reduced_data_sklearn = pca.fit_transform(normalized_data)

print(f'Reduced data shape (sklearn): {reduced_data_sklearn.shape}')
print(f'Explained variance ratio (sklearn): {pca.explained_variance_ratio_}')
print(f'Number of components retained (sklearn): {pca.n_components_}')

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [18]:
from scipy.spatial.distance import pdist, squareform
from scipy.linalg import eigh

def rbf_kernel(X, gamma):
    # Compute the RBF (Gaussian) kernel
    sq_dists = squareform(pdist(X, 'sqeuclidean'))
    K = np.exp(-gamma * sq_dists)
    return K

def kpca(X, kernel, n_components):
    # Center the kernel matrix
    N = X.shape[0]
    K = kernel(X)
    one_n = np.ones((N, N)) / N
    K_centered = K - one_n @ K - K @ one_n + one_n @ K @ one_n

    # Compute the eigenvalues and eigenvectors
    eigenvalues, eigenvectors = eigh(K_centered)
    eigenvalues, eigenvectors = eigenvalues[::-1], eigenvectors[:, ::-1]
    eigenvectors = eigenvectors / np.sqrt(eigenvalues[:n_components])

    # Select the top eigenvectors
    alphas = eigenvectors[:, :n_components]
    lambdas = eigenvalues[:n_components]

    return alphas, lambdas

# Usage for RBF Kernel
gamma = 15
alphas, lambdas = kpca(normalized_data, lambda X: rbf_kernel(X, gamma), num_components)

print(f'KPCA (RBF Kernel) - reduced data shape: {alphas.shape}')

NameError: name 'normalized_data' is not defined

In [19]:
def polynomial_kernel(X, degree, coef0):
    return (X @ X.T + coef0) ** degree

# Usage for Polynomial Kernel
degree = 3
coef0 = 1
alphas_poly, lambdas_poly = kpca(normalized_data, lambda X: polynomial_kernel(X, degree, coef0), num_components)

print(f'KPCA (Polynomial Kernel) - reduced data shape: {alphas_poly.shape}')

NameError: name 'normalized_data' is not defined

In [20]:
def linear_kernel(X):
    return X @ X.T

# Usage for Linear Kernel
alphas_linear, lambdas_linear = kpca(normalized_data, linear_kernel, num_components)

print(f'KPCA (Linear Kernel) - reduced data shape: {alphas_linear.shape}')

NameError: name 'normalized_data' is not defined

In [21]:
def combined_kernel(X, kernel1, kernel2, alpha=0.5):
    return alpha * kernel1(X) + (1 - alpha) * kernel2(X)

# Usage for Combined Kernels
alpha = 0.5
combined_k = combined_kernel(normalized_data, lambda X: rbf_kernel(X, gamma), lambda X: polynomial_kernel(X, degree, coef0), alpha)
alphas_combined, lambdas_combined = kpca(normalized_data, lambda X: combined_k, num_components)

print(f'KPCA (Combined Kernels) - reduced data shape: {alphas_combined.shape}')

NameError: name 'normalized_data' is not defined

In [22]:
def top_features_by_covariance(cov_matrix, top_n=10):
    covariances = np.diag(cov_matrix)
    top_indices = np.argsort(covariances)[-top_n:]
    return top_indices

# Calculate covariance matrix
cov_matrix = compute_covariance_matrix(normalized_data)

# Identify top 10 features with highest covariance
top_features_indices = top_features_by_covariance(cov_matrix)
top_features = features.columns[top_features_indices]

print(f'Top 10 features with highest covariance: {top_features}')

NameError: name 'normalized_data' is not defined

In [23]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(normalized_data, labels, test_size=0.2, random_state=42)

# Transform the test data using PCA and KPCA models
test_data_pca = project_data(test_data, eigenvectors, num_components)
test_data_kpca_rbf = kpca(test_data, lambda X: rbf_kernel(X, gamma), num_components)[0]

# Perform classification using the minimum distance classifier
predictions_pca = myclassifier(train_data, train_labels, test_data_pca)
predictions_kpca_rbf = myclassifier(train_data, train_labels, test_data_kpca_rbf)

# Calculate accuracy
accuracy_pca = calculate_accuracy(test_labels, predictions_pca)
accuracy_kpca_rbf = calculate_accuracy(test_labels,

SyntaxError: incomplete input (1667927020.py, line 16)