In [40]:
import pandas as pd

# Load the dataset
file_path = 'Thyloid.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Check for missing values
print(data.isnull().sum())

   hsa-let-7a-1  hsa-let-7a-2  hsa-let-7a-3    hsa-let-7b   hsa-let-7c  \
0  14316.183948  14351.995128  14412.983601  12203.480690  6594.417741   
1  18042.893723  17822.176422  18024.997726   8962.166370  3304.048522   
2  23542.996389  23403.850012  23393.390262  10325.675423  6358.260398   
3  35319.514700  35344.502424  35235.536971   7294.517488  5175.621787   
4  10786.292006  10704.600259  10960.408721   4716.654860  3542.559611   

    hsa-let-7d   hsa-let-7e  hsa-let-7f-1  hsa-let-7f-2   hsa-let-7g  ...  \
0   798.167093  2007.302666   7279.990730   7366.938399   861.970419  ...   
1  3772.327121  4741.693647   5520.915201   5780.407164   472.006932  ...   
2  1129.019119  4283.426272  16578.070468  16638.927197  1227.594343  ...   
3   950.956848  6652.902332  35001.632900  35642.299155  2407.045273  ...   
4  3325.510007  2781.097126   8194.218848   8312.284292  2685.094417  ...   

   hsa-mir-942  hsa-mir-943  hsa-mir-944  hsa-mir-95  hsa-mir-9500  \
0     2.032949     0.0

In [41]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

def load_and_impute_data(file_path):
    # Load the dataset
    data = pd.read_csv(file_path)
    
    # Separate labels from features
    labels = data['Label']
    features = data.drop(columns=['Label'])
    
    # Handle missing values by imputing with the mean
    imputer = SimpleImputer(strategy='mean')
    features_imputed = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)
    
    # Debug: Check for NaNs after imputation
    if features_imputed.isnull().sum().sum() > 0:
        raise ValueError("NaN values found in imputed data")
    
    # Combine imputed features with labels
    data_imputed = pd.concat([features_imputed, labels], axis=1)
    
    return data_imputed

def normalize_data(features):
    # Remove columns with zero variance
    non_zero_variance_features = features.loc[:, features.var() != 0]
    
    # Normalize the data by subtracting the mean and dividing by the standard deviation
    features_normalized = (non_zero_variance_features - non_zero_variance_features.mean()) / non_zero_variance_features.std()
    
    # Debug: Check for NaNs after normalization and identify columns with NaNs
    nan_columns = features_normalized.columns[features_normalized.isnull().any()].tolist()
    if nan_columns:
        raise ValueError(f"NaN values found in normalized data for columns: {nan_columns}")
    
    return features_normalized

def compute_covariance_matrix(data):
    # Compute the covariance matrix
    covariance_matrix = np.cov(data.T)
    return covariance_matrix

def compute_eigenvalues_eigenvectors(covariance_matrix):
    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    return eigenvalues, eigenvectors

def sort_eigenvalues_eigenvectors(eigenvalues, eigenvectors):
    # Sort the eigenvalues and eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    return eigenvalues, eigenvectors

def project_data(data, eigenvectors, num_components):
    # Project the data onto the selected eigenvectors
    return np.dot(data, eigenvectors[:, :num_components])

def pca_from_scratch(file_path, variance_threshold=0.95):
    # Load and impute the data
    data = load_and_impute_data(file_path)
    labels = data['Label']  # Correctly using 'Label' as the column name for the labels
    features = data.drop(columns=['Label'])
    normalized_data = normalize_data(features)

    # Compute the covariance matrix
    covariance_matrix = compute_covariance_matrix(normalized_data)

    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = compute_eigenvalues_eigenvectors(covariance_matrix)

    # Sort the eigenvalues and eigenvectors
    eigenvalues, eigenvectors = sort_eigenvalues_eigenvectors(eigenvalues, eigenvectors)

    # Determine the number of principal components to retain the desired variance
    total_variance = sum(eigenvalues)
    variance_explained = 0
    num_components = 0
    for eigenvalue in eigenvalues:
        variance_explained += eigenvalue
        num_components += 1
        if variance_explained / total_variance >= variance_threshold:
            break

    # Project the data onto the principal components
    reduced_data = project_data(normalized_data, eigenvectors, num_components)

    return reduced_data, num_components

# Usage
file_path = 'Thyloid.csv'
reduced_data, num_components = pca_from_scratch(file_path)
print(f'Reduced data shape: {reduced_data.shape}')
print(f'Number of components retained: {num_components}')

# PCA using scikit-learn
data = load_and_impute_data(file_path)
labels = data['Label']
features = data.drop(columns=['Label'])

# Normalize the data
normalized_data = normalize_data(features)

# Verify no NaNs in the normalized data (Already handled inside normalize_data function)

# Apply PCA using scikit-learn
pca = PCA(n_components=0.95)  # Retain 95% of variance
reduced_data_sklearn = pca.fit_transform(normalized_data)

print(f'Reduced data shape (sklearn): {reduced_data_sklearn.shape}')
print(f'Explained variance ratio (sklearn): {pca.explained_variance_ratio_}')
print(f'Number of components retained (sklearn): {pca.n_components_}')

Reduced data shape: (574, 400)
Number of components retained: 400
Reduced data shape (sklearn): (574, 400)
Explained variance ratio (sklearn): [0.04777769 0.03725629 0.03427611 0.0265207  0.02182789 0.01832401
 0.01549001 0.01450249 0.01206209 0.01151309 0.01066848 0.00945816
 0.00923541 0.00881043 0.00846386 0.00774428 0.00758392 0.00717494
 0.00702501 0.0065385  0.00632394 0.00597568 0.00595614 0.00566321
 0.00527935 0.00509829 0.00503669 0.00479993 0.0047587  0.00471187
 0.00456325 0.004299   0.00428154 0.00423483 0.00418563 0.00408096
 0.00406631 0.0040213  0.00392481 0.0038951  0.0037937  0.00374534
 0.00368772 0.00366683 0.00358775 0.0035479  0.00349782 0.00346277
 0.00340288 0.00338083 0.00333573 0.00330953 0.00327049 0.00323506
 0.00321189 0.00318098 0.00316745 0.00313947 0.00308969 0.00306535
 0.00303164 0.00301099 0.00297727 0.00294546 0.0029335  0.0029102
 0.00286837 0.00286819 0.00284626 0.00283578 0.00280194 0.0027964
 0.00277202 0.0027444  0.00272778 0.00271292 0.00267641

In [35]:
from sklearn.metrics.pairwise import rbf_kernel
from scipy.linalg import eigh

def kpca_rbf(X, gamma, n_components):
    # Compute the RBF kernel matrix
    K = rbf_kernel(X, gamma=gamma)
    
    # Center the kernel matrix
    N = K.shape[0]
    one_n = np.ones((N, N)) / N
    K_centered = K - one_n @ K - K @ one_n + one_n @ K @ one_n
    
    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = eigh(K_centered)
    
    # Sort eigenvalues and eigenvectors in descending order
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    
    # Select the top n_components eigenvectors (principal components)
    X_kpca = np.dot(K_centered, eigenvectors[:, :n_components])
    
    return X_kpca

# Usage example
gamma = 0.1
n_components = 10
X_kpca_rbf = kpca_rbf(normalized_data, gamma, n_components)
print(f'Reduced data shape (KPCA with RBF): {X_kpca_rbf.shape}')

ValueError: Input contains NaN.

In [36]:
from sklearn.metrics.pairwise import rbf_kernel
from scipy.linalg import eigh

def polynomial_kernel(X, degree, coef0):
    return (X @ X.T + coef0) ** degree

def combined_kernel(X, kernel1, kernel2, alpha=0.5):
    return alpha * kernel1(X) + (1 - alpha) * kernel2(X)

def kpca(X, kernel_func, n_components):
    # Compute the kernel matrix
    K = kernel_func(X)
    
    # Center the kernel matrix
    N = K.shape[0]
    one_n = np.ones((N, N)) / N
    K_centered = K - one_n @ K - K @ one_n + one_n @ K @ one_n
    
    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = eigh(K_centered)
    eigenvalues, eigenvectors = eigenvalues[::-1], eigenvectors[:, ::-1]
    
    # Select the top n_components eigenvectors (alphas) and eigenvalues (lambdas)
    alphas = eigenvectors[:, :n_components]
    lambdas = eigenvalues[:n_components]
    
    # Normalize the eigenvectors (alphas)
    for i in range(n_components):
        alphas[:, i] = alphas[:, i] / np.sqrt(lambdas[i])
    
    return alphas, lambdas

# Usage for Combined Kernels
gamma = 0.1
degree = 3
coef0 = 1
alpha = 0.5
num_components = 10 

# Define the combined kernel function
def combined_kernel_func(X):
    return combined_kernel(X, lambda X: rbf_kernel(X, gamma=gamma), lambda X: polynomial_kernel(X, degree, coef0), alpha)

# Perform KPCA with the combined kernel
alphas_combined, lambdas_combined = kpca(normalized_data, combined_kernel_func, num_components)

print(f'KPCA (Combined Kernels) - reduced data shape: {alphas_combined.shape}')

ValueError: Input contains NaN.

In [37]:
def linear_kernel(X):
    return X @ X.T

# Usage for Linear Kernel
alphas_linear, lambdas_linear = kpca(normalized_data, linear_kernel, num_components)

print(f'KPCA (Linear Kernel) - reduced data shape: {alphas_linear.shape}')

ValueError: array must not contain infs or NaNs

In [24]:
def top_features_by_covariance(cov_matrix, top_n=10):
    covariances = np.diag(cov_matrix)
    top_indices = np.argsort(covariances)[-top_n:]
    return top_indices

# Calculate covariance matrix
cov_matrix = compute_covariance_matrix(normalized_data)

# Identify top 10 features with highest covariance
top_features_indices = top_features_by_covariance(cov_matrix)
top_features = features.columns[top_features_indices]

print(f'Top 10 features with highest covariance: {top_features}')

Top 10 features with highest covariance: Index(['hsa-mir-422a', 'hsa-mir-4464', 'hsa-mir-3186', 'hsa-mir-299',
       'hsa-mir-4321', 'hsa-mir-301a', 'hsa-mir-494', 'hsa-mir-4771-2',
       'hsa-mir-1245a', 'hsa-mir-3606'],
      dtype='object')


In [39]:
import numpy as np

# Function to calculate the distance between two points
def dis(x1, x2):
    return np.linalg.norm(x1 - x2)

# Function to perform classification
def myclassifier(train_data, train_labels, test_data):
    pred = []

    for testpoint in test_data:
        pred_dis = []
        for trainpoint in train_data:
            pred_dis.append(dis(testpoint, trainpoint))

        pred.append(train_labels[np.argmin(pred_dis)])

    return np.array(pred)

# Function to calculate accuracy
def calculate_accuracy(true_labels, predicted_labels):
    # Ensure that the true labels and predicted labels have the same length
    if len(true_labels) != len(predicted_labels):
        raise ValueError("Length of true_labels and predicted_labels must be the same.")

    # Count the number of correct predictions
    correct_predictions = sum(1 for true, predicted in zip(true_labels, predicted_labels) if true == predicted)

    # Calculate accuracy as the ratio of correct predictions to total predictions
    accuracy = correct_predictions / len(true_labels)

    return accuracy

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Split the data into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(normalized_data, labels, test_size=0.2, random_state=42)

# Apply PCA on the training data using scikit-learn
pca = PCA(n_components=0.95)
train_data_pca = pca.fit_transform(train_data)
test_data_pca = pca.transform(test_data)

# Apply KPCA on the training data
gamma = 0.1
degree = 3
coef0 = 1
alpha = 0.5
num_components = 10  # Adjust this number based on your needs

# KPCA with RBF Kernel
train_data_kpca_rbf, _ = kpca(train_data, lambda X: rbf_kernel(X, gamma=gamma), num_components)
test_data_kpca_rbf, _ = kpca(test_data, lambda X: rbf_kernel(X, gamma=gamma), num_components)

# Perform classification using the minimum distance classifier
predictions_pca = myclassifier(train_data_pca, train_labels, test_data_pca)
predictions_kpca_rbf = myclassifier(train_data_kpca_rbf, train_labels, test_data_kpca_rbf)

# Calculate accuracy
accuracy_pca = calculate_accuracy(test_labels, predictions_pca)
accuracy_kpca_rbf = calculate_accuracy(test_labels, predictions_kpca_rbf)

print(f'PCA Accuracy: {accuracy_pca}')
print(f'KPCA (RBF Kernel) Accuracy: {accuracy_kpca_rbf}')

KeyError: 148