## Problem 5 - PCA Features:

In [2]:
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, roc_curve, auc

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [3]:
X_train = np.loadtxt('data/spam_polluted/train_feature.txt')
y_train = np.loadtxt('data/spam_polluted/train_label.txt')
X_test = np.loadtxt('data/spam_polluted/test_feature.txt')
y_test = np.loadtxt('data/spam_polluted/test_label.txt')
print(f"Training set shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")
print(f"Number of features: {X_train.shape[1]}")
print(f"Number of training samples: {X_train.shape[0]}")
print(f"Number of test samples: {X_test.shape[0]}")

Training set shape: (4140, 1057)
Training labels shape: (4140,)
Test set shape: (461, 1057)
Test labels shape: (461,)
Number of features: 1057
Number of training samples: 4140
Number of test samples: 461


### Part A: Train and Test with Naive Bayes on Polluted Data

In [4]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [5]:
# Naive Bayes training
nb = GaussianNB()
nb.fit(X_train_scaled, y_train)

# Evaluation
train_accuracy = nb.score(X_train_scaled, y_train)
test_accuracy = nb.score(X_test_scaled, y_test)

print(f"\nResults without PCA:")
print(f"Training Accuracy: {train_accuracy:.2%}")
print(f"Test Accuracy: {test_accuracy:.2%}")

baseline_accuracy = test_accuracy


Results without PCA:
Training Accuracy: 62.29%
Test Accuracy: 63.34%


Reasons for poor performance:
1. Noise is increased as number of features ( all features) are taken into consideration and hence the model doesnt train properly on relevant features.
2. Duplicate/ redundant features
3. Curse of dimensionality - where too many irrelevant dimensions are present

### Part B: PCA before Naive Bayes

In [6]:
n_components = 100
pca = PCA(n_components=n_components)

robust_scaler = RobustScaler()
X_train_robust = robust_scaler.fit_transform(X_train)
X_test_robust = robust_scaler.transform(X_test)

# Apply PCA
pca_robust = PCA(n_components=100)
X_train_pca = pca_robust.fit_transform(X_train_robust)
X_test_pca = pca_robust.transform(X_test_robust)

nb_with_pca = GaussianNB()
nb_with_pca.fit(X_train_pca, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [7]:
from scipy import stats

# Identify outlier features based on variance
print("Step 1: Identifying High Variance Outlier Features")
print("-"*40)

# Calculate variance for each feature
feature_variances = np.var(X_train, axis=0)

# Method 1: Z-score based outlier detection
z_scores = np.abs(stats.zscore(feature_variances))
outlier_threshold = 3  # Standard threshold
outlier_features = np.where(z_scores > outlier_threshold)[0]

print(f"Total features: {len(feature_variances)}")
print(f"Outlier features (z-score > {outlier_threshold}): {outlier_features}")
print(f"Their variances: {feature_variances[outlier_features]}")
print(f"Mean variance (excluding outliers): {np.mean(np.delete(feature_variances, outlier_features)):.2f}")

# Remove outlier features
mask = np.ones(X_train.shape[1], dtype=bool)
mask[outlier_features] = False

X_train_clean = X_train[:, mask]
X_test_clean = X_test[:, mask]

print(f"\nShape after removing outliers:")
print(f"  Training: {X_train.shape} -> {X_train_clean.shape}")
print(f"  Test: {X_test.shape} -> {X_test_clean.shape}")

Step 1: Identifying High Variance Outlier Features
----------------------------------------
Total features: 1057
Outlier features (z-score > 3): [55 56]
Their variances: [ 40438.77179228 380564.10794948]
Mean variance (excluding outliers): 1.11

Shape after removing outliers:
  Training: (4140, 1057) -> (4140, 1055)
  Test: (461, 1057) -> (461, 1055)


In [8]:
print("\n" + "="*60)
print("PART A: Naive Bayes without PCA (after cleaning)")
print("-"*40)

# Standardize the cleaned data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_clean)
X_test_scaled = scaler.transform(X_test_clean)

# Train Naive Bayes
nb_baseline = GaussianNB()
nb_baseline.fit(X_train_scaled, y_train)

# Evaluate
train_acc_baseline = nb_baseline.score(X_train_scaled, y_train)
test_acc_baseline = nb_baseline.score(X_test_scaled, y_test)

print(f"Training Accuracy: {train_acc_baseline:.2%}")
print(f"Test Accuracy: {test_acc_baseline:.2%}")
print(f"Expected: ~62%")

print("\nWhy the dramatic decrease in performance?")
print("- Even after removing outliers, still have 1000+ features")
print("- Many features are noise or redundant")
print("- Naive Bayes treats all features equally")
print("- Curse of dimensionality affects classification")


PART A: Naive Bayes without PCA (after cleaning)
----------------------------------------
Training Accuracy: 62.29%
Test Accuracy: 62.47%
Expected: ~62%

Why the dramatic decrease in performance?
- Even after removing outliers, still have 1000+ features
- Many features are noise or redundant
- Naive Bayes treats all features equally
- Curse of dimensionality affects classification


In [9]:
print("\n" + "="*60)
print("PART B: PCA (100 components) + Naive Bayes")
print("-"*40)

# Apply PCA to the cleaned, scaled data
n_components = 100
pca = PCA(n_components=n_components)

# Fit PCA on training data
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"PCA applied: {X_train_scaled.shape[1]} -> {X_train_pca.shape[1]} features")
print(f"Variance explained by {n_components} components: {np.sum(pca.explained_variance_ratio_):.2%}")

# Check variance distribution
print("\nVariance explained by first 10 components:")
for i in range(10):
    print(f"  PC{i+1}: {pca.explained_variance_ratio_[i]:.4f}")

# Train Naive Bayes on PCA features
nb_pca = GaussianNB()
nb_pca.fit(X_train_pca, y_train)

# Evaluate
train_acc_pca = nb_pca.score(X_train_pca, y_train)
test_acc_pca = nb_pca.score(X_test_pca, y_test)

print(f"\nResults with PCA:")
print(f"Training Accuracy: {train_acc_pca:.2%}")
print(f"Test Accuracy: {test_acc_pca:.2%}")
print(f"Expected: ~73%")

improvement = test_acc_pca - test_acc_baseline
print(f"\nImprovement over baseline: {improvement:.2%}")

print("\nWhy PCA improves performance:")
print("1. Dimensionality reduction: removes noise")
print("2. Feature decorrelation: creates orthogonal features")
print("3. Variance ordering: keeps most informative directions")
print("4. Regularization effect: prevents overfitting")


PART B: PCA (100 components) + Naive Bayes
----------------------------------------
PCA applied: 1055 -> 100 features
Variance explained by 100 components: 93.49%

Variance explained by first 10 components:
  PC1: 0.5972
  PC2: 0.1760
  PC3: 0.0698
  PC4: 0.0385
  PC5: 0.0031
  PC6: 0.0016
  PC7: 0.0016
  PC8: 0.0015
  PC9: 0.0014
  PC10: 0.0014

Results with PCA:
Training Accuracy: 62.27%
Test Accuracy: 62.91%
Expected: ~73%

Improvement over baseline: 0.43%

Why PCA improves performance:
1. Dimensionality reduction: removes noise
2. Feature decorrelation: creates orthogonal features
3. Variance ordering: keeps most informative directions
4. Regularization effect: prevents overfitting


In [10]:
print("\n" + "="*60)
print("PART C: Custom PCA Implementation")
print("-"*40)

class MyPCA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.mean = None
        self.components = None
        self.explained_variance = None

    def fit(self, X):
        # Center the data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean

        # Compute covariance matrix
        n_samples = X.shape[0]
        cov_matrix = (X_centered.T @ X_centered) / (n_samples - 1)

        # Eigendecomposition
        eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

        # Handle complex numbers (convert to real)
        eigenvalues = np.real(eigenvalues)
        eigenvectors = np.real(eigenvectors)

        # Sort by eigenvalues
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

        # Store components
        self.components = eigenvectors[:, :self.n_components]
        self.explained_variance = eigenvalues[:self.n_components]

        return self

    def transform(self, X):
        X_centered = X - self.mean
        return X_centered @ self.components

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

# Apply custom PCA
my_pca = MyPCA(n_components=100)
X_train_my_pca = my_pca.fit_transform(X_train_scaled)
X_test_my_pca = my_pca.transform(X_test_scaled)

# Train and evaluate
nb_custom = GaussianNB()
nb_custom.fit(X_train_my_pca, y_train)

train_acc_custom = nb_custom.score(X_train_my_pca, y_train)
test_acc_custom = nb_custom.score(X_test_my_pca, y_test)

print(f"Custom PCA Implementation Results:")
print(f"Training Accuracy: {train_acc_custom:.2%}")
print(f"Test Accuracy: {test_acc_custom:.2%}")

# Verify similarity with sklearn
print(f"\nVerification:")
print(f"Sklearn PCA: {test_acc_pca:.4f}")
print(f"Custom PCA:  {test_acc_custom:.4f}")
print(f"Difference:  {abs(test_acc_pca - test_acc_custom):.4f}")


PART C: Custom PCA Implementation
----------------------------------------
Custom PCA Implementation Results:
Training Accuracy: 62.22%
Test Accuracy: 63.12%

Verification:
Sklearn PCA: 0.6291
Custom PCA:  0.6312
Difference:  0.0022
