# Comparing Our PCA with sklearn

## Introduction

Let's verify our implementation by comparing it side-by-side with sklearn.

### What You'll Learn
1. How to compare two PCA implementations
2. How to verify numerical equivalence
3. How to compare performance
4. When to use each implementation

### Testing Strategy
- Use same data for both
- Compare all outputs
- Time performance
- Verify mathematical equivalence

In [None]:
# Import libraries
# Setup cell: pulls in the numeric/plotting stack, both PCA implementations,
# and configures global plotting and printing defaults for the notebook.
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
# Aliased so the sklearn class and our own class (both named PCA) can coexist.
from sklearn.decomposition import PCA as SklearnPCA
from sklearn.datasets import load_iris, make_classification

# Import our custom PCA
# The relative path is resolved against the current working directory, so this
# assumes the notebook is run from its own folder -- TODO confirm for other setups.
sys.path.append('../2_from_scratch')
from pca_implementation import PCA as CustomPCA

# Global display defaults: plot theme, and 4-decimal fixed-point array printing
# (suppress=True avoids scientific notation for small values).
plt.style.use('seaborn-v0_8-darkgrid')
np.set_printoptions(precision=4, suppress=True)

print('✓ Libraries imported')

## 1. Basic Comparison on Simple Data

In [None]:
# Test data: a small 2-feature toy dataset (8 samples) fitted by both
# implementations so every fitted attribute can be compared directly.
np.random.seed(42)
X = np.array([
    [2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2],
    [3.1, 3.0], [2.3, 2.7], [2.0, 1.6], [1.0, 1.1]
])

# Our implementation
custom_pca = CustomPCA(n_components=2)
X_custom = custom_pca.fit_transform(X)

# sklearn implementation
sklearn_pca = SklearnPCA(n_components=2)
X_sklearn = sklearn_pca.fit_transform(X)

print('Comparison Results:\n')
print('=' * 60)

# Each check is stored (not just printed) so the final verdict can take
# every one of them into account.
mean_match = np.allclose(custom_pca.mean_, sklearn_pca.mean_)
print('\n1. Mean values match:', mean_match)
print('   Custom:', custom_pca.mean_)
print('   Sklearn:', sklearn_pca.mean_)

variance_match = np.allclose(custom_pca.explained_variance_,
                             sklearn_pca.explained_variance_)
print('\n2. Explained variance match:', variance_match)
print('   Custom:', custom_pca.explained_variance_)
print('   Sklearn:', sklearn_pca.explained_variance_)

ratio_match = np.allclose(custom_pca.explained_variance_ratio_,
                          sklearn_pca.explained_variance_ratio_)
print('\n3. Explained variance ratio match:', ratio_match)
print('   Custom:', custom_pca.explained_variance_ratio_)
print('   Sklearn:', sklearn_pca.explained_variance_ratio_)

# Components might have opposite signs (both valid), so compare magnitudes.
# NOTE: an element-wise absolute-value comparison is lenient -- it ignores
# sign entirely rather than allowing only a whole-component flip.
components_match = np.allclose(np.abs(custom_pca.components_),
                               np.abs(sklearn_pca.components_))
print('\n4. Components match (absolute values):', components_match)

# Transformed data might differ in sign for the same reason
transform_match = np.allclose(np.abs(X_custom), np.abs(X_sklearn))
print('\n5. Transformed data match (absolute values):', transform_match)

# Declare success only when ALL five checks pass; previously the mean and
# variance checks were printed but ignored by the verdict.
all_checks = [mean_match, variance_match, ratio_match,
              components_match, transform_match]
if all(all_checks):
    print('\n✓ SUCCESS: Implementations produce equivalent results!')
    print('  (Sign differences are mathematically valid)')
else:
    print('\n✗ WARNING: Check for implementation differences')

## 2. Visual Comparison

In [None]:
# Side-by-side scatter plots of the two projections, one panel per
# implementation, with identical decoration on both.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# (axes, projected data, marker colour, panel title)
panels = (
    (ax1, X_custom, 'blue', 'Custom PCA Implementation'),
    (ax2, X_sklearn, 'red', 'sklearn PCA'),
)

for axis, projected, point_color, heading in panels:
    axis.scatter(projected[:, 0], projected[:, 1], s=100, alpha=0.7,
                 edgecolors='k', linewidths=2, color=point_color)
    # Dashed guide lines through the origin of the PC space
    axis.axhline(0, color='gray', linestyle='--', alpha=0.5)
    axis.axvline(0, color='gray', linestyle='--', alpha=0.5)
    axis.set_xlabel('PC1', fontsize=12)
    axis.set_ylabel('PC2', fontsize=12)
    axis.set_title(heading, fontsize=14, fontweight='bold')
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print('💡 Plots may be mirrored (sign flip) but structure is identical')

## 3. Performance Comparison

In [None]:
# Generate larger dataset (1000 samples x 20 features) for a rough,
# single-run timing of fit_transform in each implementation.
X_large, _ = make_classification(n_samples=1000, n_features=20,
                                 n_informative=15, random_state=42)

print('Performance Test on Larger Dataset:')
print(f'Samples: {X_large.shape[0]}, Features: {X_large.shape[1]}')
print('\n' + '=' * 60)

# time.perf_counter() is used instead of time.time(): it is monotonic and
# has the highest available resolution, which matters for short intervals.

# Time custom PCA
start = time.perf_counter()
custom_pca_large = CustomPCA(n_components=10)
X_custom_large = custom_pca_large.fit_transform(X_large)
custom_time = time.perf_counter() - start

# Time sklearn PCA
start = time.perf_counter()
sklearn_pca_large = SklearnPCA(n_components=10)
X_sklearn_large = sklearn_pca_large.fit_transform(X_large)
sklearn_time = time.perf_counter() - start

print(f'\nCustom PCA time: {custom_time:.4f} seconds')
print(f'sklearn PCA time: {sklearn_time:.4f} seconds')
# The ratio is custom/sklearn, i.e. how many times faster sklearn was.
# Guard against a zero denominator if the interval is below timer resolution.
if sklearn_time > 0:
    print(f'Speedup factor: {custom_time/sklearn_time:.2f}x')
else:
    print('Speedup factor: n/a (sklearn time below timer resolution)')

print('\n💡 sklearn is typically faster (optimized C/Fortran code)')
print('   But our implementation is great for learning!')

## 4. Iris Dataset Comparison

In [None]:
# Project the standardized Iris data to 2-D with both implementations and
# draw the two projections side by side, coloured by species.
from sklearn.preprocessing import StandardScaler

iris = load_iris()
y_iris = iris.target
X_iris = StandardScaler().fit_transform(iris.data)

# Fit and project with each implementation
custom_iris = CustomPCA(n_components=2).fit_transform(X_iris)
sklearn_iris = SklearnPCA(n_components=2).fit_transform(X_iris)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
colors = ['red', 'blue', 'green']

# (axes, projected data, panel title)
panels = (
    (ax1, custom_iris, 'Custom PCA on Iris'),
    (ax2, sklearn_iris, 'sklearn PCA on Iris'),
)

for axis, projected, heading in panels:
    # One scatter call per class so each species gets its own legend entry
    for class_id, class_color in enumerate(colors):
        members = y_iris == class_id
        axis.scatter(projected[members, 0], projected[members, 1],
                     color=class_color, alpha=0.6, s=50,
                     edgecolors='k', linewidths=0.5,
                     label=iris.target_names[class_id])
    axis.set_title(heading, fontsize=14, fontweight='bold')
    axis.set_xlabel('PC1', fontsize=12)
    axis.set_ylabel('PC2', fontsize=12)
    axis.legend()
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print('✓ Both implementations separate the classes identically')

## Key Takeaways

### Verification Results

1. **Mathematically equivalent**: Same explained variance, and the same components and transformed data up to a sign flip
2. **Sign ambiguity**: Eigenvectors can flip sign (both valid)
3. **Performance**: sklearn is faster (optimized), but both work
4. **Understanding**: Our implementation helps understand internals

### When to Use Each

**Use Custom PCA when:**
- Learning PCA concepts
- Need to modify algorithm
- Educational purposes
- Small datasets

**Use sklearn PCA when:**
- Production code
- Large datasets
- Need speed
- Integration with sklearn pipelines

### Next Steps

Now we'll apply PCA to real agricultural soil data!

---

**Excellent!** Both implementations produce equivalent results (up to sign).

Continue to: `../4_agricultural_application/soil_data_exploration.ipynb`