In [2]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Ellipse

# Apply the 'seaborn-v0_8-whitegrid' style sheet to the matplotlib figures drawn in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
# Seed NumPy's global random generator so the np.random draws are reproducible.
np.random.seed(42)



## 1. Understanding Covariance

While variance measures how ONE feature varies, **covariance** measures how TWO features vary **together**.

**Formula**:
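$$\text{Cov}(X, Y) = \frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})$$

This is the *sample* covariance, which is what `np.cov` computes by default. Dividing by $n$ instead of $n-1$ gives the *population* covariance (`np.cov(..., ddof=0)`).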


**Interpretation**:
- **Positive covariance**: When X increases, Y tends to increase
- **Negative covariance**: When X increases, Y tends to decrease  
- **Zero covariance**: X and Y have no *linear* relationship (they are uncorrelated, though not necessarily independent)

### Agricultural Example
In soil analysis:
- Nitrogen and organic matter usually have **positive covariance**
- Sand % and clay % usually have **negative covariance**
- pH and iron might have **zero/low covariance**


In [None]:
# Three synthetic feature pairs illustrating positive, negative and near-zero
# covariance. The pairs are drawn in a fixed order (1, 2, 3) so the sequence of
# np.random calls -- and therefore the generated values -- stays reproducible.
n_samples = 100

# Example 1: Positive covariance
# organic_matter is a scaled copy of nitrogen plus Gaussian noise.
nitrogen = np.random.normal(50, 15, n_samples)
organic_matter = 0.8 * nitrogen + np.random.normal(0, 5, n_samples)

# Example 2: Negative covariance
# clay falls as sand rises (90 - sand), again with Gaussian noise added.
sand = np.random.uniform(20, 70, n_samples)
clay = 90 - sand + np.random.normal(0, 5, n_samples)

# Example 3: Near-zero covariance
# The two features are drawn independently of each other.
ph = np.random.normal(7, 0.5, n_samples)
random_feature = np.random.normal(50, 10, n_samples)

# np.cov returns the 2x2 covariance matrix of the pair; the off-diagonal
# entry [0, 1] is the covariance between the two features.
cov_pos = np.cov(nitrogen, organic_matter)[0, 1]
cov_neg = np.cov(sand, clay)[0, 1]
cov_zero = np.cov(ph, random_feature)[0, 1]

# One (heading, covariance, verdict) row per example; printed as blocks
# separated by a blank line.
covariance_report = [
    ("Nitrogen vs Organic Matter", cov_pos, "POSITIVE"),
    ("Sand % vs Clay %", cov_neg, "NEGATIVE"),
    ("pH vs Random Feature", cov_zero, "NEAR ZERO"),
]
print(
    *(
        f"{heading}\nCovariance: {value:.2f} ({verdict})"
        for heading, value, verdict in covariance_report
    ),
    sep="\n\n",
)


In [None]:
# Visualize covariance types
# One scatter panel per feature pair, side by side. Each panel carries its own
# axis labels and title so the figure can be read without the surrounding cells.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

axes[0].scatter(nitrogen, organic_matter, alpha=0.6)
axes[0].set(
    title=f'Positive Covariance\nCov={cov_pos:.2f}',
    xlabel='Nitrogen',
    ylabel='Organic Matter',
)

axes[1].scatter(sand, clay, alpha=0.6, color='orange')
axes[1].set(
    title=f'Negative Covariance\nCov={cov_neg:.2f}',
    xlabel='Sand %',
    ylabel='Clay %',
)

# The sample covariance of this pair is small but not exactly zero, so the
# panel is titled "Near-Zero" to agree with the value shown beneath it.
axes[2].scatter(ph, random_feature, alpha=0.6, color='green')
axes[2].set(
    title=f'Near-Zero Covariance\nCov={cov_zero:.2f}',
    xlabel='pH',
    ylabel='Random Feature',
)

plt.tight_layout()
plt.show()



## 2. Covariance Matrix
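The covariance matrix collects every pairwise covariance between features: entry $(i, j)$ is the covariance between feature $i$ and feature $j$, the diagonal holds each feature's variance, and the matrix is symmetric.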


In [None]:
# Re-seed the global RNG so this cell produces the same synthetic soil dataset
# every time it runs.
np.random.seed(42)
n = 150

# Phosphorus and organic matter are each a scaled copy of nitrogen plus
# Gaussian noise; pH is drawn on its own. The draws happen in the order
# nitrogen -> phosphorus noise -> pH -> organic-matter noise.
nitrogen = np.random.normal(loc=45, scale=12, size=n)
phosphorus_noise = np.random.normal(loc=0, scale=8, size=n)
phosphorus = 0.7 * nitrogen + phosphorus_noise
ph = np.random.normal(loc=6.8, scale=0.4, size=n)
organic_matter_noise = np.random.normal(loc=0, scale=3, size=n)
organic_matter = 0.6 * nitrogen + organic_matter_noise

# Samples as rows, features as columns; feature_names follows the column order.
feature_names = ['Nitrogen', 'Phosphorus', 'pH', 'Organic Matter']
soil_data = np.stack([nitrogen, phosphorus, ph, organic_matter], axis=1)

# Subtract each column's mean, then take the feature-by-feature covariance
# (rowvar=False tells np.cov that the columns are the variables).
feature_means = soil_data.mean(axis=0)
soil_centered = soil_data - feature_means
cov_matrix = np.cov(soil_centered, rowvar=False)

print('Covariance Matrix:', cov_matrix, sep='\n')


In [None]:
# Render the covariance matrix as an annotated heatmap on an explicit Axes.
# The diverging colormap is centered at 0, so positive and negative
# covariances fall on opposite sides of the color scale.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cov_matrix,
    ax=heatmap_ax,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    xticklabels=feature_names,
    yticklabels=feature_names,
)
heatmap_ax.set_title('Soil Feature Covariance Matrix')
plt.show()



## 3. Eigenvectors and Eigenvalues
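For a square matrix $A$, an eigenvector $v$ is a direction that $A$ only stretches or shrinks, never rotates: $Av = \lambda v$. The eigenvalue $\lambda$ is the stretch factor along that direction. For a covariance matrix, the eigenvectors are the directions of greatest variance in the data and each eigenvalue is the variance along its eigenvector.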


In [None]:
# Small symmetric 2x2 matrix used to demonstrate an eigen-decomposition.
A = np.array([3, 1, 1, 3]).reshape(2, 2)

# np.linalg.eig returns the eigenvalues together with a matrix whose columns
# are the corresponding eigenvectors.
eigen_result = np.linalg.eig(A)
eigenvalues, eigenvectors = eigen_result[0], eigen_result[1]

print('Eigenvalues:', eigenvalues)
print('Eigenvectors:\n', eigenvectors)



## 4. PCA Algorithm
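1. Center the data (subtract each feature's mean)  
2. Compute the covariance matrix  
3. Eigen-decompose the covariance matrix  
4. Sort the eigenvectors by eigenvalue, largest first  
5. Project the centered data onto the top $k$ eigenvectors  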


In [None]:
# PCA steps 3-5: eigen-decompose the covariance matrix, order the components
# by decreasing eigenvalue, and project the centered data onto the leading ones.

# Number of principal components to keep.
n_components = 2

# A covariance matrix is symmetric, so use eigh rather than the general-purpose
# eig: eigh returns real eigenvalues and orthonormal eigenvectors (as columns).
# The sign of each eigenvector is arbitrary, so a projected axis may come out
# mirrored relative to another solver's result.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Reorder so that the largest-eigenvalue direction comes first.
idx = eigenvalues.argsort()[::-1]
eigenvalues_sorted = eigenvalues[idx]
eigenvectors_sorted = eigenvectors[:, idx]

# Project each centered sample onto the leading eigenvectors.
data_pca = soil_centered @ eigenvectors_sorted[:, :n_components]

# Share of the total variance carried by each component; the kept components'
# shares show how much variance survives the reduction.
explained_variance_ratio = eigenvalues_sorted / eigenvalues_sorted.sum()

print('Reduced shape:', data_pca.shape)
print('Explained variance ratio (kept components):', explained_variance_ratio[:n_components])


In [None]:
# Scatter the projected samples: first column on the x-axis, second on the y-axis.
pca_fig, pca_ax = plt.subplots()
pc1_scores = data_pca[:, 0]
pc2_scores = data_pca[:, 1]
pca_ax.scatter(pc1_scores, pc2_scores, alpha=0.6)
pca_ax.set_xlabel('PC1')
pca_ax.set_ylabel('PC2')
pca_ax.set_title('PCA Projection (2D)')
plt.show()



## 5. Key Takeaways
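- Covariance measures how two features vary together  
- The eigenvectors of the covariance matrix give the directions of greatest variance; the eigenvalues give the variance along them  
- PCA reduces dimensionality by keeping the highest-variance directions, retaining as much of the total variance as possible  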
