# Exercise 1: Visualizing Decision Boundaries
Goal: Show how individual trees and the full random forest behave.

In [None]:
# Import the Scikit-Learn lib (SKLearn), it is built in to Colab already so no need to pip install.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay

In [None]:
# Build a reproducible 2-feature toy dataset: 500 samples, both features
# informative, no redundant features, one cluster per class.
X, y = make_classification(
    n_samples=500,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=10,
)

In [None]:
# Train a single decision tree.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split; without it the fitted boundary can differ from one run to the next.
tree = DecisionTreeClassifier(max_depth=3, random_state=10).fit(X, y)

In [None]:
# Train a random forest.
# random_state pins the bootstrap samples and per-split feature choices so the
# plotted boundary is the same on every Restart & Run All.
forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=10).fit(X, y)

In [None]:
# Plot decision boundaries of both models side by side.
# The scatter points share the "coolwarm" colormap with the background so a
# point's colour (true class) can be compared directly with the colour of the
# region it falls in (predicted class). The background is drawn translucent so
# the points stay visible on top of it.
_, ax = plt.subplots(1, 2, figsize=(12, 5))
for panel, model, title in zip(ax, (tree, forest), ("Single Tree", "Random Forest")):
    DecisionBoundaryDisplay.from_estimator(
        model, X, response_method='predict', ax=panel, cmap="coolwarm", alpha=0.5
    )
    panel.scatter(X[:, 0], X[:, 1], c=y, cmap="coolwarm", edgecolor='k')
    panel.set_title(title)
plt.show()

**Task**

Change max_depth, n_estimators and add noise features to see the impact of these changes.

**Questions**

Pair up and answer the following questions together:
*   How 'good' is the initial classification? What are the fail states?
*   What is the optimum setting for both approaches?


________________________________________________________________________________

 # Exercise 2: Tree vs Forest Accuracy with Real Data
 Goal: Show accuracy difference and robustness of Random Forests.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine

# One seed for the split and both models so the printed accuracies are
# reproducible. Change the value to see how much the scores move between splits.
SEED = 42

# Load real data
X, y = load_wine(return_X_y=True)

# Hold out 30% for testing; stratify so the class proportions of y are kept in
# both the training and the test part of this small dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)

tree = DecisionTreeClassifier(random_state=SEED).fit(X_train, y_train)
forest = RandomForestClassifier(n_estimators=100, random_state=SEED).fit(X_train, y_train)

# Accuracy is measured on the held-out test set only
print("Tree accuracy:", accuracy_score(y_test, tree.predict(X_test)))
print("Forest accuracy:", accuracy_score(y_test, forest.predict(X_test)))

**Task**

Compare performance across datasets (load_iris, load_digits, etc.):

*   https://scikit-learn.org/stable/datasets/toy_dataset.html

Test the addition and removal of features and check the sensitivity of our classifiers to this.


**Questions**

Pair up and answer the following question together:
*   What is the average performance gain (expressed as the difference in accuracy) of the forest over the tree for the wine and the iris datasets?

________________________________________________________________________________

# Exercise 3: Apply to Remote Sensing-Like Data
Goal: Use a simplified "satellite" dataset with spectral bands and labels.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Simulated remote sensing dataset: uniform random values in [0, 1) per band
np.random.seed(42)
n_samples = 500
bands = ['blue', 'green', 'red', 'nir']
X = np.random.rand(n_samples, len(bands))
# Label is 1 where the 'red' band (column 2) exceeds 0.5, otherwise 0.
# This stands in for a 'vegetation (1) vs not (0)' label driven by a single band.
y = (X[:, 2] > 0.5).astype(int)

# Score on held-out samples: accuracy measured on the very samples the forest
# was fitted to stays close to 1.0 and would hide the effect of noise bands.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, rf.predict(X_test)))

# Per-band importance as reported by the fitted forest
importances = rf.feature_importances_
pd.Series(importances, index=bands).plot(kind='bar', title='Band Importance')
plt.show()


In [None]:
import seaborn as sns

# Gather the band values and the class label in a single DataFrame for inspection
df = pd.DataFrame(X, columns=bands).assign(**{'class': y})

# Pairwise scatter plots coloured by class: use them to check that 'red' is the
# band whose values separate the two classes.
marker_style = {'alpha': 0.6, 's': 20}
sns.pairplot(df, hue='class', plot_kws=marker_style)
plt.suptitle("Synthetic Spectral Data by Class", y=1.02)
plt.show()

In [None]:
# Simulated Pseudo-Image from Synthetic Data

# X holds one row per sample with its columns in the order given by `bands`
# (blue, green, red, nir). The samples are laid out row by row on a 2D grid.
n_rows, n_cols = 25, 20  # 25 x 20 grid = 500 pixels
assert n_rows * n_cols == X.shape[0]

# Turn the first three columns (blue, green, red) into 2D band images
blue, green, red = (X[:, idx].reshape(n_rows, n_cols) for idx in range(3))

# Stack the bands along a trailing channel axis in red, green, blue order,
# then clamp to [0, 1] for display (a no-op while the values already lie in that range)
rgb = np.clip(np.dstack((red, green, blue)), 0, 1)

# Display
plt.figure(figsize=(6, 6))
canvas = plt.gca()
canvas.imshow(rgb)
canvas.set_title("Simulated Remote Sensing RGB Composite")
canvas.axis('off')
plt.show()


**Task**

Add 'fake' noise bands and observe how the model performance changes. Then adjust the logic of how labels are generated and check classifier limits.

**Question**

What happens to the accuracy when we add a band that deliberately gives the 'opposite' signal to the 'red' band that we set up in our first run of this?

________________________________________________________________________________

# Exercise 4: Putting It All Together with SVM

Goal: Apply classification, feature importance intuition, and decision boundary visualization using an SVM instead of a Random Forest.

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.inspection import DecisionBoundaryDisplay

# Pull the band columns and the label back out of the Exercise 3 DataFrame
X = df[bands].to_numpy()
y = df['class'].to_numpy()

# Standardise the features, then fit an RBF-kernel SVM
scaler = StandardScaler()
classifier = SVC(kernel='rbf', C=1.0, gamma='scale')
svm = make_pipeline(scaler, classifier)
svm.fit(X, y)

# Accuracy on the same samples the pipeline was fitted to
fitted_predictions = svm.predict(X)
print("SVM Accuracy:", accuracy_score(y, fitted_predictions))

# Mean accuracy over 5 cross-validation folds
fold_scores = cross_val_score(svm, X, y, cv=5)
cv_score = fold_scores.mean()
print("Cross-validated accuracy:", round(cv_score, 3))


**Tasks**
*   Try different kernels: 'linear', 'poly', 'rbf', 'sigmoid'.
*   Adjust C and gamma to observe under/overfitting.
*   Compare how SVM performs vs. Random Forest on the same data.
*   Replace synthetic data with a real-world dataset (e.g. load_iris())

**Question**
What is the most accurate kernel and C value to use for your synthetic remote sensing dataset?


# Optional Visualization: Project to 2D with PCA and Plot Decision Boundary

In [None]:
from sklearn.decomposition import PCA

# Reduce the band values to two principal components so they can be plotted
pca = PCA(n_components=2)
X_2D = pca.fit_transform(X)

# Fit a fresh scaler + RBF SVM pipeline on the 2D projection
svm_2d = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='scale'))
svm_2d.fit(X_2D, y)

# Draw the predicted regions, then overlay the projected samples on the same axes
boundary = DecisionBoundaryDisplay.from_estimator(
    svm_2d, X_2D, response_method="predict", cmap="coolwarm"
)
pc1, pc2 = X_2D[:, 0], X_2D[:, 1]
boundary.ax_.scatter(pc1, pc2, c=y, edgecolor='k', cmap="coolwarm", alpha=0.6)
boundary.ax_.set_title("SVM Decision Boundary (PCA-Reduced Data)")
boundary.ax_.set_xlabel("PC1")
boundary.ax_.set_ylabel("PC2")
plt.show()


# Optional Structured Synthetic Data Generation for Remote Sensing Classification

Try taking this alternative synthetic data generation code and inserting it further up the notebook, in place of the cell where you generate your test 'remote sensing' data. What difference does this make to how the methods perform?

In [None]:
import numpy as np
import pandas as pd

# Seed the global NumPy generator so this cell produces the same bands on every
# run, regardless of how many random draws earlier cells have made.
np.random.seed(42)

# Set grid dimensions
n_rows, n_cols = 25, 20
n_samples = n_rows * n_cols
bands = ['blue', 'green', 'red', 'nir']

# Create coordinate grid; flattening goes row by row, so sample i sits at
# (x_coords[i], y_coords[i])
xx, yy = np.meshgrid(np.arange(n_cols), np.arange(n_rows))
x_coords = xx.flatten()
y_coords = yy.flatten()

# Simulate structured class label:
# Vegetation occurs in an elliptical region around the grid centre, with
# semi-axes of 0.3 * n_cols (x) and 0.4 * n_rows (y)
x_center, y_center = n_cols / 2, n_rows / 2
ellipse_mask = (
    (x_coords - x_center)**2 / (0.3 * n_cols)**2
    + (y_coords - y_center)**2 / (0.4 * n_rows)**2
) < 1
y = ellipse_mask.astype(int)  # 1 = vegetation, 0 = other

# (mean, standard deviation) of each band per class, listed in `bands` order
band_stats = {
    # Non-vegetation class (class 0): lower NIR, higher red
    0: [(0.3, 0.1), (0.4, 0.1), (0.6, 0.1), (0.2, 0.05)],
    # Vegetation class (class 1): lower red, higher NIR
    1: [(0.2, 0.05), (0.4, 0.1), (0.3, 0.05), (0.7, 0.1)],
}

# Generate band values: one normal draw per (class, band) pair, filled into the
# rows belonging to that class
X = np.zeros((n_samples, len(bands)))
for label, stats in band_stats.items():
    members = y == label
    for band_idx, (mean, std) in enumerate(stats):
        X[members, band_idx] = np.random.normal(mean, std, size=members.sum())

# Clip all values to [0, 1]
X = np.clip(X, 0, 1)

# Create DataFrame for exploration: band values, class label and grid position
df = pd.DataFrame(X, columns=bands)
df['class'] = y
df['x'] = x_coords
df['y'] = y_coords
