# Africa Soil Property Prediction - Example Notebook

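This notebook demonstrates how to use the soil property prediction package to train models and make predictions. Run it from the directory that contains the notebook: the data, source, and model paths used below (`../data`, `../src`, `../models`) are relative to the current working directory.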

## 1. Import Libraries

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project code importable when the notebook runs from its own directory.
# The imports below use the package-qualified form `src.<module>`, which requires
# the *parent* of `src` ('..') on sys.path; '../src' is kept so bare `<module>`
# imports keep working too. The membership check keeps re-runs of this cell from
# piling up duplicate entries.
for _path in ('..', '../src'):
    if _path not in sys.path:
        sys.path.append(_path)

from src.data_preprocessing import SoilDataPreprocessor, handle_missing_values
from src.models import MultiTargetSoilPredictor, EnsemblePredictor
from src.feature_engineering import SpectralFeatureEngineer

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots created below.
sns.set_style('whitegrid')

## 2. Load and Explore Data

In [None]:
# One preprocessor instance is created here and reused by the later cells
# (feature/target preparation, splitting, and persistence).
preprocessor = SoilDataPreprocessor()

# Read the training set; the path is relative to the notebook's working directory.
train_csv_path = '../data/train.csv'
train_df = preprocessor.load_data(train_csv_path)

# Report the dimensions and announce the preview in a single call, then leave
# head() as the cell's last expression so the notebook renders it as a table.
print(f"Training data shape: {train_df.shape}", "\nFirst few rows:", sep="\n")
train_df.head()

In [None]:
# Plot a histogram of each target column to inspect its distribution.
target_cols = ['Ca', 'P', 'pH', 'SOC', 'Sand']

# A 2x3 grid provides six panels for the five targets; flatten it so panels
# can be addressed with a single index.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# Only targets that actually exist in the frame get a panel, packed from the left.
present_targets = [col for col in target_cols if col in train_df.columns]

for ax, col in zip(axes, present_targets):
    # Missing values are only handled in the next section, so drop NaNs here:
    # histogram binning cannot derive a finite range from data containing NaN.
    ax.hist(train_df[col].dropna(), bins=30, edgecolor='black')
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the panels that received no histogram (e.g. the sixth slot of the grid)
# instead of leaving empty frames in the figure.
for ax in axes[len(present_targets):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 3. Preprocess Data

In [None]:
# Impute missing values with the 'mean' strategy. The result is bound to a new
# name so `train_df` keeps referring to the frame loaded earlier and this cell
# does not overwrite its own input when re-run.
# NOTE(review): assumes handle_missing_values returns a new frame rather than
# mutating its argument in place - confirm against the implementation.
train_df_imputed = handle_missing_values(train_df, strategy='mean')

# Build the feature matrix and the target matrix from the imputed frame.
# NOTE(review): fit=True is applied before the train/validation split below;
# if this fits a scaler/transform on all rows, the validation rows influence
# it and validation metrics may be slightly optimistic - confirm.
X = preprocessor.prepare_features(train_df_imputed, fit=True)
y = preprocessor.prepare_targets(train_df_imputed)

# Features and targets must describe the same samples, row for row.
assert X.shape[0] == y.shape[0], (
    f"Row count mismatch between features ({X.shape[0]}) and targets ({y.shape[0]})"
)

print(f"Feature matrix shape: {X.shape}")
print(f"Target matrix shape: {y.shape}")

In [None]:
# Hold out 20% of the samples for validation.
X_train, X_val, y_train, y_val = preprocessor.split_data(X, y, test_size=0.2)

# Report the shape of each partition.
for label, features, targets in (
    ("Training set", X_train, y_train),
    ("Validation set", X_val, y_val),
):
    print(f"{label}: X={features.shape}, y={targets.shape}")

## 4. Train Models

### 4.1 Random Forest

In [None]:
# Hyperparameters for the Random Forest predictor, kept together so they are
# easy to find and tweak.
rf_params = {'n_estimators': 100, 'max_depth': None}

# Build and fit the model on the training partition.
rf_model = MultiTargetSoilPredictor(model_type='random_forest', **rf_params)
rf_model.train(X_train, y_train)

# Score on the validation partition and print the resulting metrics.
rf_metrics = rf_model.evaluate(X_val, y_val)
rf_model.print_evaluation(rf_metrics)

### 4.2 XGBoost

In [None]:
# Hyperparameters for the XGBoost predictor, kept together so they are easy
# to find and tweak.
xgb_params = {'n_estimators': 100, 'max_depth': 6}

# Build and fit the model on the training partition.
xgb_model = MultiTargetSoilPredictor(model_type='xgboost', **xgb_params)
xgb_model.train(X_train, y_train)

# Score on the validation partition and print the resulting metrics.
xgb_metrics = xgb_model.evaluate(X_val, y_val)
xgb_model.print_evaluation(xgb_metrics)

### 4.3 Compare Models

In [None]:
# Compare the overall validation R² of the two trained models.
models = ['Random Forest', 'XGBoost']
r2_scores = [
    rf_metrics['overall']['R2'],
    xgb_metrics['overall']['R2']
]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(models, r2_scores)
ax.set_ylabel('R² Score')
ax.set_title('Model Comparison - Overall R² Score')

# R² is negative when a model does worse than predicting the mean, so the lower
# limit follows the data (with 10% headroom) instead of being pinned at 0, which
# would hide such bars. When every score is non-negative the range stays [0, 1].
ax.set_ylim(min(0.0, 1.1 * min(r2_scores)), 1)
# Zero reference line so bars below the baseline are easy to read.
ax.axhline(0, color='black', lw=0.8)

plt.show()

## 5. Visualize Predictions

In [None]:
# Predict the validation set with the Random Forest model.
y_pred_rf = rf_model.predict(X_val)

# One actual-vs-predicted scatter panel per target.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Never index past the columns that y_val actually has.
n_plotted = min(len(target_cols), y_val.shape[1])

for i, target in enumerate(target_cols[:n_plotted]):
    ax = axes[i]
    actual = y_val[:, i]
    predicted = y_pred_rf[:, i]

    ax.scatter(actual, predicted, alpha=0.5)

    # Dashed red identity line across the range of the actual values: points
    # on it are perfect predictions.
    lo, hi = actual.min(), actual.max()
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

    ax.set_xlabel(f'Actual {target}')
    ax.set_ylabel(f'Predicted {target}')
    ax.set_title(f'{target} - Actual vs Predicted')

    # Annotate the panel with the R² stored for this target in rf_metrics.
    r2 = rf_metrics['per_target'][target]['R2']
    ax.text(0.05, 0.95, f'R² = {r2:.3f}',
            transform=ax.transAxes,
            verticalalignment='top')

# Hide panels without data (e.g. the sixth slot of the 2x3 grid) rather than
# leaving empty frames in the figure.
for ax in axes[n_plotted:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Feature Engineering with PCA

In [None]:
# Project the feature matrix onto 50 principal components.
feature_engineer = SpectralFeatureEngineer()
X_pca = feature_engineer.apply_pca(X, n_components=50, fit=True)

print(f"Original shape: {X.shape}")
print(f"PCA shape: {X_pca.shape}")

# Per-component and cumulative explained variance of the fitted PCA.
variance_ratio = feature_engineer.get_explained_variance_ratio()
cumulative_variance = feature_engineer.get_cumulative_variance()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Components are numbered from 1 so the x-axis reads as a component number /
# component count: the first cumulative value belongs to 1 component, not 0.
component_numbers = range(1, len(variance_ratio) + 1)
ax1.bar(component_numbers, variance_ratio)
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Explained Variance Ratio')
ax1.set_title('Explained Variance by Component')

component_counts = range(1, len(cumulative_variance) + 1)
ax2.plot(component_counts, cumulative_variance)
ax2.set_xlabel('Number of Components')
ax2.set_ylabel('Cumulative Explained Variance')
ax2.set_title('Cumulative Explained Variance')
# Reference line at 90% cumulative variance.
ax2.axhline(y=0.9, color='r', linestyle='--', label='90% variance')
ax2.legend()

plt.tight_layout()
plt.show()

print(f"\nCumulative variance explained: {cumulative_variance[-1]:.4f}")

## 7. Save Model

In [None]:
# Persist the Random Forest model together with the preprocessor object.
import joblib

# Create the output directory if it does not exist yet.
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

# The model is written through its own save_model method; the preprocessor is
# serialized with joblib.
rf_model.save_model(f'{model_dir}/random_forest_model.pkl')
joblib.dump(preprocessor, f'{model_dir}/preprocessor.pkl')

print("Model and preprocessor saved successfully!")

## 8. Summary

In this notebook, we:
1. Loaded and explored the soil spectral data
2. Preprocessed features and targets
3. Trained multiple models (Random Forest, XGBoost)
4. Evaluated model performance
5. Visualized predictions vs actual values
6. Applied PCA for dimensionality reduction
7. Saved the trained model

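The trained models predict five soil properties (Ca, P, pH, SOC, and Sand) from infrared spectral data; refer to the validation metrics in Section 4 to judge how well each property is predicted.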