# Simple Baseline Model

Building a straightforward baseline using only the tabular features (NDVI, Height, etc.) without images or complex feature engineering.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global RNG so NumPy-driven randomness is repeatable across runs.
np.random.seed(42)
# Use seaborn's white-grid theme for the figures drawn later in the notebook.
sns.set_style('whitegrid')

## 1. Load and Prepare Data

In [None]:
# Read the training table; each row carries one `target` value for one `target_name`.
train_df = pd.read_csv('competition/train.csv')

# Columns that identify an image together with its metadata; they form the row key
# of the wide table.
id_cols = ['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']

# Reshape to wide format: one row per key, one column per target name.
# pivot_table applies its default aggregation (mean) if a key/target pair repeats.
train_wide = (
    train_df
    .pivot_table(index=id_cols, columns='target_name', values='target')
    .reset_index()
)

print(f"Training samples: {len(train_wide)}")
train_wide.head()

## 2. Feature Engineering (Minimal)

Just extract basic date features and encode categorical variables.

In [None]:
# Parse the sampling date once, then derive the calendar features from it.
sampling_dates = pd.to_datetime(train_wide['Sampling_Date'])
train_wide['Sampling_Date'] = sampling_dates
train_wide['Month'] = sampling_dates.dt.month
train_wide['Day_of_Year'] = sampling_dates.dt.dayofyear

# Keep the 10 most frequent species as-is; every other value collapses into 'Other'.
species = train_wide['Species']
top_species = species.value_counts().head(10).index
species_group = species.where(species.isin(top_species), 'Other')

# One-hot encode State first, then the grouped species, so the encoded columns
# end up ordered as State_* followed by Species_*.
train_encoded = pd.get_dummies(train_wide, columns=['State'], prefix='State')
train_encoded['Species_Group'] = species_group
train_encoded = pd.get_dummies(train_encoded, columns=['Species_Group'], prefix='Species')

print(f"Features after encoding: {train_encoded.shape[1]} columns")

## 3. Define Features and Targets

In [None]:
# The five regression targets produced by the pivot above.
target_cols = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']

# Identifier, raw date and raw species columns are not model inputs (date and species
# are represented by the derived/encoded columns), and the targets must not leak in.
exclude_cols = ['image_path', 'Sampling_Date', 'Species', *target_cols]

# Everything that is not excluded is a feature; the original column order is kept.
all_columns = train_encoded.columns
feature_cols = all_columns[~all_columns.isin(exclude_cols)].tolist()

print(f"Number of features: {len(feature_cols)}")
print(f"\nFeatures: {feature_cols}")
print(f"\nTargets: {target_cols}")

In [None]:
# Split the encoded table into the feature matrix and the multi-column target frame.
X = train_encoded.loc[:, feature_cols]
y = train_encoded.loc[:, target_cols]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Total count of missing cells in each frame (0 means nothing is missing).
missing_in_X = X.isna().sum().sum()
missing_in_y = y.isna().sum().sum()
print(f"\nAny missing values in X: {missing_in_X}")
print(f"Any missing values in y: {missing_in_y}")

## 4. Train/Validation Split

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")

## 5. Baseline Model 1: Random Forest (Multi-Output)

Train a Random Forest for all 5 targets. The forest is wrapped in `MultiOutputRegressor`, which fits one independent forest per target rather than a single joint model.

In [None]:
print("Training Random Forest...")

# Hyper-parameters of the base forest. The depth and split/leaf-size limits restrict
# how far each tree can grow; n_jobs=-1 uses all available cores.
rf_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_params)

# MultiOutputRegressor fits a separate copy of the base estimator for each target column.
multi_rf = MultiOutputRegressor(rf_model)
multi_rf.fit(X_train, y_train)

print("Training complete!")

## 6. Evaluate Performance

In [None]:
# Predict on both partitions with the fitted multi-output model.
y_train_pred, y_val_pred = (multi_rf.predict(part) for part in (X_train, X_val))

# Wrap the raw arrays in DataFrames that reuse the target names and the row index
# of the matching ground-truth frame, so predictions align with actuals by label.
y_train_pred_df = pd.DataFrame(y_train_pred, index=y_train.index, columns=target_cols)
y_val_pred_df = pd.DataFrame(y_val_pred, index=y_val.index, columns=target_cols)

In [None]:
# Calculate R² for each target
def calculate_metrics(y_true, y_pred, set_name=''):
    """Print per-target R², MAE and RMSE, then a weighted R² summary.

    The targets evaluated are those listed in the module-level ``target_cols``.

    Args:
        y_true: Frame of actual values, indexable by each name in ``target_cols``.
        y_pred: Frame of predicted values, indexable by the same names.
        set_name: Label printed in the report header (e.g. the data split name).

    Returns:
        A tuple ``(r2_scores, competition_score)`` where ``r2_scores`` maps each
        target name to its R² and ``competition_score`` is the weighted sum of
        those R² values using the fixed weights defined below.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"{set_name} Performance")
    print(f"{banner}")

    r2_scores = {}
    for name in target_cols:
        actual, predicted = y_true[name], y_pred[name]

        r2_scores[name] = r2_score(actual, predicted)
        mae = mean_absolute_error(actual, predicted)
        # RMSE is the square root of the mean squared error.
        rmse = np.sqrt(mean_squared_error(actual, predicted))

        print(f"\n{name}:")
        print(f"  R² Score: {r2_scores[name]:.4f}")
        print(f"  MAE: {mae:.2f}g")
        print(f"  RMSE: {rmse:.2f}g")

    # Per-target weights used for the summary score; they sum to 1.0.
    weights = {
        'Dry_Green_g': 0.1,
        'Dry_Dead_g': 0.1,
        'Dry_Clover_g': 0.1,
        'GDM_g': 0.2,
        'Dry_Total_g': 0.5,
    }
    competition_score = sum(weights[name] * r2_scores[name] for name in target_cols)

    print(f"\n{banner}")
    print(f"Competition Score (Weighted R²): {competition_score:.4f}")
    print(f"{banner}")

    return r2_scores, competition_score

# Report on the rows the model was fitted on (an optimistic reference point).
train_r2, train_score = calculate_metrics(
    y_true=y_train, y_pred=y_train_pred_df, set_name="TRAINING SET"
)

# Report on the held-out rows.
val_r2, val_score = calculate_metrics(
    y_true=y_val, y_pred=y_val_pred_df, set_name="VALIDATION SET"
)

## 7. Visualize Predictions vs Actuals

In [None]:
# One predicted-vs-actual scatter panel per target, using the validation rows.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for ax, col in zip(axes, target_cols):
    actual = y_val[col]
    predicted = y_val_pred_df[col]

    ax.scatter(actual, predicted, alpha=0.5)

    # Diagonal y = x spanning the joint range of actuals and predictions;
    # points on this line are predicted exactly.
    lo = min(actual.min(), predicted.min())
    hi = max(actual.max(), predicted.max())
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect')

    ax.set_xlabel(f'Actual {col}')
    ax.set_ylabel(f'Predicted {col}')
    ax.set_title(f'{col}\nR² = {val_r2[col]:.3f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

# The 2x3 grid has six slots but there are five targets; drop the unused last panel.
fig.delaxes(axes[-1])
plt.tight_layout()
plt.show()

## 8. Feature Importance

In [None]:
# One bar chart per target showing the 10 most important features of that
# target's fitted forest (multi_rf.estimators_ holds one estimator per target).
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for ax, col, estimator in zip(axes, target_cols, multi_rf.estimators_):
    ranked = pd.DataFrame({
        'feature': feature_cols,
        'importance': estimator.feature_importances_,
    }).sort_values('importance', ascending=False)
    top_features = ranked.head(10)

    bar_positions = range(len(top_features))
    ax.barh(bar_positions, top_features['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(f'Top 10 Features for {col}')
    # Flip the axis so the most important feature is drawn at the top.
    ax.invert_yaxis()

# The 2x3 grid has six slots but there are five targets; drop the unused last panel.
fig.delaxes(axes[-1])
plt.tight_layout()
plt.show()

## 9. Check Prediction Consistency

Do our predictions respect the mathematical relationships?
- Dry_Total_g should ≈ Dry_Green_g + Dry_Dead_g + Dry_Clover_g
- GDM_g should ≈ Dry_Green_g + Dry_Clover_g

In [None]:
# Component predictions used to rebuild the two aggregate targets.
green = y_val_pred_df['Dry_Green_g']
dead = y_val_pred_df['Dry_Dead_g']
clover = y_val_pred_df['Dry_Clover_g']

# Gap between the predicted total and the sum of the predicted components.
y_val_pred_df['calc_total'] = green + dead + clover
y_val_pred_df['total_diff'] = y_val_pred_df['Dry_Total_g'] - y_val_pred_df['calc_total']

# Gap between the predicted GDM and the sum of predicted green and clover.
y_val_pred_df['calc_gdm'] = green + clover
y_val_pred_df['gdm_diff'] = y_val_pred_df['GDM_g'] - y_val_pred_df['calc_gdm']

print("Prediction Consistency Check:")
print(f"\nDry_Total_g vs sum of components:")
print(y_val_pred_df['total_diff'].describe())

print(f"\nGDM_g vs (Green + Clover):")
print(y_val_pred_df['gdm_diff'].describe())

# Histogram of each gap; a spike at zero would mean the predictions are consistent.
panels = [
    ('total_diff', 'Dry_Total_g - (Green + Dead + Clover)', 'Total Biomass Prediction Consistency'),
    ('gdm_diff', 'GDM_g - (Green + Clover)', 'GDM Prediction Consistency'),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (diff_col, x_label, title) in zip(axes, panels):
    ax.hist(y_val_pred_df[diff_col], bins=30, edgecolor='black')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.axvline(0, color='red', linestyle='--', label='Perfect consistency')
    ax.legend()

plt.tight_layout()
plt.show()

## 10. Generate Submission File

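The eventual goal is to train on the full dataset and create predictions for the test set. Because the test features are not available yet, the cells below only write a placeholder submission that uses the per-target mean of the training targets.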

In [None]:
# Read the test table and report its size, both in rows and in distinct images.
test_df = pd.read_csv('competition/test.csv')

n_test_rows = len(test_df)
n_test_images = test_df['image_path'].nunique()
print(f"Test samples: {n_test_rows}")
print(f"Unique test images: {n_test_images}")

test_df.head(10)

In [None]:
# Note: The test.csv doesn't have the features we need (NDVI, Height, State, Species, Date).
# The original note says these will be provided at scoring time (TODO: confirm).
# Until then the trained model cannot be applied, so this cell writes a constant
# baseline: every row receives the mean of its target over the labelled data.

print("Creating baseline submission file...")
print("Note: Test features not available yet, using mean predictions as placeholder")

# Per-target mean over all labelled rows in `y` (training and validation rows together).
mean_predictions = y.mean()
print("\nMean predictions (from training data):")
print(mean_predictions)

# Look up each test row's target name in the table of means.
submission = test_df.copy()
submission['target'] = submission['target_name'].map(mean_predictions)

# Series.map leaves NaN for any target_name that has no entry in mean_predictions.
# Fail loudly instead of silently writing NaN predictions to the submission file.
unmapped = submission.loc[submission['target'].isna(), 'target_name'].unique()
if len(unmapped) > 0:
    raise ValueError(
        "No usable mean prediction for target_name values: "
        f"{sorted(map(str, unmapped))}"
    )

# Save only the columns the submission file needs.
submission[['sample_id', 'target']].to_csv('submission_baseline.csv', index=False)
print("\nSubmission saved to submission_baseline.csv")
print(f"Submission shape: {submission.shape}")
submission.head(10)

## Summary

### Baseline Model Performance:
- Simple Random Forest using only tabular features (NDVI, Height, State, Species, Date)
- No image data used
- No advanced feature engineering

### Key Observations:
1. Check the validation R² scores above - which targets are easiest/hardest to predict?
2. Look at feature importance - is NDVI or Height more useful?
3. Predictions are not perfectly consistent (Total ≠ exact sum of components)

### Next Steps to Improve:
1. **Add image features** - CNNs or pre-trained models on the images
2. **Enforce consistency** - Predict components, then sum to get total (or vice versa)
3. **Better feature engineering** - Interactions, species groupings, seasonal patterns
4. **Try other models** - XGBoost, LightGBM, Neural Networks
5. **Ensemble** - Combine multiple models
6. **Cross-validation** - More robust evaluation