# CSIRO Pasture Biomass - Exploratory Data Analysis

This notebook explores the training data for the pasture biomass prediction competition.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os

# Plot styling: seaborn white-grid theme and a wider default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# pandas display: never truncate columns, show up to 100 rows
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

## 1. Load the Data

In [None]:
# Read the competition training table (one row per image/target pair)
train_df = pd.read_csv('competition/train.csv')

# The part of sample_id before the first "__" is treated as the image identifier
image_ids = train_df['sample_id'].str.split('__').str[0]

print(f"Total rows: {len(train_df)}")
print(f"Unique images: {image_ids.nunique()}")
print("\nFirst few rows:")
train_df.head(10)

## 2. Basic Data Structure

The data is in "long format" - each image has 5 rows (one per target variable).

In [None]:
# Check data types and missing values
print("Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_df.info()
print("\nMissing values:")
print(train_df.isnull().sum())
print("\nTarget variables:")
print(train_df['target_name'].value_counts())

In [None]:
# Convert to wide format for easier analysis
# Each row = one image with all 5 targets as columns
wide_index_cols = ['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']

# pivot_table aggregates with the mean by default, so a repeated
# (image, target) pair would be averaged silently. Report it instead of hiding it.
n_duplicate_pairs = int(train_df.duplicated(subset=['image_path', 'target_name']).sum())
if n_duplicate_pairs:
    print(f"WARNING: {n_duplicate_pairs} duplicated (image_path, target_name) rows will be averaged")

train_wide = train_df.pivot_table(
    index=wide_index_cols,
    columns='target_name',
    values='target'
).reset_index()

# The pivot groups on the metadata columns, so images may drop out (e.g. a
# missing value in one of the index columns) or split into several rows
# (inconsistent metadata for one image). Compare against the image count.
n_images = train_df['image_path'].nunique()
if len(train_wide) != n_images:
    print(f"WARNING: wide table has {len(train_wide)} rows but train_df has {n_images} unique image_path values")

print(f"Wide format shape: {train_wide.shape}")
train_wide.head()

## 3. Target Variable Distributions

Let's examine the distribution of each biomass component.

In [None]:
# The five biomass target columns; later cells reuse this list
target_cols = [
    'Dry_Green_g',
    'Dry_Dead_g',
    'Dry_Clover_g',
    'GDM_g',
    'Dry_Total_g',
]
# count / mean / std / quartiles for every target
train_wide.loc[:, target_cols].describe()

In [None]:
# One histogram per target on a 2x3 grid, with mean and median marked
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, target in zip(axes, target_cols):
    values = train_wide[target]
    ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
    ax.set(title=f'{target} Distribution', xlabel='Grams', ylabel='Frequency')
    ax.axvline(values.mean(), color='red', linestyle='--', label='Mean')
    ax.axvline(values.median(), color='green', linestyle='--', label='Median')
    ax.legend()

# The grid has six panels but only five targets: drop the unused last panel
fig.delaxes(axes[5])
plt.tight_layout()
plt.show()

In [None]:
# Does Dry_Total_g equal Green + Dead + Clover?
# Both helper columns are stored on train_wide so later cells can inspect them.
train_wide['calculated_total'] = (
    train_wide['Dry_Green_g']
    .add(train_wide['Dry_Dead_g'])
    .add(train_wide['Dry_Clover_g'])
)
train_wide['total_difference'] = train_wide['Dry_Total_g'].sub(train_wide['calculated_total'])

total_diff = train_wide['total_difference']
print("Difference between Dry_Total_g and sum of components:")
print(total_diff.describe())
print(f"\nMax absolute difference: {total_diff.abs().max():.4f} grams")

## 4. Feature Analysis

Let's explore the input features: NDVI, Height, State, Species, and Sampling Date.

In [None]:
# Heading / summary pairs for each input feature, printed in order
feature_summaries = [
    ("NDVI Range:", train_wide['Pre_GSHH_NDVI'].describe()),
    ("\nHeight Range (cm):", train_wide['Height_Ave_cm'].describe()),
    ("\nStates:", train_wide['State'].value_counts()),
    ("\nTop 10 Species Combinations:", train_wide['Species'].value_counts().head(10)),
]
for heading, summary in feature_summaries:
    print(heading)
    print(summary)

In [None]:
# Parse the sampling date, then derive the calendar month and a season label
train_wide['Sampling_Date'] = pd.to_datetime(train_wide['Sampling_Date'])
train_wide['Month'] = train_wide['Sampling_Date'].dt.month

# Season -> months, with December-February labelled as Summer;
# inverted below into the month -> season lookup that .map() needs
months_by_season = {
    'Summer': (12, 1, 2),
    'Autumn': (3, 4, 5),
    'Winter': (6, 7, 8),
    'Spring': (9, 10, 11),
}
season_by_month = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
train_wide['Season'] = train_wide['Month'].map(season_by_month)

print("Seasonal distribution:")
print(train_wide['Season'].value_counts())

## 5. Feature Correlations with Targets

How do NDVI and Height correlate with biomass?

In [None]:
# Pairwise correlations between the two numeric features and all targets
feature_cols = ['Pre_GSHH_NDVI', 'Height_Ave_cm', *target_cols]
corr_matrix = train_wide.loc[:, feature_cols].corr()

# Annotated heatmap, colour scale centred on zero correlation
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# NDVI against three of the targets, one panel each, with a straight-line fit
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ndvi = train_wide['Pre_GSHH_NDVI']

for ax, target in zip(axes, ('Dry_Total_g', 'GDM_g', 'Dry_Green_g')):
    ax.scatter(ndvi, train_wide[target], alpha=0.5)
    ax.set(xlabel='NDVI', ylabel=target, title=f'NDVI vs {target}')

    # Degree-1 least-squares polynomial, evaluated at the observed NDVI values
    linear_fit = np.poly1d(np.polyfit(ndvi, train_wide[target], 1))
    ax.plot(ndvi, linear_fit(ndvi), "r--", alpha=0.8, label='Linear fit')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Average height against three of the targets, one panel each, with a straight-line fit
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
height_cm = train_wide['Height_Ave_cm']

for ax, target in zip(axes, ('Dry_Total_g', 'GDM_g', 'Dry_Green_g')):
    ax.scatter(height_cm, train_wide[target], alpha=0.5)
    ax.set(xlabel='Height (cm)', ylabel=target, title=f'Height vs {target}')

    # Degree-1 least-squares polynomial, evaluated at the observed heights
    linear_fit = np.poly1d(np.polyfit(height_cm, train_wide[target], 1))
    ax.plot(height_cm, linear_fit(height_cm), "r--", alpha=0.8, label='Linear fit')
    ax.legend()

plt.tight_layout()
plt.show()

## 6. Sample Images

Let's visualize some sample images to understand what we're working with.

In [None]:
# Display up to 6 random images with their measurements.
# DataFrame.sample raises when asked for more rows than exist, so the request
# is capped at the table size.
n_samples = min(6, len(train_wide))
sample_images = train_wide.sample(n_samples, random_state=42)

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# Hide ticks and frames on every panel up front so unused panels stay blank
for ax in axes:
    ax.axis('off')

for ax, (_, row) in zip(axes, sample_images.iterrows()):
    # image_path is joined onto the 'competition' data directory
    img_path = os.path.join('competition', row['image_path'])

    # Image.open leaves the underlying file open until the image is closed;
    # the context manager releases the handle once imshow has drawn the image.
    with Image.open(img_path) as img:
        ax.imshow(img)

    title = f"Total: {row['Dry_Total_g']:.1f}g | Green: {row['Dry_Green_g']:.1f}g\n"
    title += f"NDVI: {row['Pre_GSHH_NDVI']:.2f} | Height: {row['Height_Ave_cm']:.1f}cm\n"
    title += f"{row['State']} - {row['Species']}"
    ax.set_title(title, fontsize=9)

plt.tight_layout()
plt.show()

## 7. Data Quality Checks

In [None]:
# Check for outliers using IQR method
def find_outliers(df, column):
    """Select the rows of ``df`` whose ``column`` value lies outside the 1.5*IQR fences.

    Args:
        df: DataFrame to inspect.
        column: Name of the numeric column to test.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)``: the rows whose value
        is strictly below ``Q1 - 1.5 * IQR`` or strictly above
        ``Q3 + 1.5 * IQR``, followed by those two fence values.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)

    lower_bound = first_quartile - fence_width
    upper_bound = third_quartile + fence_width

    # Strict comparisons: a value sitting exactly on a fence is not an outlier
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier], lower_bound, upper_bound

print("Outlier Analysis:\n")
for target in target_cols:
    flagged, fence_low, fence_high = find_outliers(train_wide, target)
    n_flagged = len(flagged)
    # Share of all images flagged for this target, in percent
    pct_flagged = n_flagged / len(train_wide) * 100
    print(f"{target}:")
    print(f"  Outliers: {n_flagged} ({pct_flagged:.1f}%)")
    print(f"  Range: [{fence_low:.2f}, {fence_high:.2f}]")
    print()

In [None]:
# Images whose recorded total biomass (Dry_Total_g) is exactly zero
all_zeros = train_wide.loc[train_wide['Dry_Total_g'].eq(0)]
print(f"Images with zero total biomass: {len(all_zeros)}")
if not all_zeros.empty:
    preview_cols = ['image_path', 'State', 'Species', 'Pre_GSHH_NDVI']
    print("\nExamples:")
    print(all_zeros[preview_cols].head())

In [None]:
# Compare GDM_g with Dry_Green_g + Dry_Clover_g.
# Working hypothesis being checked here: GDM is the green plus clover component.
train_wide['expected_GDM'] = train_wide['Dry_Green_g'].add(train_wide['Dry_Clover_g'])
train_wide['GDM_difference'] = train_wide['GDM_g'].sub(train_wide['expected_GDM'])

# Points on the dashed y = x diagonal are images where the two quantities agree
diagonal_end = train_wide['expected_GDM'].max()

plt.figure(figsize=(10, 6))
plt.scatter(train_wide['expected_GDM'], train_wide['GDM_g'], alpha=0.5)
plt.plot([0, diagonal_end], [0, diagonal_end], 'r--', label='Perfect match')
plt.xlabel('Dry_Green_g + Dry_Clover_g')
plt.ylabel('GDM_g')
plt.title('GDM vs Green + Clover')
plt.legend()
plt.show()

print("GDM difference stats:")
print(train_wide['GDM_difference'].describe())

## 8. Key Insights Summary

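Run all cells above, top to bottom, to reproduce the tables and plots; the findings and next steps can then be discussed. The final cell below saves the wide-format table for later use.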

In [None]:
# Persist the wide table, including the helper columns added by earlier cells
# (calculated_total, total_difference, Month, Season, expected_GDM, GDM_difference)
wide_csv_path = 'competition/train_wide.csv'
train_wide.to_csv(wide_csv_path, index=False)
print("Saved train_wide.csv for future analysis")