# Weather Feature Engineering (Simplified)

Adding weather features without climatology (to avoid rate limits).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from weather_api import WeatherAPIClient, calculate_daylength, get_southern_hemisphere_season

# Use seaborn's 'whitegrid' style for every plot drawn in this notebook.
sns.set_style('whitegrid')
# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

## 1. Load Training Data

In [None]:
# The competition file is in long format: each row carries one
# `target_name` / `target` pair for a sample.
train_df = pd.read_csv('competition/train.csv')

# Columns that together identify one sample; each distinct `target_name`
# becomes its own column in the wide frame.
# NOTE(review): pivot_table aggregates with the mean by default, so repeated
# (sample, target_name) rows would be averaged rather than reported.
sample_id_columns = [
    'image_path',
    'Sampling_Date',
    'State',
    'Species',
    'Pre_GSHH_NDVI',
    'Height_Ave_cm',
]
train_wide = train_df.pivot_table(
    index=sample_id_columns,
    columns='target_name',
    values='target',
).reset_index()

# Parse the sampling date so it can be compared and formatted as a timestamp later on.
train_wide['Sampling_Date'] = pd.to_datetime(train_wide['Sampling_Date'])

samples_per_state = train_wide['State'].value_counts().to_dict()
first_sample = train_wide['Sampling_Date'].min()
last_sample = train_wide['Sampling_Date'].max()

print(f"Training samples: {len(train_wide)}")
print(f"States: {samples_per_state}")
print(f"Date range: {first_sample} to {last_sample}")

## 2. Fetch Weather Data

In [None]:
weather_client = WeatherAPIClient(cache_dir='weather_cache')

# One request window covering every sampling date in the training set.
min_date = train_wide['Sampling_Date'].min()
max_date = train_wide['Sampling_Date'].max()
window_start = min_date.strftime('%Y-%m-%d')
window_end = max_date.strftime('%Y-%m-%d')

print("Fetching weather data...\n")

# state name -> weather DataFrame returned by the client for that state
weather_data = {}
for state in train_wide['State'].unique():
    print(f"{state}:")
    # NOTE(review): days_before=30 presumably pads the window so the 30-day
    # rolling features are fully populated at the first sampling date --
    # confirm against WeatherAPIClient.fetch_weather_data.
    weather_data[state] = weather_client.fetch_weather_data(
        state=state,
        start_date=window_start,
        end_date=window_end,
        days_before=30,
    )

print("\n✓ Weather data loaded")

## 3. Calculate All Features

In [None]:
def enrich_weather_data(df, rain_threshold=5):
    """Add rolling, water-balance, dry-spell, daylength and season features.

    Parameters
    ----------
    df : pandas.DataFrame
        Weather records for one state. Must contain the columns ``date``,
        ``precipitation``, ``temp_max``, ``temp_min``, ``temp_mean``, ``et0``
        and ``lat``. The input frame is not modified.
    rain_threshold : float, default 5
        A row resets the ``days_since_rain`` counter when its
        ``precipitation`` is strictly greater than this value (same unit as
        the ``precipitation`` column).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by ``date`` with these columns added:
        ``rainfall_7d``, ``rainfall_30d``, ``temp_max_7d``, ``temp_min_7d``,
        ``temp_mean_7d``, ``temp_mean_30d``, ``temp_range_7d``, ``et0_7d``,
        ``et0_30d``, ``water_balance_7d``, ``water_balance_30d``,
        ``days_since_rain``, ``daylength`` and ``season``.

    Notes
    -----
    The rolling windows count rows, not calendar days, so "7d"/"30d" are only
    true day spans if ``df`` has exactly one row per day with no gaps
    (assumed -- verify against the weather source). ``min_periods=1`` means
    the earliest rows use shorter, partial windows instead of NaN.
    """
    df = df.copy().sort_values('date')

    # Rolling features (trailing windows that include the current row)
    df['rainfall_7d'] = df['precipitation'].rolling(7, min_periods=1).sum()
    df['rainfall_30d'] = df['precipitation'].rolling(30, min_periods=1).sum()
    df['temp_max_7d'] = df['temp_max'].rolling(7, min_periods=1).mean()
    df['temp_min_7d'] = df['temp_min'].rolling(7, min_periods=1).mean()
    df['temp_mean_7d'] = df['temp_mean'].rolling(7, min_periods=1).mean()
    df['temp_mean_30d'] = df['temp_mean'].rolling(30, min_periods=1).mean()
    df['temp_range_7d'] = (df['temp_max'] - df['temp_min']).rolling(7, min_periods=1).mean()
    df['et0_7d'] = df['et0'].rolling(7, min_periods=1).sum()
    df['et0_30d'] = df['et0'].rolling(30, min_periods=1).sum()

    # Water balance: rainfall minus evapotranspiration over the same window
    df['water_balance_7d'] = df['rainfall_7d'] - df['et0_7d']
    df['water_balance_30d'] = df['rainfall_30d'] - df['et0_30d']

    # Days since rain, computed positionally on the date-sorted rows:
    # 0 on a rain row, then 1, 2, ... until the next one. Rows before the
    # first rain row count up from 1 (the "last rain" is treated as sitting
    # just before the first row). NaN precipitation compares False, i.e. it
    # is treated as a dry row.
    is_rain_row = (df['precipitation'] > rain_threshold).to_numpy()
    row_positions = np.arange(len(df))
    last_rain_position = np.maximum.accumulate(np.where(is_rain_row, row_positions, -1))
    df['days_since_rain'] = row_positions - last_rain_position

    # Daylength and season. Iterating the two needed columns avoids building
    # a full row object per record and also works on a zero-row frame.
    df['daylength'] = [
        calculate_daylength(lat, day) for lat, day in zip(df['lat'], df['date'])
    ]
    df['season'] = df['date'].apply(get_southern_hemisphere_season)

    return df

# Enrich each state's weather frame and store the result back under the same key
print("Calculating features...\n")
for state, state_weather in list(weather_data.items()):
    weather_data[state] = enrich_weather_data(state_weather)
    print(f"✓ {state}")

print("\n✓ All features calculated")

## 4. Merge with Training Data

In [None]:
# Stack the per-state weather frames into one long table
all_weather = pd.concat(weather_data.values(), ignore_index=True)

# Engineered columns to carry over onto the training samples
weather_features = [
    'rainfall_7d', 'rainfall_30d',
    'temp_max_7d', 'temp_min_7d', 'temp_mean_7d', 'temp_mean_30d', 'temp_range_7d',
    'et0_7d', 'et0_30d',
    'water_balance_7d', 'water_balance_30d',
    'days_since_rain',
    'daylength', 'season'
]

# Rename the join keys by label (not by position) to match the training frame
weather_for_merge = all_weather[['date', 'state'] + weather_features].rename(
    columns={'date': 'Sampling_Date', 'state': 'State'}
)

# Left join keeps every training sample; samples without a weather match get NaN.
# validate='many_to_one' makes pandas raise MergeError if the weather table holds
# more than one row per (Sampling_Date, State), which would otherwise silently
# duplicate training samples.
train_enriched = train_wide.merge(
    weather_for_merge,
    on=['Sampling_Date', 'State'],
    how='left',
    validate='many_to_one',
)

print(f"Original columns: {train_wide.shape[1]}")
print(f"Enriched columns: {train_enriched.shape[1]}")
print(f"New features: {len(weather_features)}")
print(f"\nFeatures added: {weather_features}")

## 5. Add NDVI Anomaly

In [None]:
# Per-state mean and (sample) standard deviation of NDVI
ndvi_stats = train_enriched.groupby('State')['Pre_GSHH_NDVI'].agg(['mean', 'std']).reset_index()
ndvi_stats.columns = ['State', 'ndvi_mean', 'ndvi_std']

# Look the statistics up by state and overwrite the columns in place.
# Unlike merging `ndvi_stats` back in, this can be re-run safely: a second
# merge would produce ndvi_mean_x / ndvi_mean_y and break the lines below.
# It also never drops rows.
ndvi_by_state = ndvi_stats.set_index('State')
train_enriched['ndvi_mean'] = train_enriched['State'].map(ndvi_by_state['ndvi_mean'])
train_enriched['ndvi_std'] = train_enriched['State'].map(ndvi_by_state['ndvi_std'])

# Z-score of each sample's NDVI within its own state. A state with a single
# sample has NaN std, and a state with constant NDVI has zero std; both
# yield a NaN anomaly.
train_enriched['ndvi_anomaly'] = (
    (train_enriched['Pre_GSHH_NDVI'] - train_enriched['ndvi_mean']) / train_enriched['ndvi_std']
)

all_features = weather_features + ['ndvi_anomaly']
print(f"Total new features: {len(all_features)}")
print(f"\nAll features: {all_features}")

## 6. Summary Statistics

In [None]:
print("Weather feature summary:\n")
# One row per feature, one column per statistic
feature_summary = train_enriched[all_features].describe().transpose()
print(feature_summary)

## 7. Correlation with Targets

In [None]:
target_cols = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']

# Subset of the engineered features shown in the heatmap
key_features = [
    'rainfall_7d', 'rainfall_30d', 'temp_mean_7d', 'et0_7d',
    'water_balance_7d', 'days_since_rain', 'daylength', 'ndvi_anomaly'
]

# Full correlation matrix, then keep only the feature (rows) x target (columns) part
corr = train_enriched[key_features + target_cols].corr()
target_corr = corr.loc[key_features, target_cols]

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(target_corr, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Weather Feature Correlations with Biomass Targets')
fig.tight_layout()
plt.show()

# Rank features by correlation strength with total dry biomass (sign dropped)
print("\nTop correlations with Dry_Total_g:")
total_corr_strength = target_corr['Dry_Total_g'].abs().sort_values(ascending=False)
print(total_corr_strength)

## 8. Save Enriched Dataset

In [None]:
# Write the enriched training table to disk (row index omitted)
output_file = 'competition/train_enriched.csv'
train_enriched.to_csv(output_file, index=False)

# Total count of missing cells across all engineered feature columns
missing_total = train_enriched[all_features].isnull().sum().sum()

print(f"✓ Saved to {output_file}")
print(f"Shape: {train_enriched.shape}")
print(f"New features: {len(all_features)}")
print(f"\nMissing values: {missing_total}")

# Preview: identifying columns plus the first five engineered features
print("\nSample data:")
preview_columns = ['image_path', 'State', 'Sampling_Date'] + all_features[:5]
train_enriched[preview_columns].head()

## Summary

**15 Features Added (14 weather + NDVI anomaly):**
- Rainfall: 7d, 30d totals
- Temperature: 7d max/min/mean, 30d mean, 7d range
- Evapotranspiration: 7d, 30d totals
- Water balance: 7d, 30d (rain - ET)
- Days since last significant rain (daily precipitation > 5)
- Daylength, Season
- NDVI anomaly

**Ready for modeling!**