In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

def determine_season(date):
    """
    Determine crop season based on month
    Kharif: April (4) to October (10)
    Rabi: November (11) to May (5)
    """
    month = date.month
    if 4 <= month <= 10:
        return 'Kharif'
    else:
        return 'Rabi'

def remove_rupee_symbol(value):
    """Remove rupee symbol and commas, convert to float"""
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float)):
        return float(value)
    # Remove ₹ symbol, commas, and whitespace
    cleaned = re.sub(r'[₹,\s]', '', str(value))
    try:
        return float(cleaned)
    except:
        return np.nan

def parse_max_min(max_min_str):
    """Parse 'Max - Min' column into separate max and min values"""
    if pd.isna(max_min_str):
        return np.nan, np.nan

    try:
        # Split by '-' and clean each part
        parts = str(max_min_str).split('-')
        if len(parts) == 2:
            max_val = remove_rupee_symbol(parts[0].strip())
            min_val = remove_rupee_symbol(parts[1].strip())
            return max_val, min_val
    except:
        pass

    return np.nan, np.nan

def standardize_commodity(commodity):
    """Standardize commodity name to 'Rice'"""
    return 'Rice'

def preprocess_rice_data(df):
    """
    Complete preprocessing pipeline for rice price data
    """
    # Create a copy to avoid modifying original
    df_processed = df.copy()

    # 1. Standardize commodity name to 'Rice'
    df_processed['Commodity'] = df_processed['Commodity'].apply(standardize_commodity)

    # 2. Drop Market column
    df_processed = df_processed.drop('Market', axis=1)

    # 3. Parse and separate Max-Min column
    df_processed[['Price_Max', 'Price_Min']] = df_processed['1Q Max - Min'].apply(
        lambda x: pd.Series(parse_max_min(x))
    )
    df_processed = df_processed.drop('1Q Max - Min', axis=1)

    # 4. Remove rupee symbols from price columns
    df_processed['Price_1KG'] = df_processed['1KG Price'].apply(remove_rupee_symbol)
    df_processed['Price_1Q'] = df_processed['1Q Price'].apply(remove_rupee_symbol)
    df_processed = df_processed.drop(['1KG Price', '1Q Price'], axis=1)

    # 5. Convert Date column to datetime
    df_processed['Date'] = pd.to_datetime(df_processed['Date'], format='%d/%m/%Y')

    # 6. Add Season column
    df_processed['Season'] = df_processed['Date'].apply(determine_season)

    # Sort by date for time series features
    df_processed = df_processed.sort_values('Date').reset_index(drop=True)

    # 7. Feature Engineering - Lag features
    # Create lag features for Price_1KG (1, 7, 14, 30 days)
    for lag in [1, 7, 14, 30]:
        df_processed[f'Price_1KG_Lag_{lag}'] = df_processed['Price_1KG'].shift(lag)

    # 8. Rolling averages (7, 14, 30 days)
    for window in [7, 14, 30]:
        df_processed[f'Price_1KG_RollingAvg_{window}'] = (
            df_processed['Price_1KG'].rolling(window=window, min_periods=1).mean()
        )

    # 9. Rolling standard deviation (volatility measure)
    for window in [7, 14, 30]:
        df_processed[f'Price_1KG_RollingStd_{window}'] = (
            df_processed['Price_1KG'].rolling(window=window, min_periods=1).std()
        )

    # 10. Price change features
    df_processed['Price_1KG_Change'] = df_processed['Price_1KG'].diff()
    df_processed['Price_1KG_PctChange'] = df_processed['Price_1KG'].pct_change()

    # 11. Price range (Max - Min)
    df_processed['Price_Range'] = df_processed['Price_Max'] - df_processed['Price_Min']

    # 12. Time-based features
    df_processed['Day'] = df_processed['Date'].dt.day
    df_processed['Month'] = df_processed['Date'].dt.month
    df_processed['Year'] = df_processed['Date'].dt.year
    df_processed['DayOfWeek'] = df_processed['Date'].dt.dayofweek
    df_processed['Quarter'] = df_processed['Date'].dt.quarter
    df_processed['WeekOfYear'] = df_processed['Date'].dt.isocalendar().week

    # 13. Moving average of price range
    df_processed['Price_Range_RollingAvg_7'] = (
        df_processed['Price_Range'].rolling(window=7, min_periods=1).mean()
    )

    # 14. Exponential moving average
    df_processed['Price_1KG_EMA_7'] = (
        df_processed['Price_1KG'].ewm(span=7, adjust=False).mean()
    )
    df_processed['Price_1KG_EMA_30'] = (
        df_processed['Price_1KG'].ewm(span=30, adjust=False).mean()
    )

    # Reorder columns for better readability
    cols = ['Date', 'Commodity', 'Season', 'Price_1KG', 'Price_1Q', 'Price_Max', 'Price_Min', 'Price_Range']
    other_cols = [col for col in df_processed.columns if col not in cols]
    df_processed = df_processed[cols + other_cols]

    return df_processed

# Main execution
if __name__ == "__main__":
    # Read from CSV file
    input_file = 'rice.csv'
    output_file = 'rice_prices_processed.csv'

    try:
        print(f"Reading data from {input_file}...")

        # Read CSV, skipping the first row (header text) and last row (footer)
        df = pd.read_csv(input_file, skiprows=1)

        # Remove any rows that contain 'Downloaded from' or empty rows
        df = df[~df['Commodity'].astype(str).str.contains('Downloaded|^$', na=False)]

        # Remove any completely empty rows
        df = df.dropna(how='all')

        print(f"Original data shape: {df.shape}")
        print(f"Original columns: {df.columns.tolist()}")
        print("\nFirst few rows of original data:")
        print(df.head())

        # Process the data
        print("\nProcessing data...")
        df_processed = preprocess_rice_data(df)

        # Display results
        print(f"\n✓ Processing completed!")
        print(f"Processed data shape: {df_processed.shape}")
        print(f"\nAll columns ({len(df_processed.columns)}):")
        for i, col in enumerate(df_processed.columns, 1):
            print(f"  {i}. {col}")

        print("\n" + "="*80)
        print("SAMPLE OF PROCESSED DATA (First 5 rows)")
        print("="*80)
        print(df_processed.head().to_string())

        print("\n" + "="*80)
        print("DATA STATISTICS")
        print("="*80)
        print(df_processed[['Price_1KG', 'Price_1Q', 'Price_Max', 'Price_Min', 'Price_Range']].describe())

        print("\n" + "="*80)
        print("SEASON DISTRIBUTION")
        print("="*80)
        season_counts = df_processed['Season'].value_counts()
        print(season_counts)
        print(f"\nKharif: {season_counts.get('Kharif', 0)} records ({season_counts.get('Kharif', 0)/len(df_processed)*100:.1f}%)")
        print(f"Rabi: {season_counts.get('Rabi', 0)} records ({season_counts.get('Rabi', 0)/len(df_processed)*100:.1f}%)")

        print("\n" + "="*80)
        print("DATE RANGE")
        print("="*80)
        print(f"Earliest date: {df_processed['Date'].min().strftime('%d/%m/%Y')}")
        print(f"Latest date: {df_processed['Date'].max().strftime('%d/%m/%Y')}")
        print(f"Total days covered: {(df_processed['Date'].max() - df_processed['Date'].min()).days} days")

        # Save to CSV
        df_processed.to_csv(output_file, index=False)
        print("\n" + "="*80)
        print(f"✓ SUCCESS! Processed data saved to '{output_file}'")
        print(f"Total records processed: {len(df_processed)}")
        print("="*80)

    except FileNotFoundError:
        print(f"\n✗ Error: File '{input_file}' not found.")
        print("Please ensure the CSV file is in the same directory as this script.")

    except Exception as e:
        print(f"\n✗ Error processing data: {str(e)}")
        import traceback
        traceback.print_exc()

Reading data from rice.csv...
Original data shape: (106, 6)
Original columns: ['Commodity', 'Market', '1KG Price', '1Q Price', '1Q Max - Min', 'Date']

First few rows of original data:
                        Commodity      Market 1KG Price    1Q Price  \
0                   Rice - Coarse    Kalburgi   ₹ 43.00  ₹ 4,300.00   
1                     Rice - Fine    Kalburgi   ₹ 56.00  ₹ 5,600.00   
2  Rice - CR 1009 (Coarse) Boiled  Kalagategi   ₹ 29.00  ₹ 2,900.00   
3              Rice - Broken Rice    Kalburgi   ₹ 35.00  ₹ 3,500.00   
4              Rice - Sarbati Raw   Somvarpet   ₹ 27.00  ₹ 2,700.00   

          1Q Max - Min        Date  
0  ₹ 4700 - ₹ 3,700.00  23/10/2025  
1  ₹ 6300 - ₹ 4,700.00  23/10/2025  
2  ₹ 2900 - ₹ 2,900.00  23/10/2025  
3  ₹ 4400 - ₹ 2,900.00  23/10/2025  
4  ₹ 2700 - ₹ 2,700.00  23/10/2025  

Processing data...

✓ Processing completed!
Processed data shape: (106, 29)

All columns (29):
  1. Date
  2. Commodity
  3. Season
  4. Price_1KG
  5. Price_1Q
  6.

In [14]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

def determine_season(date):
    """
    Determine crop season based on month
    Kharif: April (4) to October (10)
    Rabi: November (11) to May (5)
    """
    month = date.month
    if 4 <= month <= 10:
        return 'Kharif'
    else:
        return 'Rabi'

def remove_rupee_symbol(value):
    """Remove rupee symbol and commas, convert to float"""
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float)):
        return float(value)
    # Remove ₹ symbol, commas, and whitespace
    cleaned = re.sub(r'[₹,\s]', '', str(value))
    try:
        return float(cleaned)
    except:
        return np.nan

def parse_max_min(max_min_str):
    """Parse 'Max - Min' column into separate max and min values"""
    if pd.isna(max_min_str):
        return np.nan, np.nan

    try:
        # Split by '-' and clean each part
        parts = str(max_min_str).split('-')
        if len(parts) == 2:
            max_val = remove_rupee_symbol(parts[0].strip())
            min_val = remove_rupee_symbol(parts[1].strip())
            return max_val, min_val
    except:
        pass

    return np.nan, np.nan

def standardize_commodity(commodity):
    """Standardize commodity name to 'Rice'"""
    return 'Tur'

def preprocess_rice_data(df):
    """
    Complete preprocessing pipeline for rice price data
    """
    # Create a copy to avoid modifying original
    df_processed = df.copy()

    # 1. Standardize commodity name to 'Rice'
    df_processed['Commodity'] = df_processed['Commodity'].apply(standardize_commodity)

    # 2. Drop Market column
    df_processed = df_processed.drop('Market', axis=1)

    # 3. Parse and separate Max-Min column
    df_processed[['Price_Max', 'Price_Min']] = df_processed['1Q Max - Min'].apply(
        lambda x: pd.Series(parse_max_min(x))
    )
    df_processed = df_processed.drop('1Q Max - Min', axis=1)

    # 4. Remove rupee symbols from price columns
    df_processed['Price_1KG'] = df_processed['1KG Price'].apply(remove_rupee_symbol)
    df_processed['Price_1Q'] = df_processed['1Q Price'].apply(remove_rupee_symbol)
    df_processed = df_processed.drop(['1KG Price', '1Q Price'], axis=1)

    # 5. Convert Date column to datetime
    df_processed['Date'] = pd.to_datetime(df_processed['Date'], format='%d/%m/%Y')

    # 6. Add Season column
    df_processed['Season'] = df_processed['Date'].apply(determine_season)

    # Sort by date for time series features
    df_processed = df_processed.sort_values('Date').reset_index(drop=True)

    # 7. Feature Engineering - Lag features
    # Create lag features for Price_1KG (1, 7, 14, 30 days)
    for lag in [1, 7, 14, 30]:
        df_processed[f'Price_1KG_Lag_{lag}'] = df_processed['Price_1KG'].shift(lag)

    # 8. Rolling averages (7, 14, 30 days)
    for window in [7, 14, 30]:
        df_processed[f'Price_1KG_RollingAvg_{window}'] = (
            df_processed['Price_1KG'].rolling(window=window, min_periods=1).mean()
        )

    # 9. Rolling standard deviation (volatility measure)
    for window in [7, 14, 30]:
        df_processed[f'Price_1KG_RollingStd_{window}'] = (
            df_processed['Price_1KG'].rolling(window=window, min_periods=1).std()
        )

    # 10. Price change features
    df_processed['Price_1KG_Change'] = df_processed['Price_1KG'].diff()
    df_processed['Price_1KG_PctChange'] = df_processed['Price_1KG'].pct_change()

    # 11. Price range (Max - Min)
    df_processed['Price_Range'] = df_processed['Price_Max'] - df_processed['Price_Min']

    # 12. Time-based features
    df_processed['Day'] = df_processed['Date'].dt.day
    df_processed['Month'] = df_processed['Date'].dt.month
    df_processed['Year'] = df_processed['Date'].dt.year
    df_processed['DayOfWeek'] = df_processed['Date'].dt.dayofweek
    df_processed['Quarter'] = df_processed['Date'].dt.quarter
    df_processed['WeekOfYear'] = df_processed['Date'].dt.isocalendar().week

    # 13. Moving average of price range
    df_processed['Price_Range_RollingAvg_7'] = (
        df_processed['Price_Range'].rolling(window=7, min_periods=1).mean()
    )

    # 14. Exponential moving average
    df_processed['Price_1KG_EMA_7'] = (
        df_processed['Price_1KG'].ewm(span=7, adjust=False).mean()
    )
    df_processed['Price_1KG_EMA_30'] = (
        df_processed['Price_1KG'].ewm(span=30, adjust=False).mean()
    )

    # Reorder columns for better readability
    cols = ['Date', 'Commodity', 'Season', 'Price_1KG', 'Price_1Q', 'Price_Max', 'Price_Min', 'Price_Range']
    other_cols = [col for col in df_processed.columns if col not in cols]
    df_processed = df_processed[cols + other_cols]

    return df_processed

# Main execution
if __name__ == "__main__":
    # Read from CSV file
    input_file = 'arhar_tur_whole.csv'
    output_file = 'tur_prices_processed.csv'

    try:
        print(f"Reading data from {input_file}...")

        # Read CSV, skipping the first row (header text) and last row (footer)
        df = pd.read_csv(input_file, skiprows=1)

        # Remove any rows that contain 'Downloaded from' or empty rows
        df = df[~df['Commodity'].astype(str).str.contains('Downloaded|^$', na=False)]

        # Remove any completely empty rows
        df = df.dropna(how='all')

        print(f"Original data shape: {df.shape}")
        print(f"Original columns: {df.columns.tolist()}")
        print("\nFirst few rows of original data:")
        print(df.head())

        # Process the data
        print("\nProcessing data...")
        df_processed = preprocess_rice_data(df)

        # Display results
        print(f"\n✓ Processing completed!")
        print(f"Processed data shape: {df_processed.shape}")
        print(f"\nAll columns ({len(df_processed.columns)}):")
        for i, col in enumerate(df_processed.columns, 1):
            print(f"  {i}. {col}")

        print("\n" + "="*80)
        print("SAMPLE OF PROCESSED DATA (First 5 rows)")
        print("="*80)
        print(df_processed.head().to_string())

        print("\n" + "="*80)
        print("DATA STATISTICS")
        print("="*80)
        print(df_processed[['Price_1KG', 'Price_1Q', 'Price_Max', 'Price_Min', 'Price_Range']].describe())

        print("\n" + "="*80)
        print("SEASON DISTRIBUTION")
        print("="*80)
        season_counts = df_processed['Season'].value_counts()
        print(season_counts)
        print(f"\nKharif: {season_counts.get('Kharif', 0)} records ({season_counts.get('Kharif', 0)/len(df_processed)*100:.1f}%)")
        print(f"Rabi: {season_counts.get('Rabi', 0)} records ({season_counts.get('Rabi', 0)/len(df_processed)*100:.1f}%)")

        print("\n" + "="*80)
        print("DATE RANGE")
        print("="*80)
        print(f"Earliest date: {df_processed['Date'].min().strftime('%d/%m/%Y')}")
        print(f"Latest date: {df_processed['Date'].max().strftime('%d/%m/%Y')}")
        print(f"Total days covered: {(df_processed['Date'].max() - df_processed['Date'].min()).days} days")

        # Save to CSV
        df_processed.to_csv(output_file, index=False)
        print("\n" + "="*80)
        print(f"✓ SUCCESS! Processed data saved to '{output_file}'")
        print(f"Total records processed: {len(df_processed)}")
        print("="*80)

    except FileNotFoundError:
        print(f"\n✗ Error: File '{input_file}' not found.")
        print("Please ensure the CSV file is in the same directory as this script.")

    except Exception as e:
        print(f"\n✗ Error processing data: {str(e)}")
        import traceback
        traceback.print_exc()

Reading data from arhar_tur_whole.csv...
Original data shape: (53, 6)
Original columns: ['Commodity', 'Market', '1KG Price', '1Q Price', '1Q Max - Min', 'Date']

First few rows of original data:
                                      Commodity       Market 1KG Price  \
0       Arhar (Tur/Red Gram)(Whole) - Angur Imp  Bhadravathi   ₹ 73.50   
1   Arhar (Tur/Red Gram)(Whole) - Arhar (Whole)  Chitradurga   ₹ 51.69   
2   Arhar (Tur/Red Gram)(Whole) - Arhar (Whole)       Haveri   ₹ 55.00   
3  Arhar (Tur/Red Gram)(Whole) - Arhar Dal(Tur)      Raichur   ₹ 60.89   
4             Arhar (Tur/Red Gram)(Whole) - Red      Raichur   ₹ 69.98   

     1Q Price         1Q Max - Min        Date  
0  ₹ 7,350.00  ₹ 7350 - ₹ 7,350.00  08/10/2025  
1  ₹ 5,169.00  ₹ 5169 - ₹ 5,169.00  06/10/2025  
2  ₹ 5,500.00         ₹ 0 - ₹ 0.00  19/05/2025  
3  ₹ 6,089.00         ₹ 0 - ₹ 0.00  08/05/2025  
4  ₹ 6,998.00         ₹ 0 - ₹ 0.00  08/05/2025  

Processing data...

✓ Processing completed!
Processed data shape:

In [15]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

In [16]:
class CropPriceForecaster:
    """
    Multi-crop price forecasting system for seasonal predictions
    """

    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.crop_encoders = {}
        self.feature_columns = []

    def prepare_data(self, df):
        """
        Prepare data for modeling
        """
        print("Preparing data for modeling...")

        # Create a copy
        data = df.copy()

        # Encode categorical variables
        le_commodity = LabelEncoder()
        data['Commodity_Encoded'] = le_commodity.fit_transform(data['Commodity'])

        le_season = LabelEncoder()
        data['Season_Encoded'] = le_season.fit_transform(data['Season'])

        # Store encoders
        self.crop_encoders['commodity'] = le_commodity
        self.crop_encoders['season'] = le_season

        # Create additional features
        data['Days_Since_Start'] = (data['Date'] - data['Date'].min()).dt.days

        # Seasonal indicators
        data['Is_Kharif'] = (data['Season'] == 'Kharif').astype(int)
        data['Is_Rabi'] = (data['Season'] == 'Rabi').astype(int)

        # Cyclical time features (sine/cosine for month)
        data['Month_Sin'] = np.sin(2 * np.pi * data['Month'] / 12)
        data['Month_Cos'] = np.cos(2 * np.pi * data['Month'] / 12)

        return data

    def select_features(self, data):
        """
        Select relevant features for modeling
        """
        # Define feature columns (excluding target and identifiers)
        exclude_cols = ['Date', 'Commodity', 'Season', 'Price_1KG', 'Price_1Q']

        feature_cols = [col for col in data.columns if col not in exclude_cols]

        # Remove columns with too many NaN values
        feature_cols = [col for col in feature_cols if data[col].isna().sum() / len(data) < 0.5]

        self.feature_columns = feature_cols
        print(f"Selected {len(feature_cols)} features for modeling")

        return feature_cols

    def train_models(self, df, target_col='Price_1KG'):
        """
        Train separate models for each crop
        """
        print("\n" + "="*80)
        print("TRAINING MODELS FOR EACH CROP")
        print("="*80)

        data = self.prepare_data(df)
        feature_cols = self.select_features(data)

        # Get unique crops
        crops = data['Commodity'].unique()

        results = []

        for crop in crops:
            print(f"\nTraining model for: {crop}")
            print("-" * 50)

            # Filter data for this crop
            crop_data = data[data['Commodity'] == crop].copy()

            # Remove rows with NaN in target
            crop_data = crop_data.dropna(subset=[target_col])

            if len(crop_data) < 10:
                print(f"  ⚠ Insufficient data for {crop} (only {len(crop_data)} records). Skipping...")
                continue

            # Prepare features and target
            X = crop_data[feature_cols].fillna(crop_data[feature_cols].median())
            y = crop_data[target_col]

            # Split data (time-series aware)
            split_idx = int(len(X) * 0.8)
            X_train, X_test = X[:split_idx], X[split_idx:]
            y_train, y_test = y[:split_idx], y[split_idx:]

            # Scale features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train model - using Gradient Boosting for better performance
            model = GradientBoostingRegressor(
                n_estimators=200,
                learning_rate=0.05,
                max_depth=5,
                min_samples_split=5,
                random_state=42
            )

            model.fit(X_train_scaled, y_train)

            # Evaluate
            train_pred = model.predict(X_train_scaled)
            test_pred = model.predict(X_test_scaled)

            train_mae = mean_absolute_error(y_train, train_pred)
            test_mae = mean_absolute_error(y_test, test_pred)
            train_r2 = r2_score(y_train, train_pred)
            test_r2 = r2_score(y_test, test_pred)

            print(f"  Train MAE: ₹{train_mae:.2f} | R²: {train_r2:.3f}")
            print(f"  Test MAE:  ₹{test_mae:.2f} | R²: {test_r2:.3f}")

            # Store model and scaler
            self.models[crop] = model
            self.scalers[crop] = scaler

            results.append({
                'Crop': crop,
                'Train_MAE': train_mae,
                'Test_MAE': test_mae,
                'Train_R2': train_r2,
                'Test_R2': test_r2,
                'Training_Samples': len(X_train),
                'Test_Samples': len(X_test)
            })

        results_df = pd.DataFrame(results)
        print("\n" + "="*80)
        print("MODEL TRAINING SUMMARY")
        print("="*80)
        print(results_df.to_string(index=False))

        return results_df

    def predict_seasonal_prices(self, df, forecast_year=2026):
        """
        Predict prices for next Kharif and Rabi seasons
        """
        print("\n" + "="*80)
        print(f"PREDICTING PRICES FOR YEAR {forecast_year}")
        print("="*80)

        data = self.prepare_data(df)
        crops = list(self.models.keys())

        predictions = []

        for crop in crops:
            print(f"\nGenerating predictions for: {crop}")

            # Get latest data for this crop
            crop_data = data[data['Commodity'] == crop].copy()
            latest_record = crop_data.iloc[-1]

            # Get historical averages for this crop
            kharif_avg = crop_data[crop_data['Season'] == 'Kharif']['Price_1KG'].mean()
            rabi_avg = crop_data[crop_data['Season'] == 'Rabi']['Price_1KG'].mean()

            # Define prediction dates for Kharif and Rabi
            seasons = [
                {
                    'season': 'Kharif',
                    'date': pd.Timestamp(f'{forecast_year}-07-15'),  # Mid-Kharif
                    'month': 7,
                    'is_kharif': 1,
                    'is_rabi': 0
                },
                {
                    'season': 'Rabi',
                    'date': pd.Timestamp(f'{forecast_year}-01-15'),  # Mid-Rabi
                    'month': 1,
                    'is_kharif': 0,
                    'is_rabi': 1
                }
            ]

            for season_info in seasons:
                # Create feature vector for prediction
                pred_features = latest_record[self.feature_columns].copy()

                # Update time-based features
                pred_features['Month'] = season_info['month']
                pred_features['Year'] = forecast_year
                pred_features['Is_Kharif'] = season_info['is_kharif']
                pred_features['Is_Rabi'] = season_info['is_rabi']
                pred_features['Season_Encoded'] = 0 if season_info['season'] == 'Kharif' else 1
                pred_features['Month_Sin'] = np.sin(2 * np.pi * season_info['month'] / 12)
                pred_features['Month_Cos'] = np.cos(2 * np.pi * season_info['month'] / 12)
                pred_features['Quarter'] = (season_info['month'] - 1) // 3 + 1

                # Calculate days since start
                days_since_start = (season_info['date'] - data['Date'].min()).days
                pred_features['Days_Since_Start'] = days_since_start

                # Fill any remaining NaN values
                pred_features = pred_features.fillna(crop_data[self.feature_columns].median())

                # Prepare for prediction
                X_pred = pred_features.values.reshape(1, -1)
                X_pred_scaled = self.scalers[crop].transform(X_pred)

                # Make prediction
                predicted_price = self.models[crop].predict(X_pred_scaled)[0]

                # Calculate confidence based on historical volatility
                historical_std = crop_data['Price_1KG'].std()
                confidence_interval = 1.96 * historical_std  # 95% CI

                predictions.append({
                    'Crop': crop,
                    'Season': season_info['season'],
                    'Year': forecast_year,
                    'Predicted_Price_1KG': round(predicted_price, 2),
                    'Lower_Bound': round(predicted_price - confidence_interval, 2),
                    'Upper_Bound': round(predicted_price + confidence_interval, 2),
                    'Historical_Avg': round(kharif_avg if season_info['season'] == 'Kharif' else rabi_avg, 2),
                    'Change_from_Avg': round(predicted_price - (kharif_avg if season_info['season'] == 'Kharif' else rabi_avg), 2)
                })

        predictions_df = pd.DataFrame(predictions)

        print("\n" + "="*80)
        print("SEASONAL PRICE PREDICTIONS")
        print("="*80)
        print(predictions_df.to_string(index=False))

        return predictions_df

    def save_models(self, filepath='crop_price_models.pkl'):
        """
        Save trained models and scalers
        """
        model_data = {
            'models': self.models,
            'scalers': self.scalers,
            'crop_encoders': self.crop_encoders,
            'feature_columns': self.feature_columns
        }
        joblib.dump(model_data, filepath)
        print(f"\n✓ Models saved to {filepath}")

    def load_models(self, filepath='crop_price_models.pkl'):
        """
        Load trained models and scalers
        """
        model_data = joblib.load(filepath)
        self.models = model_data['models']
        self.scalers = model_data['scalers']
        self.crop_encoders = model_data['crop_encoders']
        self.feature_columns = model_data['feature_columns']
        print(f"✓ Models loaded from {filepath}")



In [17]:
def main():
    """
    Main execution function
    """
    print("="*80)
    print("MULTI-CROP SEASONAL PRICE FORECASTING SYSTEM")
    print("="*80)

    # Load processed data
    input_files = [
        'rice_prices_processed.csv',
        'wheat_prices_processed.csv',
        'groundnut_prices_processed.csv','jowar_prices_processed.csv','onion_prices_processed.csv','potato_prices_processed.csv',
        'ragi_prices_processed.csv','soyabean_prices_processed.csv','tur_prices_processed.csv'
        # Add more crop files here
    ]

    # For demonstration, we'll use rice data
    # Replace this with actual loading of all 8 crops
    print("\nLoading data...")

    try:
        # Load all crop data and combine
        all_data = []
        for file in input_files:
            try:
                df = pd.read_csv(file)
                all_data.append(df)
                print(f"  ✓ Loaded {file}: {len(df)} records")
            except FileNotFoundError:
                print(f"  ⚠ File not found: {file}")
                continue

        if not all_data:
            print("\n⚠ No data files found. Using sample data...")
            # Load single file for demonstration
            df = pd.read_csv('rice_prices_processed.csv')
            all_data = [df]

        # Combine all crop data
        combined_data = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal combined data: {len(combined_data)} records")
        print(f"Crops in dataset: {combined_data['Commodity'].unique()}")

        # Initialize forecaster
        forecaster = CropPriceForecaster()

        # Train models
        training_results = forecaster.train_models(combined_data)

        # Make predictions for next seasons
        predictions = forecaster.predict_seasonal_prices(combined_data, forecast_year=2026)

        # Save predictions to CSV
        predictions.to_csv('seasonal_price_predictions_2026.csv', index=False)
        print("\n✓ Predictions saved to 'seasonal_price_predictions_2026.csv'")

        # Save models for future use
        forecaster.save_models('crop_price_models.pkl')

        # Generate summary report
        print("\n" + "="*80)
        print("PREDICTION SUMMARY BY SEASON")
        print("="*80)

        for season in ['Kharif', 'Rabi']:
            season_data = predictions[predictions['Season'] == season]
            print(f"\n{season} Season 2026:")
            print(f"  Average predicted price: ₹{season_data['Predicted_Price_1KG'].mean():.2f}")
            print(f"  Highest: {season_data.loc[season_data['Predicted_Price_1KG'].idxmax(), 'Crop']} - ₹{season_data['Predicted_Price_1KG'].max():.2f}")
            print(f"  Lowest: {season_data.loc[season_data['Predicted_Price_1KG'].idxmin(), 'Crop']} - ₹{season_data['Predicted_Price_1KG'].min():.2f}")

        print("\n" + "="*80)
        print("✓ FORECASTING COMPLETE!")
        print("="*80)

    except Exception as e:
        print(f"\n✗ Error: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

MULTI-CROP SEASONAL PRICE FORECASTING SYSTEM

Loading data...
  ✓ Loaded rice_prices_processed.csv: 106 records
  ✓ Loaded wheat_prices_processed.csv: 52 records
  ✓ Loaded groundnut_prices_processed.csv: 51 records
  ✓ Loaded jowar_prices_processed.csv: 67 records
  ✓ Loaded onion_prices_processed.csv: 58 records
  ✓ Loaded potato_prices_processed.csv: 42 records
  ✓ Loaded ragi_prices_processed.csv: 55 records
  ✓ Loaded soyabean_prices_processed.csv: 25 records
  ✓ Loaded tur_prices_processed.csv: 53 records

Total combined data: 509 records
Crops in dataset: ['Rice' 'Wheat' 'Groundnut' 'Jowar' 'Onion' 'Potato' 'Ragi' 'Soyabean'
 'Tur']

TRAINING MODELS FOR EACH CROP
Preparing data for modeling...

✗ Error: unsupported operand type(s) for -: 'str' and 'str'


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pandas/core/ops/array_ops.py", line 218, in _na_arithmetic_op
    result = func(left, right)
             ^^^^^^^^^^^^^^^^^
TypeError: unsupported operand type(s) for -: 'str' and 'str'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipython-input-3078157669.py", line 49, in main
    training_results = forecaster.train_models(combined_data)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-3261995292.py", line 70, in train_models
    data = self.prepare_data(df)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-3261995292.py", line 33, in prepare_data
    data['Days_Since_Start'] = (data['Date'] - data['Date'].min()).dt.days
                                ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~
  File "/usr/local/lib/python3.12/dist-packages/pandas/core/ops/common.py", line 76, in new_method


In [18]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

class CropPriceForecaster:
    """
    Multi-crop price forecasting system for seasonal predictions
    """

    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.crop_encoders = {}
        self.feature_columns = []

    def prepare_data(self, df):
        """
        Prepare data for modeling
        """
        print("Preparing data for modeling...")

        # Create a copy
        data = df.copy()

        # Convert Date to datetime if it's not already
        if not pd.api.types.is_datetime64_any_dtype(data['Date']):
            print("  Converting Date column to datetime...")
            try:
                data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
            except:
                try:
                    data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
                except:
                    data['Date'] = pd.to_datetime(data['Date'], infer_datetime_format=True)

        # Encode categorical variables
        le_commodity = LabelEncoder()
        data['Commodity_Encoded'] = le_commodity.fit_transform(data['Commodity'])

        le_season = LabelEncoder()
        data['Season_Encoded'] = le_season.fit_transform(data['Season'])

        # Store encoders
        self.crop_encoders['commodity'] = le_commodity
        self.crop_encoders['season'] = le_season

        # Create additional features
        data['Days_Since_Start'] = (data['Date'] - data['Date'].min()).dt.days

        # Seasonal indicators
        data['Is_Kharif'] = (data['Season'] == 'Kharif').astype(int)
        data['Is_Rabi'] = (data['Season'] == 'Rabi').astype(int)

        # Cyclical time features (sine/cosine for month)
        data['Month_Sin'] = np.sin(2 * np.pi * data['Month'] / 12)
        data['Month_Cos'] = np.cos(2 * np.pi * data['Month'] / 12)

        return data

    def select_features(self, data):
        """
        Select relevant features for modeling
        """
        # Define feature columns (excluding target and identifiers)
        exclude_cols = ['Date', 'Commodity', 'Season', 'Price_1KG', 'Price_1Q']

        feature_cols = [col for col in data.columns if col not in exclude_cols]

        # Remove columns with too many NaN values
        feature_cols = [col for col in feature_cols if data[col].isna().sum() / len(data) < 0.5]

        self.feature_columns = feature_cols
        print(f"Selected {len(feature_cols)} features for modeling")

        return feature_cols

    def train_models(self, df, target_col='Price_1KG'):
        """
        Train separate models for each crop
        """
        print("\n" + "="*80)
        print("TRAINING MODELS FOR EACH CROP")
        print("="*80)

        data = self.prepare_data(df)
        feature_cols = self.select_features(data)

        # Get unique crops
        crops = data['Commodity'].unique()

        results = []

        for crop in crops:
            print(f"\nTraining model for: {crop}")
            print("-" * 50)

            # Filter data for this crop
            crop_data = data[data['Commodity'] == crop].copy()

            # Remove rows with NaN in target
            crop_data = crop_data.dropna(subset=[target_col])

            if len(crop_data) < 10:
                print(f"  ⚠ Insufficient data for {crop} (only {len(crop_data)} records). Skipping...")
                continue

            # Prepare features and target
            X = crop_data[feature_cols].fillna(crop_data[feature_cols].median())
            y = crop_data[target_col]

            # Split data (time-series aware)
            split_idx = int(len(X) * 0.8)
            X_train, X_test = X[:split_idx], X[split_idx:]
            y_train, y_test = y[:split_idx], y[split_idx:]

            # Scale features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train model - using Gradient Boosting for better performance
            model = GradientBoostingRegressor(
                n_estimators=200,
                learning_rate=0.05,
                max_depth=5,
                min_samples_split=5,
                random_state=42
            )

            model.fit(X_train_scaled, y_train)

            # Evaluate
            train_pred = model.predict(X_train_scaled)
            test_pred = model.predict(X_test_scaled)

            train_mae = mean_absolute_error(y_train, train_pred)
            test_mae = mean_absolute_error(y_test, test_pred)
            train_r2 = r2_score(y_train, train_pred)
            test_r2 = r2_score(y_test, test_pred)

            print(f"  Train MAE: ₹{train_mae:.2f} | R²: {train_r2:.3f}")
            print(f"  Test MAE:  ₹{test_mae:.2f} | R²: {test_r2:.3f}")

            # Store model and scaler
            self.models[crop] = model
            self.scalers[crop] = scaler

            results.append({
                'Crop': crop,
                'Train_MAE': train_mae,
                'Test_MAE': test_mae,
                'Train_R2': train_r2,
                'Test_R2': test_r2,
                'Training_Samples': len(X_train),
                'Test_Samples': len(X_test)
            })

        results_df = pd.DataFrame(results)
        print("\n" + "="*80)
        print("MODEL TRAINING SUMMARY")
        print("="*80)
        print(results_df.to_string(index=False))

        return results_df

    def predict_seasonal_prices(self, df, forecast_year=2026):
        """
        Predict prices for next Kharif and Rabi seasons
        """
        print("\n" + "="*80)
        print(f"PREDICTING PRICES FOR YEAR {forecast_year}")
        print("="*80)

        data = self.prepare_data(df)
        crops = list(self.models.keys())

        predictions = []

        for crop in crops:
            print(f"\nGenerating predictions for: {crop}")

            # Get latest data for this crop
            crop_data = data[data['Commodity'] == crop].copy()
            latest_record = crop_data.iloc[-1]

            # Get historical averages for this crop
            kharif_avg = crop_data[crop_data['Season'] == 'Kharif']['Price_1KG'].mean()
            rabi_avg = crop_data[crop_data['Season'] == 'Rabi']['Price_1KG'].mean()

            # Define prediction dates for Kharif and Rabi
            seasons = [
                {
                    'season': 'Kharif',
                    'date': pd.Timestamp(f'{forecast_year}-07-15'),  # Mid-Kharif
                    'month': 7,
                    'is_kharif': 1,
                    'is_rabi': 0
                },
                {
                    'season': 'Rabi',
                    'date': pd.Timestamp(f'{forecast_year}-01-15'),  # Mid-Rabi
                    'month': 1,
                    'is_kharif': 0,
                    'is_rabi': 1
                }
            ]

            for season_info in seasons:
                # Create feature vector for prediction
                pred_features = latest_record[self.feature_columns].copy()

                # Update time-based features
                pred_features['Month'] = season_info['month']
                pred_features['Year'] = forecast_year
                pred_features['Is_Kharif'] = season_info['is_kharif']
                pred_features['Is_Rabi'] = season_info['is_rabi']
                pred_features['Season_Encoded'] = 0 if season_info['season'] == 'Kharif' else 1
                pred_features['Month_Sin'] = np.sin(2 * np.pi * season_info['month'] / 12)
                pred_features['Month_Cos'] = np.cos(2 * np.pi * season_info['month'] / 12)
                pred_features['Quarter'] = (season_info['month'] - 1) // 3 + 1

                # Calculate days since start
                days_since_start = (season_info['date'] - data['Date'].min()).days
                pred_features['Days_Since_Start'] = days_since_start

                # Fill any remaining NaN values
                pred_features = pred_features.fillna(crop_data[self.feature_columns].median())

                # Prepare for prediction
                X_pred = pred_features.values.reshape(1, -1)
                X_pred_scaled = self.scalers[crop].transform(X_pred)

                # Make prediction
                predicted_price = self.models[crop].predict(X_pred_scaled)[0]

                # Calculate confidence based on historical volatility
                historical_std = crop_data['Price_1KG'].std()
                confidence_interval = 1.96 * historical_std  # 95% CI

                predictions.append({
                    'Crop': crop,
                    'Season': season_info['season'],
                    'Year': forecast_year,
                    'Predicted_Price_1KG': round(predicted_price, 2),
                    'Lower_Bound': round(predicted_price - confidence_interval, 2),
                    'Upper_Bound': round(predicted_price + confidence_interval, 2),
                    'Historical_Avg': round(kharif_avg if season_info['season'] == 'Kharif' else rabi_avg, 2),
                    'Change_from_Avg': round(predicted_price - (kharif_avg if season_info['season'] == 'Kharif' else rabi_avg), 2)
                })

        predictions_df = pd.DataFrame(predictions)

        print("\n" + "="*80)
        print("SEASONAL PRICE PREDICTIONS")
        print("="*80)
        print(predictions_df.to_string(index=False))

        return predictions_df

    def save_models(self, filepath='crop_price_models.pkl'):
        """
        Save trained models and scalers
        """
        model_data = {
            'models': self.models,
            'scalers': self.scalers,
            'crop_encoders': self.crop_encoders,
            'feature_columns': self.feature_columns
        }
        joblib.dump(model_data, filepath)
        print(f"\n✓ Models saved to {filepath}")

    def load_models(self, filepath='crop_price_models.pkl'):
        """
        Load trained models and scalers
        """
        model_data = joblib.load(filepath)
        self.models = model_data['models']
        self.scalers = model_data['scalers']
        self.crop_encoders = model_data['crop_encoders']
        self.feature_columns = model_data['feature_columns']
        print(f"✓ Models loaded from {filepath}")


def main():
    """
    Main execution function
    """
    print("="*80)
    print("MULTI-CROP SEASONAL PRICE FORECASTING SYSTEM")
    print("="*80)

    # Load processed data
    input_files = [
        'rice_prices_processed.csv',
        'wheat_prices_processed.csv',
        'groundnut_prices_processed.csv','jowar_prices_processed.csv','onion_prices_processed.csv','potato_prices_processed.csv',
        'ragi_prices_processed.csv','soyabean_prices_processed.csv','tur_prices_processed.csv'
        # Add more crop files here
    ]

    # For demonstration, we'll use rice data
    # Replace this with actual loading of all 8 crops
    print("\nLoading data...")

    try:
        # Load all crop data and combine
        all_data = []
        for file in input_files:
            try:
                df = pd.read_csv(file)
                all_data.append(df)
                print(f"  ✓ Loaded {file}: {len(df)} records")
            except FileNotFoundError:
                print(f"  ⚠ File not found: {file}")
                continue

        if not all_data:
            print("\n⚠ No data files found. Using sample data...")
            # Load single file for demonstration
            df = pd.read_csv('rice_prices_processed.csv')
            all_data = [df]

        # Combine all crop data
        combined_data = pd.concat(all_data, ignore_index=True)

        # Ensure Date column is datetime
        if not pd.api.types.is_datetime64_any_dtype(combined_data['Date']):
            print("\nConverting Date column to datetime format...")
            try:
                combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='%d/%m/%Y')
            except:
                try:
                    combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='%Y-%m-%d')
                except:
                    combined_data['Date'] = pd.to_datetime(combined_data['Date'], infer_datetime_format=True)

        print(f"\nTotal combined data: {len(combined_data)} records")
        print(f"Crops in dataset: {combined_data['Commodity'].unique()}")
        print(f"Date range: {combined_data['Date'].min()} to {combined_data['Date'].max()}")

        # Initialize forecaster
        forecaster = CropPriceForecaster()

        # Train models
        training_results = forecaster.train_models(combined_data)

        # Make predictions for next seasons
        predictions = forecaster.predict_seasonal_prices(combined_data, forecast_year=2026)

        # Save predictions to CSV
        predictions.to_csv('seasonal_price_predictions_2026.csv', index=False)
        print("\n✓ Predictions saved to 'seasonal_price_predictions_2026.csv'")

        # Save models for future use
        forecaster.save_models('crop_price_models.pkl')

        # Generate summary report
        print("\n" + "="*80)
        print("PREDICTION SUMMARY BY SEASON")
        print("="*80)

        for season in ['Kharif', 'Rabi']:
            season_data = predictions[predictions['Season'] == season]
            print(f"\n{season} Season 2026:")
            print(f"  Average predicted price: ₹{season_data['Predicted_Price_1KG'].mean():.2f}")
            print(f"  Highest: {season_data.loc[season_data['Predicted_Price_1KG'].idxmax(), 'Crop']} - ₹{season_data['Predicted_Price_1KG'].max():.2f}")
            print(f"  Lowest: {season_data.loc[season_data['Predicted_Price_1KG'].idxmin(), 'Crop']} - ₹{season_data['Predicted_Price_1KG'].min():.2f}")

        print("\n" + "="*80)
        print("✓ FORECASTING COMPLETE!")
        print("="*80)

    except Exception as e:
        print(f"\n✗ Error: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

MULTI-CROP SEASONAL PRICE FORECASTING SYSTEM

Loading data...
  ✓ Loaded rice_prices_processed.csv: 106 records
  ✓ Loaded wheat_prices_processed.csv: 52 records
  ✓ Loaded groundnut_prices_processed.csv: 51 records
  ✓ Loaded jowar_prices_processed.csv: 67 records
  ✓ Loaded onion_prices_processed.csv: 58 records
  ✓ Loaded potato_prices_processed.csv: 42 records
  ✓ Loaded ragi_prices_processed.csv: 55 records
  ✓ Loaded soyabean_prices_processed.csv: 25 records
  ✓ Loaded tur_prices_processed.csv: 53 records

Converting Date column to datetime format...

Total combined data: 509 records
Crops in dataset: ['Rice' 'Wheat' 'Groundnut' 'Jowar' 'Onion' 'Potato' 'Ragi' 'Soyabean'
 'Tur']
Date range: 2022-08-04 00:00:00 to 2025-10-23 00:00:00

TRAINING MODELS FOR EACH CROP
Preparing data for modeling...
Selected 30 features for modeling

Training model for: Rice
--------------------------------------------------
  Train MAE: ₹0.00 | R²: 1.000
  Test MAE:  ₹1.53 | R²: 0.912

Training model 

In [19]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

class CropPriceForecaster:
    """
    Multi-crop price forecasting system for seasonal predictions
    """

    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.crop_encoders = {}
        self.feature_columns = []

    def prepare_data(self, df):
        """
        Prepare data for modeling
        """
        print("Preparing data for modeling...")

        # Create a copy
        data = df.copy()

        # Convert Date to datetime if it's not already
        if not pd.api.types.is_datetime64_any_dtype(data['Date']):
            print("  Converting Date column to datetime...")
            try:
                data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
            except:
                try:
                    data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
                except:
                    data['Date'] = pd.to_datetime(data['Date'], infer_datetime_format=True)

        # Encode categorical variables
        le_commodity = LabelEncoder()
        data['Commodity_Encoded'] = le_commodity.fit_transform(data['Commodity'])

        le_season = LabelEncoder()
        data['Season_Encoded'] = le_season.fit_transform(data['Season'])

        # Store encoders
        self.crop_encoders['commodity'] = le_commodity
        self.crop_encoders['season'] = le_season

        # Create additional features
        data['Days_Since_Start'] = (data['Date'] - data['Date'].min()).dt.days

        # Seasonal indicators
        data['Is_Kharif'] = (data['Season'] == 'Kharif').astype(int)
        data['Is_Rabi'] = (data['Season'] == 'Rabi').astype(int)

        # Cyclical time features (sine/cosine for month)
        data['Month_Sin'] = np.sin(2 * np.pi * data['Month'] / 12)
        data['Month_Cos'] = np.cos(2 * np.pi * data['Month'] / 12)

        return data

    def select_features(self, data):
        """
        Select relevant features for modeling - simplified to reduce overfitting
        """
        # Use only the most important features to avoid overfitting
        important_features = [
            'Commodity_Encoded',
            'Season_Encoded',
            'Month',
            'Year',
            'Is_Kharif',
            'Is_Rabi',
            'Month_Sin',
            'Month_Cos',
            'Quarter',
            'Days_Since_Start',
            'Price_1KG_Lag_1',
            'Price_1KG_Lag_7',
            'Price_1KG_RollingAvg_7',
            'Price_Max',
            'Price_Min',
            'Price_Range'
        ]

        # Only keep features that exist in the data
        feature_cols = [col for col in important_features if col in data.columns]

        # Remove columns with too many NaN values (>30%)
        feature_cols = [col for col in feature_cols if data[col].isna().sum() / len(data) < 0.3]

        self.feature_columns = feature_cols
        print(f"Selected {len(feature_cols)} features for modeling")
        print(f"Features: {feature_cols}")

        return feature_cols

    def train_models(self, df, target_col='Price_1KG'):
        """
        Train separate models for each crop
        """
        print("\n" + "="*80)
        print("TRAINING MODELS FOR EACH CROP")
        print("="*80)

        data = self.prepare_data(df)
        feature_cols = self.select_features(data)

        # Get unique crops
        crops = data['Commodity'].unique()

        results = []

        for crop in crops:
            print(f"\nTraining model for: {crop}")
            print("-" * 50)

            # Filter data for this crop
            crop_data = data[data['Commodity'] == crop].copy()

            # Remove rows with NaN in target
            crop_data = crop_data.dropna(subset=[target_col])

            if len(crop_data) < 20:
                print(f"  ⚠ Insufficient data for {crop} (only {len(crop_data)} records). Skipping...")
                continue

            # Prepare features and target
            X = crop_data[feature_cols].copy()
            y = crop_data[target_col]

            # Fill NaN with forward fill then backward fill, then median
            X = X.fillna(method='ffill').fillna(method='bfill').fillna(X.median())

            # Split data (time-series aware) - use more data for training
            split_idx = int(len(X) * 0.85)  # 85/15 split
            X_train, X_test = X[:split_idx], X[split_idx:]
            y_train, y_test = y[:split_idx], y[split_idx:]

            print(f"  Training samples: {len(X_train)}, Test samples: {len(X_test)}")

            # Scale features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train model - using simpler model to reduce overfitting
            model = GradientBoostingRegressor(
                n_estimators=50,           # Reduced from 200
                learning_rate=0.1,         # Increased for faster, simpler learning
                max_depth=3,               # Reduced from 5
                min_samples_split=10,      # Increased from 5
                min_samples_leaf=5,        # Added constraint
                subsample=0.8,             # Use 80% of samples per tree
                max_features='sqrt',       # Use sqrt of features per split
                random_state=42
            )

            model.fit(X_train_scaled, y_train)

            # Evaluate
            train_pred = model.predict(X_train_scaled)
            test_pred = model.predict(X_test_scaled)

            train_mae = mean_absolute_error(y_train, train_pred)
            test_mae = mean_absolute_error(y_test, test_pred)
            train_r2 = r2_score(y_train, train_pred)
            test_r2 = r2_score(y_test, test_pred)

            # Calculate MAPE for better interpretability
            train_mape = np.mean(np.abs((y_train - train_pred) / y_train)) * 100
            test_mape = np.mean(np.abs((y_test - test_pred) / y_test)) * 100

            print(f"  Train MAE: ₹{train_mae:.2f} | R²: {train_r2:.3f} | MAPE: {train_mape:.1f}%")
            print(f"  Test MAE:  ₹{test_mae:.2f} | R²: {test_r2:.3f} | MAPE: {test_mape:.1f}%")

            # Check for overfitting
            if train_r2 > 0.95 and test_r2 < 0.5:
                print(f"  ⚠ WARNING: Severe overfitting detected!")
            elif train_mae < 0.5:
                print(f"  ⚠ WARNING: Training error suspiciously low - possible overfitting")

            # Store model and scaler
            self.models[crop] = model
            self.scalers[crop] = scaler

            results.append({
                'Crop': crop,
                'Train_MAE': train_mae,
                'Test_MAE': test_mae,
                'Train_R2': train_r2,
                'Test_R2': test_r2,
                'Train_MAPE': train_mape,
                'Test_MAPE': test_mape,
                'Training_Samples': len(X_train),
                'Test_Samples': len(X_test),
                'Overfitting': 'Yes' if (train_r2 > 0.95 and test_r2 < 0.5) else 'No'
            })

        results_df = pd.DataFrame(results)
        print("\n" + "="*80)
        print("MODEL TRAINING SUMMARY")
        print("="*80)
        print(results_df.to_string(index=False))

        return results_df

    def predict_seasonal_prices(self, df, forecast_year=2026):
        """
        Predict prices for next Kharif and Rabi seasons
        """
        print("\n" + "="*80)
        print(f"PREDICTING PRICES FOR YEAR {forecast_year}")
        print("="*80)

        data = self.prepare_data(df)
        crops = list(self.models.keys())

        predictions = []

        for crop in crops:
            print(f"\nGenerating predictions for: {crop}")

            # Get latest data for this crop
            crop_data = data[data['Commodity'] == crop].copy()
            latest_record = crop_data.iloc[-1]

            # Get historical averages for this crop
            kharif_avg = crop_data[crop_data['Season'] == 'Kharif']['Price_1KG'].mean()
            rabi_avg = crop_data[crop_data['Season'] == 'Rabi']['Price_1KG'].mean()

            # Define prediction dates for Kharif and Rabi
            seasons = [
                {
                    'season': 'Kharif',
                    'date': pd.Timestamp(f'{forecast_year}-07-15'),  # Mid-Kharif
                    'month': 7,
                    'is_kharif': 1,
                    'is_rabi': 0
                },
                {
                    'season': 'Rabi',
                    'date': pd.Timestamp(f'{forecast_year}-01-15'),  # Mid-Rabi
                    'month': 1,
                    'is_kharif': 0,
                    'is_rabi': 1
                }
            ]

            for season_info in seasons:
                # Create feature vector for prediction
                pred_features = latest_record[self.feature_columns].copy()

                # Update time-based features
                pred_features['Month'] = season_info['month']
                pred_features['Year'] = forecast_year
                pred_features['Is_Kharif'] = season_info['is_kharif']
                pred_features['Is_Rabi'] = season_info['is_rabi']
                pred_features['Season_Encoded'] = 0 if season_info['season'] == 'Kharif' else 1
                pred_features['Month_Sin'] = np.sin(2 * np.pi * season_info['month'] / 12)
                pred_features['Month_Cos'] = np.cos(2 * np.pi * season_info['month'] / 12)
                pred_features['Quarter'] = (season_info['month'] - 1) // 3 + 1

                # Calculate days since start
                days_since_start = (season_info['date'] - data['Date'].min()).days
                pred_features['Days_Since_Start'] = days_since_start

                # Fill any remaining NaN values
                pred_features = pred_features.fillna(crop_data[self.feature_columns].median())

                # Prepare for prediction
                X_pred = pred_features.values.reshape(1, -1)
                X_pred_scaled = self.scalers[crop].transform(X_pred)

                # Make prediction
                predicted_price = self.models[crop].predict(X_pred_scaled)[0]

                # Calculate confidence based on historical volatility
                historical_std = crop_data['Price_1KG'].std()
                confidence_interval = 1.96 * historical_std  # 95% CI

                predictions.append({
                    'Crop': crop,
                    'Season': season_info['season'],
                    'Year': forecast_year,
                    'Predicted_Price_1KG': round(predicted_price, 2),
                    'Lower_Bound': round(predicted_price - confidence_interval, 2),
                    'Upper_Bound': round(predicted_price + confidence_interval, 2),
                    'Historical_Avg': round(kharif_avg if season_info['season'] == 'Kharif' else rabi_avg, 2),
                    'Change_from_Avg': round(predicted_price - (kharif_avg if season_info['season'] == 'Kharif' else rabi_avg), 2)
                })

        predictions_df = pd.DataFrame(predictions)

        print("\n" + "="*80)
        print("SEASONAL PRICE PREDICTIONS")
        print("="*80)
        print(predictions_df.to_string(index=False))

        return predictions_df

    def save_models(self, filepath='crop_price_models.pkl'):
        """
        Save trained models and scalers
        """
        model_data = {
            'models': self.models,
            'scalers': self.scalers,
            'crop_encoders': self.crop_encoders,
            'feature_columns': self.feature_columns
        }
        joblib.dump(model_data, filepath)
        print(f"\n✓ Models saved to {filepath}")

    def load_models(self, filepath='crop_price_models_v2.pkl'):
        """
        Load trained models and scalers
        """
        model_data = joblib.load(filepath)
        self.models = model_data['models']
        self.scalers = model_data['scalers']
        self.crop_encoders = model_data['crop_encoders']
        self.feature_columns = model_data['feature_columns']
        print(f"✓ Models loaded from {filepath}")


def main():
    """
    Main execution function
    """
    print("="*80)
    print("MULTI-CROP SEASONAL PRICE FORECASTING SYSTEM")
    print("="*80)

    # Load processed data
    input_files = [
        'rice_prices_processed.csv',
        'wheat_prices_processed.csv',
        'groundnut_prices_processed.csv','jowar_prices_processed.csv','onion_prices_processed.csv','potato_prices_processed.csv',
        'ragi_prices_processed.csv','soyabean_prices_processed.csv','tur_prices_processed.csv'
        # Add more crop files here
    ]

    # For demonstration, we'll use rice data
    # Replace this with actual loading of all 8 crops
    print("\nLoading data...")

    try:
        # Load all crop data and combine
        all_data = []
        for file in input_files:
            try:
                df = pd.read_csv(file)
                all_data.append(df)
                print(f"  ✓ Loaded {file}: {len(df)} records")
            except FileNotFoundError:
                print(f"  ⚠ File not found: {file}")
                continue

        if not all_data:
            print("\n⚠ No data files found. Using sample data...")
            # Load single file for demonstration
            df = pd.read_csv('rice_prices_processed.csv')
            all_data = [df]

        # Combine all crop data
        combined_data = pd.concat(all_data, ignore_index=True)

        # Ensure Date column is datetime
        if not pd.api.types.is_datetime64_any_dtype(combined_data['Date']):
            print("\nConverting Date column to datetime format...")
            try:
                combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='%d/%m/%Y')
            except:
                try:
                    combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='%Y-%m-%d')
                except:
                    combined_data['Date'] = pd.to_datetime(combined_data['Date'], infer_datetime_format=True)

        print(f"\nTotal combined data: {len(combined_data)} records")
        print(f"Crops in dataset: {combined_data['Commodity'].unique()}")
        print(f"Date range: {combined_data['Date'].min()} to {combined_data['Date'].max()}")

        # Initialize forecaster
        forecaster = CropPriceForecaster()

        # Train models
        training_results = forecaster.train_models(combined_data)

        # Make predictions for next seasons
        predictions = forecaster.predict_seasonal_prices(combined_data, forecast_year=2026)

        # Save predictions to CSV
        predictions.to_csv('seasonal_price_predictions_2026_v2.csv', index=False)
        print("\n✓ Predictions saved to 'seasonal_price_predictions_2026_v2.csv'")

        # Save models for future use
        forecaster.save_models('crop_price_models.pkl')

        # Generate summary report
        print("\n" + "="*80)
        print("PREDICTION SUMMARY BY SEASON")
        print("="*80)

        for season in ['Kharif', 'Rabi']:
            season_data = predictions[predictions['Season'] == season]
            print(f"\n{season} Season 2026:")
            print(f"  Average predicted price: ₹{season_data['Predicted_Price_1KG'].mean():.2f}")
            print(f"  Highest: {season_data.loc[season_data['Predicted_Price_1KG'].idxmax(), 'Crop']} - ₹{season_data['Predicted_Price_1KG'].max():.2f}")
            print(f"  Lowest: {season_data.loc[season_data['Predicted_Price_1KG'].idxmin(), 'Crop']} - ₹{season_data['Predicted_Price_1KG'].min():.2f}")

        print("\n" + "="*80)
        print("✓ FORECASTING COMPLETE!")
        print("="*80)

    except Exception as e:
        print(f"\n✗ Error: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

MULTI-CROP SEASONAL PRICE FORECASTING SYSTEM

Loading data...
  ✓ Loaded rice_prices_processed.csv: 106 records
  ✓ Loaded wheat_prices_processed.csv: 52 records
  ✓ Loaded groundnut_prices_processed.csv: 51 records
  ✓ Loaded jowar_prices_processed.csv: 67 records
  ✓ Loaded onion_prices_processed.csv: 58 records
  ✓ Loaded potato_prices_processed.csv: 42 records
  ✓ Loaded ragi_prices_processed.csv: 55 records
  ✓ Loaded soyabean_prices_processed.csv: 25 records
  ✓ Loaded tur_prices_processed.csv: 53 records

Converting Date column to datetime format...

Total combined data: 509 records
Crops in dataset: ['Rice' 'Wheat' 'Groundnut' 'Jowar' 'Onion' 'Potato' 'Ragi' 'Soyabean'
 'Tur']
Date range: 2022-08-04 00:00:00 to 2025-10-23 00:00:00

TRAINING MODELS FOR EACH CROP
Preparing data for modeling...
Selected 16 features for modeling
Features: ['Commodity_Encoded', 'Season_Encoded', 'Month', 'Year', 'Is_Kharif', 'Is_Rabi', 'Month_Sin', 'Month_Cos', 'Quarter', 'Days_Since_Start', 'Price_

In [21]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

class CropPriceForecaster:
    """
    Multi-crop price forecasting system for seasonal predictions
    """

    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.crop_encoders = {}
        self.feature_columns = []

    def prepare_data(self, df):
        """
        Prepare data for modeling
        """
        print("Preparing data for modeling...")

        # Create a copy
        data = df.copy()

        # Convert Date to datetime if it's not already
        if not pd.api.types.is_datetime64_any_dtype(data['Date']):
            print("  Converting Date column to datetime...")
            try:
                data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
            except:
                try:
                    data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
                except:
                    data['Date'] = pd.to_datetime(data['Date'], infer_datetime_format=True)

        # Encode categorical variables
        le_commodity = LabelEncoder()
        data['Commodity_Encoded'] = le_commodity.fit_transform(data['Commodity'])

        le_season = LabelEncoder()
        data['Season_Encoded'] = le_season.fit_transform(data['Season'])

        # Store encoders
        self.crop_encoders['commodity'] = le_commodity
        self.crop_encoders['season'] = le_season

        # Create additional features
        data['Days_Since_Start'] = (data['Date'] - data['Date'].min()).dt.days

        # Seasonal indicators
        data['Is_Kharif'] = (data['Season'] == 'Kharif').astype(int)
        data['Is_Rabi'] = (data['Season'] == 'Rabi').astype(int)

        # Cyclical time features (sine/cosine for month)
        data['Month_Sin'] = np.sin(2 * np.pi * data['Month'] / 12)
        data['Month_Cos'] = np.cos(2 * np.pi * data['Month'] / 12)

        return data

    def select_features(self, data):
        """
        Select relevant features for modeling - simplified to reduce overfitting
        """
        # Use only the most important features to avoid overfitting
        important_features = [
            'Commodity_Encoded',
            'Season_Encoded',
            'Month',
            'Year',
            'Is_Kharif',
            'Is_Rabi',
            'Month_Sin',
            'Month_Cos',
            'Quarter',
            'Days_Since_Start',
            'Price_1KG_Lag_1',
            'Price_1KG_Lag_7',
            'Price_1KG_RollingAvg_7',
            'Price_Max',
            'Price_Min',
            'Price_Range'
        ]

        # Only keep features that exist in the data
        feature_cols = [col for col in important_features if col in data.columns]

        # Remove columns with too many NaN values (>30%)
        feature_cols = [col for col in feature_cols if data[col].isna().sum() / len(data) < 0.3]

        self.feature_columns = feature_cols
        print(f"Selected {len(feature_cols)} features for modeling")
        print(f"Features: {feature_cols}")

        return feature_cols

    def train_models(self, df, target_col='Price_1KG', use_unified_model=True):
        """
        Train models - can use either separate models per crop or one unified model

        Args:
            use_unified_model: If True, trains one model for all crops (better for small datasets)
                              If False, trains separate models per crop
        """
        print("\n" + "="*80)
        if use_unified_model:
            print("TRAINING UNIFIED MODEL FOR ALL CROPS (Recommended for small datasets)")
        else:
            print("TRAINING SEPARATE MODELS FOR EACH CROP")
        print("="*80)

        data = self.prepare_data(df)
        feature_cols = self.select_features(data)

        # Get unique crops
        crops = data['Commodity'].unique()

        if use_unified_model:
            return self._train_unified_model(data, crops, feature_cols, target_col)
        else:
            return self._train_separate_models(data, crops, feature_cols, target_col)

    def _train_unified_model(self, data, crops, feature_cols, target_col):
        """
        Train one model for all crops together
        """
        print(f"\nTraining unified model on {len(data)} total samples...")
        print("-" * 50)

        # Remove rows with NaN in target
        data = data.dropna(subset=[target_col])

        # Prepare features and target
        X = data[feature_cols].copy()
        y = data[target_col]

        # Fill NaN
        X = X.fillna(method='ffill').fillna(method='bfill').fillna(X.median())

        # Split data (time-series aware)
        split_idx = int(len(X) * 0.85)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]

        # Keep track of which crop each sample belongs to for evaluation
        crop_train = data['Commodity'][:split_idx]
        crop_test = data['Commodity'][split_idx:]

        print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train unified model
        model = GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.08,
            max_depth=4,
            min_samples_split=15,
            min_samples_leaf=8,
            subsample=0.8,
            max_features='sqrt',
            random_state=42
        )

        model.fit(X_train_scaled, y_train)

        # Evaluate
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        train_mae = mean_absolute_error(y_train, train_pred)
        test_mae = mean_absolute_error(y_test, test_pred)
        train_r2 = r2_score(y_train, train_pred)
        test_r2 = r2_score(y_test, test_pred)
        train_mape = np.mean(np.abs((y_train - train_pred) / y_train)) * 100
        test_mape = np.mean(np.abs((y_test - test_pred) / y_test)) * 100

        print(f"\nOverall Performance:")
        print(f"  Train MAE: ₹{train_mae:.2f} | R²: {train_r2:.3f} | MAPE: {train_mape:.1f}%")
        print(f"  Test MAE:  ₹{test_mae:.2f} | R²: {test_r2:.3f} | MAPE: {test_mape:.1f}%")

        # Store unified model for all crops
        for crop in crops:
            self.models[crop] = model
            self.scalers[crop] = scaler

        # Evaluate per crop on test set
        print(f"\nPer-Crop Test Performance:")
        print("-" * 50)

        results = []
        for crop in crops:
            crop_mask = crop_test == crop
            if crop_mask.sum() > 0:
                crop_y_test = y_test[crop_mask]
                crop_pred = test_pred[crop_mask]

                crop_mae = mean_absolute_error(crop_y_test, crop_pred)
                crop_r2 = r2_score(crop_y_test, crop_pred) if len(crop_y_test) > 1 else 0
                crop_mape = np.mean(np.abs((crop_y_test - crop_pred) / crop_y_test)) * 100

                print(f"  {crop:15s} - MAE: ₹{crop_mae:6.2f} | R²: {crop_r2:6.3f} | MAPE: {crop_mape:5.1f}% | Samples: {crop_mask.sum()}")

                results.append({
                    'Crop': crop,
                    'Test_MAE': crop_mae,
                    'Test_R2': crop_r2,
                    'Test_MAPE': crop_mape,
                    'Test_Samples': crop_mask.sum(),
                    'Model_Type': 'Unified'
                })

        results_df = pd.DataFrame(results)
        return results_df

    def _train_separate_models(self, data, crops, feature_cols, target_col):
        """
        Train separate models for each crop (original approach)
        """
        results = []

        for crop in crops:
            print(f"\nTraining model for: {crop}")
            print("-" * 50)

            # Filter data for this crop
            crop_data = data[data['Commodity'] == crop].copy()

            # Remove rows with NaN in target
            crop_data = crop_data.dropna(subset=[target_col])

            if len(crop_data) < 20:
                print(f"  ⚠ Insufficient data for {crop} (only {len(crop_data)} records). Skipping...")
                continue

            # Prepare features and target
            X = crop_data[feature_cols].copy()
            y = crop_data[target_col]

            # Fill NaN with forward fill then backward fill, then median
            X = X.fillna(method='ffill').fillna(method='bfill').fillna(X.median())

            # Split data (time-series aware) - use more data for training
            split_idx = int(len(X) * 0.85)  # 85/15 split
            X_train, X_test = X[:split_idx], X[split_idx:]
            y_train, y_test = y[:split_idx], y[split_idx:]

            print(f"  Training samples: {len(X_train)}, Test samples: {len(X_test)}")

            # Scale features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train model - using simpler model to reduce overfitting
            model = GradientBoostingRegressor(
                n_estimators=50,           # Reduced from 200
                learning_rate=0.1,         # Increased for faster, simpler learning
                max_depth=3,               # Reduced from 5
                min_samples_split=10,      # Increased from 5
                min_samples_leaf=5,        # Added constraint
                subsample=0.8,             # Use 80% of samples per tree
                max_features='sqrt',       # Use sqrt of features per split
                random_state=42
            )

            model.fit(X_train_scaled, y_train)

            # Evaluate
            train_pred = model.predict(X_train_scaled)
            test_pred = model.predict(X_test_scaled)

            train_mae = mean_absolute_error(y_train, train_pred)
            test_mae = mean_absolute_error(y_test, test_pred)
            train_r2 = r2_score(y_train, train_pred)
            test_r2 = r2_score(y_test, test_pred)

            # Calculate MAPE for better interpretability
            train_mape = np.mean(np.abs((y_train - train_pred) / y_train)) * 100
            test_mape = np.mean(np.abs((y_test - test_pred) / y_test)) * 100

            print(f"  Train MAE: ₹{train_mae:.2f} | R²: {train_r2:.3f} | MAPE: {train_mape:.1f}%")
            print(f"  Test MAE:  ₹{test_mae:.2f} | R²: {test_r2:.3f} | MAPE: {test_mape:.1f}%")

            # Check for overfitting
            if train_r2 > 0.95 and test_r2 < 0.5:
                print(f"  ⚠ WARNING: Severe overfitting detected!")
            elif train_mae < 0.5:
                print(f"  ⚠ WARNING: Training error suspiciously low - possible overfitting")

            # Store model and scaler
            self.models[crop] = model
            self.scalers[crop] = scaler

            results.append({
                'Crop': crop,
                'Train_MAE': train_mae,
                'Test_MAE': test_mae,
                'Train_R2': train_r2,
                'Test_R2': test_r2,
                'Train_MAPE': train_mape,
                'Test_MAPE': test_mape,
                'Training_Samples': len(X_train),
                'Test_Samples': len(X_test),
                'Overfitting': 'Yes' if (train_r2 > 0.95 and test_r2 < 0.5) else 'No'
            })

        results_df = pd.DataFrame(results)
        print("\n" + "="*80)
        print("MODEL TRAINING SUMMARY")
        print("="*80)
        print(results_df.to_string(index=False))

        return results_df

    def predict_seasonal_prices(self, df, forecast_year=2026):
        """
        Predict prices for next Kharif and Rabi seasons
        """
        print("\n" + "="*80)
        print(f"PREDICTING PRICES FOR YEAR {forecast_year}")
        print("="*80)

        data = self.prepare_data(df)
        crops = list(self.models.keys())

        predictions = []

        for crop in crops:
            print(f"\nGenerating predictions for: {crop}")

            # Get latest data for this crop
            crop_data = data[data['Commodity'] == crop].copy()
            latest_record = crop_data.iloc[-1]

            # Get historical averages for this crop
            kharif_avg = crop_data[crop_data['Season'] == 'Kharif']['Price_1KG'].mean()
            rabi_avg = crop_data[crop_data['Season'] == 'Rabi']['Price_1KG'].mean()

            # Define prediction dates for Kharif and Rabi
            seasons = [
                {
                    'season': 'Kharif',
                    'date': pd.Timestamp(f'{forecast_year}-07-15'),  # Mid-Kharif
                    'month': 7,
                    'is_kharif': 1,
                    'is_rabi': 0
                },
                {
                    'season': 'Rabi',
                    'date': pd.Timestamp(f'{forecast_year}-01-15'),  # Mid-Rabi
                    'month': 1,
                    'is_kharif': 0,
                    'is_rabi': 1
                }
            ]

            for season_info in seasons:
                # Create feature vector for prediction
                pred_features = latest_record[self.feature_columns].copy()

                # Update time-based features
                pred_features['Month'] = season_info['month']
                pred_features['Year'] = forecast_year
                pred_features['Is_Kharif'] = season_info['is_kharif']
                pred_features['Is_Rabi'] = season_info['is_rabi']
                pred_features['Season_Encoded'] = 0 if season_info['season'] == 'Kharif' else 1
                pred_features['Month_Sin'] = np.sin(2 * np.pi * season_info['month'] / 12)
                pred_features['Month_Cos'] = np.cos(2 * np.pi * season_info['month'] / 12)
                pred_features['Quarter'] = (season_info['month'] - 1) // 3 + 1

                # Calculate days since start
                days_since_start = (season_info['date'] - data['Date'].min()).days
                pred_features['Days_Since_Start'] = days_since_start

                # Fill any remaining NaN values
                pred_features = pred_features.fillna(crop_data[self.feature_columns].median())

                # Prepare for prediction
                X_pred = pred_features.values.reshape(1, -1)
                X_pred_scaled = self.scalers[crop].transform(X_pred)

                # Make prediction
                predicted_price = self.models[crop].predict(X_pred_scaled)[0]

                # Calculate confidence based on historical volatility
                historical_std = crop_data['Price_1KG'].std()
                confidence_interval = 1.96 * historical_std  # 95% CI

                predictions.append({
                    'Crop': crop,
                    'Season': season_info['season'],
                    'Year': forecast_year,
                    'Predicted_Price_1KG': round(predicted_price, 2),
                    'Lower_Bound': round(predicted_price - confidence_interval, 2),
                    'Upper_Bound': round(predicted_price + confidence_interval, 2),
                    'Historical_Avg': round(kharif_avg if season_info['season'] == 'Kharif' else rabi_avg, 2),
                    'Change_from_Avg': round(predicted_price - (kharif_avg if season_info['season'] == 'Kharif' else rabi_avg), 2)
                })

        predictions_df = pd.DataFrame(predictions)

        print("\n" + "="*80)
        print("SEASONAL PRICE PREDICTIONS")
        print("="*80)
        print(predictions_df.to_string(index=False))

        return predictions_df

    def save_models(self, filepath='crop.pkl'):
        """
        Save trained models and scalers
        """
        model_data = {
            'models': self.models,
            'scalers': self.scalers,
            'crop_encoders': self.crop_encoders,
            'feature_columns': self.feature_columns
        }
        joblib.dump(model_data, filepath)
        print(f"\n✓ Models saved to {filepath}")

    def load_models(self, filepath='crop.pkl'):
        """
        Load trained models and scalers
        """
        model_data = joblib.load(filepath)
        self.models = model_data['models']
        self.scalers = model_data['scalers']
        self.crop_encoders = model_data['crop_encoders']
        self.feature_columns = model_data['feature_columns']
        print(f"✓ Models loaded from {filepath}")


def main():
    """
    Main execution function
    """
    print("="*80)
    print("MULTI-CROP SEASONAL PRICE FORECASTING SYSTEM")
    print("="*80)

    # Load processed data
    input_files = [
        'rice_prices_processed.csv',
        'wheat_prices_processed.csv',
        'groundnut_prices_processed.csv','jowar_prices_processed.csv','onion_prices_processed.csv','potato_prices_processed.csv',
        'ragi_prices_processed.csv','soyabean_prices_processed.csv','tur_prices_processed.csv'
        # Add more crop files here
    ]

    # For demonstration, we'll use rice data
    # Replace this with actual loading of all 8 crops
    print("\nLoading data...")

    try:
        # Load all crop data and combine
        all_data = []
        for file in input_files:
            try:
                df = pd.read_csv(file)
                all_data.append(df)
                print(f"  ✓ Loaded {file}: {len(df)} records")
            except FileNotFoundError:
                print(f"  ⚠ File not found: {file}")
                continue

        if not all_data:
            print("\n⚠ No data files found. Using sample data...")
            # Load single file for demonstration
            df = pd.read_csv('rice_prices_processed.csv')
            all_data = [df]

        # Combine all crop data
        combined_data = pd.concat(all_data, ignore_index=True)

        # Ensure Date column is datetime
        if not pd.api.types.is_datetime64_any_dtype(combined_data['Date']):
            print("\nConverting Date column to datetime format...")
            try:
                combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='%d/%m/%Y')
            except:
                try:
                    combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='%Y-%m-%d')
                except:
                    combined_data['Date'] = pd.to_datetime(combined_data['Date'], infer_datetime_format=True)

        print(f"\nTotal combined data: {len(combined_data)} records")
        print(f"Crops in dataset: {combined_data['Commodity'].unique()}")
        print(f"Date range: {combined_data['Date'].min()} to {combined_data['Date'].max()}")

        # Initialize forecaster
        forecaster = CropPriceForecaster()

        # Train models - use unified model for better generalization
        print("\nChoose training approach:")
        print("1. Unified Model (ONE model for all crops - recommended for small datasets)")
        print("2. Separate Models (individual model per crop - needs more data)")

        # Default to unified model (better for small datasets)
        use_unified = True  # Change to False for separate models

        training_results = forecaster.train_models(combined_data, use_unified_model=use_unified)

        # Make predictions for next seasons
        predictions = forecaster.predict_seasonal_prices(combined_data, forecast_year=2026)

        # Save predictions to CSV
        predictions.to_csv('seasonal_price_predictions_2026_v3.csv', index=False)
        print("\n✓ Predictions saved to 'seasonal_price_predictions_2026.csv'")

        # Save models for future use
        forecaster.save_models('crop_price_models.pkl')

        # Generate summary report
        print("\n" + "="*80)
        print("PREDICTION SUMMARY BY SEASON")
        print("="*80)

        for season in ['Kharif', 'Rabi']:
            season_data = predictions[predictions['Season'] == season]
            print(f"\n{season} Season 2026:")
            print(f"  Average predicted price: ₹{season_data['Predicted_Price_1KG'].mean():.2f}")
            print(f"  Highest: {season_data.loc[season_data['Predicted_Price_1KG'].idxmax(), 'Crop']} - ₹{season_data['Predicted_Price_1KG'].max():.2f}")
            print(f"  Lowest: {season_data.loc[season_data['Predicted_Price_1KG'].idxmin(), 'Crop']} - ₹{season_data['Predicted_Price_1KG'].min():.2f}")

        print("\n" + "="*80)
        print("✓ FORECASTING COMPLETE!")
        print("="*80)

    except Exception as e:
        print(f"\n✗ Error: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

MULTI-CROP SEASONAL PRICE FORECASTING SYSTEM

Loading data...
  ✓ Loaded rice_prices_processed.csv: 106 records
  ✓ Loaded wheat_prices_processed.csv: 52 records
  ✓ Loaded groundnut_prices_processed.csv: 51 records
  ✓ Loaded jowar_prices_processed.csv: 67 records
  ✓ Loaded onion_prices_processed.csv: 58 records
  ✓ Loaded potato_prices_processed.csv: 42 records
  ✓ Loaded ragi_prices_processed.csv: 55 records
  ✓ Loaded soyabean_prices_processed.csv: 25 records
  ✓ Loaded tur_prices_processed.csv: 53 records

Converting Date column to datetime format...

Total combined data: 509 records
Crops in dataset: ['Rice' 'Wheat' 'Groundnut' 'Jowar' 'Onion' 'Potato' 'Ragi' 'Soyabean'
 'Tur']
Date range: 2022-08-04 00:00:00 to 2025-10-23 00:00:00

Choose training approach:
1. Unified Model (ONE model for all crops - recommended for small datasets)
2. Separate Models (individual model per crop - needs more data)

TRAINING UNIFIED MODEL FOR ALL CROPS (Recommended for small datasets)
Preparing da