# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Statistical and Time Series Libraries
from scipy import stats
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.vector_ar.vecm import coint_johansen

# Visualization Libraries
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo

class SimplifiedEnhancedSoybeanAnalysis:
    """
    Simplified Enhanced Comprehensive Soybean Market Analysis System

    Features:
    1. Descriptive Statistics, Correlation & Regression
    2. Comprehensive Johansen Co-integration Test
    3. ARIMA/SARIMA with detailed AIC explanation
    4. Multiple ML Models: Logistic Regression, Random Forest, Linear Regression
    """

    def __init__(self):
        self.data = {}
        self.markets = ['Haveri', 'Kalagategi', 'Bidar', 'Kalaburgi', 'Bailhongal']
        self.results = {
            'descriptive_stats': {},
            'correlation_matrix': None,
            'regression_results': {},
            'cointegration_results': {},
            'cointegration_tables': {},
            'arima_models': {},
            'arima_explanations': {},
            'forecasts': {},
            'ml_models': {
                'logistic_regression': {},
                'random_forest': {},
                'linear_regression': {}
            },
            'model_comparisons': {},
            'regression_comparisons': {}  # Separate for regression metrics
        }

    def load_real_data(self):
        """Load real datasets from Excel files in /content/"""
        print("Loading real data from Excel files in /content/...")

        market_files = {
            'Haveri': '/content/haveri.xlsx',
            'Kalagategi': '/content/kalagategi.xlsx',
            'Bidar': '/content/Bidar.xlsx',
            'Kalaburgi': '/content/kalaburgi.xlsx',
            'Bailhongal': '/content/bailhongal.xlsx'
        }

        for market, file_path in market_files.items():
            try:
                # Read the specific sheet with header=1 (skipping title row)
                df = pd.read_excel(file_path, header=1, sheet_name='Agmarknet_Price_And_Arrival_Rep')

                # Filter to Soyabeen variety only
                df = df[df['Variety'] == 'Soyabeen'].copy()

                if len(df) == 0:
                    raise ValueError("No Soyabeen data found")

                # Handle date conversion: if numeric (Excel serial), convert; else assume already datetime
                if pd.api.types.is_numeric_dtype(df['Reported Date']):
                    df['Reported Date'] = pd.to_datetime(df['Reported Date'], origin='1899-12-30')
                else:
                    df['Reported Date'] = pd.to_datetime(df['Reported Date'])

                # Sort by date
                df = df.sort_values('Reported Date').reset_index(drop=True)

                self.data[market] = df
                print(f"✓ Loaded {market} data from {file_path}: {len(df)} Soyabeen records (date range: {df['Reported Date'].min().date()} to {df['Reported Date'].max().date()})")
            except FileNotFoundError:
                print(f"✗ File {file_path} not found - skipping {market}")
                self.data[market] = pd.DataFrame()
            except Exception as e:
                print(f"✗ Error loading {file_path}: {e} - skipping {market}")
                self.data[market] = pd.DataFrame()

    def objective_1_descriptive_analysis(self):
        """Enhanced descriptive statistics analysis"""
        print("\n" + "="*60)
        print("OBJECTIVE 1: ENHANCED DESCRIPTIVE STATISTICS")
        print("="*60)

        for market in self.markets:
            if market in self.data and len(self.data[market]) > 0:
                df = self.data[market]
                if 'Modal Price (Rs./Quintal)' not in df.columns or 'Arrivals (Tonnes)' not in df.columns:
                    print(f"Warning: Required columns missing in {market} - skipping stats.")
                    continue
                stats_data = {
                    'Market': market,
                    'Count': len(df),
                    'Mean_Price': df['Modal Price (Rs./Quintal)'].mean(),
                    'Std_Price': df['Modal Price (Rs./Quintal)'].std(),
                    'Min_Price': df['Modal Price (Rs./Quintal)'].min(),
                    'Max_Price': df['Modal Price (Rs./Quintal)'].max(),
                    'Mean_Arrivals': df['Arrivals (Tonnes)'].mean(),
                    'Std_Arrivals': df['Arrivals (Tonnes)'].std(),
                    'Min_Arrivals': df['Arrivals (Tonnes)'].min(),
                    'Max_Arrivals': df['Arrivals (Tonnes)'].max(),
                    'Median_Price': df['Modal Price (Rs./Quintal)'].median(),
                    'IQR_Price': df['Modal Price (Rs./Quintal)'].quantile(0.75) - df['Modal Price (Rs./Quintal)'].quantile(0.25),
                    'Skewness_Price': df['Modal Price (Rs./Quintal)'].skew(),
                    'Kurtosis_Price': df['Modal Price (Rs./Quintal)'].kurtosis(),
                    'CV_Price': (df['Modal Price (Rs./Quintal)'].std() / df['Modal Price (Rs./Quintal)'].mean()) * 100
                }

                self.results['descriptive_stats'][market] = stats_data

                print(f"\n{market} Market Summary:")
                print(f"  Records: {stats_data['Count']:,}")
                print(f"  Price (Rs/Qt): {stats_data['Mean_Price']:.2f} ± {stats_data['Std_Price']:.2f}")
                print(f"  CV: {stats_data['CV_Price']:.1f}%")
                print(f"  Skewness: {stats_data['Skewness_Price']:.3f}")
            else:
                print(f"\n{market}: No data loaded - skipping.")

        # Correlation analysis
        self.correlation_analysis()

    def correlation_analysis(self):
        """Perform correlation analysis"""
        print("\n" + "-"*40)
        print("CORRELATION ANALYSIS")
        print("-"*40)

        correlation_data = []

        for market in self.markets:
            if market in self.data and len(self.data[market]) > 0:
                df = self.data[market]
                if 'Modal Price (Rs./Quintal)' in df.columns and 'Arrivals (Tonnes)' in df.columns:
                    corr = df['Modal Price (Rs./Quintal)'].corr(df['Arrivals (Tonnes)'])
                    correlation_data.append({
                        'Market': market,
                        'Price_Arrivals_Correlation': corr
                    })
                    print(f"{market}: Price-Arrivals Correlation = {corr:.4f}")
                else:
                    print(f"{market}: Required columns missing - skipping correlation.")
            else:
                print(f"{market}: No data - skipping correlation.")

        self.results['correlation_data'] = correlation_data

    def objective_2_comprehensive_cointegration_analysis(self):
        """Comprehensive Johansen cointegration test"""
        print("\n" + "="*60)
        print("OBJECTIVE 2: COMPREHENSIVE JOHANSEN CO-INTEGRATION")
        print("="*60)

        # Prepare price data
        price_data = pd.DataFrame()
        loaded_markets = []

        for market in self.markets:
            if market in self.data and len(self.data[market]) > 0:
                df = self.data[market]
                if 'Modal Price (Rs./Quintal)' in df.columns:
                    monthly_data = df.set_index('Reported Date').resample('M')['Modal Price (Rs./Quintal)'].mean()
                    price_data[market] = monthly_data
                    loaded_markets.append(market)

        price_data = price_data.dropna()

        if len(price_data.columns) >= 2:
            try:
                johansen_result = coint_johansen(price_data.values, det_order=0, k_ar_diff=1)

                self.results['cointegration_results'] = {
                    'trace_stats': johansen_result.lr1,
                    'max_eigen_stats': johansen_result.lr2,
                    'critical_values_trace': johansen_result.cvt,
                    'critical_values_max_eigen': johansen_result.cvm,
                    'eigenvalues': johansen_result.eig,
                    'markets': list(price_data.columns)
                }

                # Create detailed tables
                self.create_cointegration_tables(johansen_result, price_data.columns)

                print("✓ Cointegration analysis completed")

            except Exception as e:
                print(f"Error in Johansen test: {e}")
        else:
            print(f"Insufficient data (loaded {len(loaded_markets)} markets, need at least 2) - skipping cointegration.")

    def create_cointegration_tables(self, johansen_result, markets):
        """Create detailed cointegration tables"""

        trace_table = []
        for i, (stat, cv) in enumerate(zip(johansen_result.lr1, johansen_result.cvt)):
            trace_table.append({
                'Null_Hypothesis': f'r ≤ {i}',
                'Alternative': f'r > {i}',
                'Trace_Statistic': stat,
                'Critical_Value_5': cv[1],
                'Result_5': 'Reject H0' if stat > cv[1] else 'Accept H0'
            })

        max_eigen_table = []
        for i, (stat, cv) in enumerate(zip(johansen_result.lr2, johansen_result.cvm)):
            max_eigen_table.append({
                'Null_Hypothesis': f'r = {i}',
                'Alternative': f'r = {i+1}',
                'Max_Eigen_Statistic': stat,
                'Critical_Value_5': cv[1],
                'Result_5': 'Reject H0' if stat > cv[1] else 'Accept H0'
            })

        summary_stats = {
            'Number_of_Variables': len(markets),
            'Markets_Analyzed': list(markets),
            'Eigenvalues': list(johansen_result.eig),
            'Number_of_Cointegrating_Relations': sum(1 for i, (stat, cv) in enumerate(zip(johansen_result.lr1, johansen_result.cvt)) if stat > cv[1])
        }

        self.results['cointegration_tables'] = {
            'trace_table': trace_table,
            'max_eigen_table': max_eigen_table,
            'summary_stats': summary_stats,
            'interpretation': self.generate_cointegration_interpretation(summary_stats['Number_of_Cointegrating_Relations'])
        }

    def generate_cointegration_interpretation(self, num_relations):
        """Generate interpretation of cointegration results"""

        if num_relations == 0:
            return {
                'conclusion': 'No Cointegration',
                'meaning': 'Markets operate independently',
                'implications': [
                    'Price shocks are market-specific',
                    'Arbitrage opportunities may exist',
                    'No error correction mechanism'
                ],
                'policy_implications': [
                    'Market-specific policies are effective',
                    'Transportation costs may be significant'
                ]
            }
        elif num_relations == 1:
            return {
                'conclusion': 'One Cointegrating Relationship',
                'meaning': 'Markets share one common equilibrium relationship',
                'implications': [
                    'Markets move together in the long run',
                    'Price shocks are eventually corrected',
                    'Limited arbitrage opportunities'
                ],
                'policy_implications': [
                    'Coordinated policy interventions are effective',
                    'Market integration is moderate to strong'
                ]
            }
        else:
            return {
                'conclusion': f'{num_relations} Cointegrating Relationships',
                'meaning': 'Markets are highly integrated',
                'implications': [
                    'Strong market integration',
                    'Rapid price transmission',
                    'Very limited arbitrage opportunities'
                ],
                'policy_implications': [
                    'Regional policy coordination is essential',
                    'High market efficiency'
                ]
            }

    def objective_3_enhanced_arima_forecasting(self):
        """Enhanced ARIMA analysis with AIC explanations"""
        print("\n" + "="*60)
        print("OBJECTIVE 3: ENHANCED ARIMA WITH AIC EXPLANATIONS")
        print("="*60)

        for market in self.markets:
            if market in self.data and len(self.data[market]) > 0:
                print(f"\n{'-'*40}")
                print(f"ARIMA MODELING FOR {market.upper()}")
                print(f"{'-'*40}")

                df = self.data[market]
                if 'Modal Price (Rs./Quintal)' not in df.columns:
                    print(f"Required column missing in {market} - skipping ARIMA.")
                    continue
                ts_data = df.set_index('Reported Date')['Modal Price (Rs./Quintal)'].resample('W').mean()
                ts_data = ts_data.dropna()

                if len(ts_data) < 20:
                    print(f"Insufficient data for {market} (<20 weeks) - skipping ARIMA.")
                    continue

                # Stationarity tests
                adf_result = adfuller(ts_data)
                d = 1 if adf_result[1] > 0.05 else 0

                # Model selection
                best_aic = float('inf')
                best_model = None
                best_params = None
                models_tested = []

                for p in range(0, 4):
                    for q in range(0, 4):
                        try:
                            model = ARIMA(ts_data, order=(p, d, q))
                            fitted_model = model.fit()

                            aic = fitted_model.aic
                            models_tested.append({
                                'order': (p, d, q),
                                'AIC': aic,
                                'BIC': fitted_model.bic,
                                'log_likelihood': fitted_model.llf,
                                'parameters': len(fitted_model.params)
                            })

                            if aic < best_aic:
                                best_aic = aic
                                best_model = fitted_model
                                best_params = (p, d, q)

                        except:
                            continue

                if best_model is not None:
                    # Generate forecasts
                    forecast_periods = 12
                    forecast = best_model.forecast(steps=forecast_periods)

                    self.results['arima_models'][market] = {
                        'model': best_model,
                        'best_params': best_params,
                        'aic': best_aic,
                        'bic': best_model.bic,
                        'forecast': forecast,
                        'models_tested': models_tested,
                        'log_likelihood': best_model.llf,
                        'parameters_count': len(best_model.params)
                    }

                    # Generate AIC explanation
                    explanation = self.generate_aic_explanation(best_params, best_aic, models_tested, market)
                    self.results['arima_explanations'][market] = explanation

                    print(f"✓ Best Model: ARIMA{best_params}, AIC: {best_aic:.2f}")
                else:
                    print(f"No valid ARIMA model fitted for {market}.")
            else:
                print(f"{market}: No data - skipping ARIMA.")

    def generate_aic_explanation(self, best_params, best_aic, models_tested, market):
        """Generate detailed AIC explanation"""

        p, d, q = best_params

        explanation = {
            'selected_model': f'ARIMA({p},{d},{q})',
            'aic_value': best_aic,
            'why_this_model': [
                f"AR(p={p}): {'Uses past {p} price values' if p > 0 else 'No autoregressive component'}",
                f"I(d={d}): {'Data differenced {d} time(s)' if d > 0 else 'Data is stationary'}",
                f"MA(q={q}): {'Uses past {q} forecast errors' if q > 0 else 'No moving average component'}"
            ],
            'model_complexity': 'Simple' if p+q <= 3 else 'Moderate' if p+q <= 5 else 'Complex',
            'total_models_tested': len(models_tested)
        }

        return explanation

    def enhanced_ml_analysis(self):
        """Enhanced ML analysis with Logistic Regression, Random Forest, and Linear Regression"""
        print("\n" + "="*60)
        print("ENHANCED ML ANALYSIS: LOGISTIC REGRESSION, RANDOM FOREST & LINEAR REGRESSION")
        print("="*60)

        for market in self.markets:
            if market in self.data and len(self.data[market]) > 0:
                print(f"\n{'-'*40}")
                print(f"ML MODELS FOR {market.upper()}")
                print(f"{'-'*40}")

                df = self.data[market].copy()

                if len(df) < 50:
                    print(f"Insufficient data for {market} (<50 records) - skipping ML.")
                    continue

                if 'Modal Price (Rs./Quintal)' not in df.columns or 'Arrivals (Tonnes)' not in df.columns:
                    print(f"Required columns missing in {market} - skipping ML.")
                    continue

                # Feature engineering (common for both classification and regression)
                df = df.sort_values('Reported Date')
                df['Price_Change'] = df['Modal Price (Rs./Quintal)'].pct_change()
                df['Price_Up'] = (df['Price_Change'] > 0).astype(int)

                # Create features
                df['Arrivals_Lag1'] = df['Arrivals (Tonnes)'].shift(1)
                df['Price_Lag1'] = df['Modal Price (Rs./Quintal)'].shift(1)
                df['Price_Lag2'] = df['Modal Price (Rs./Quintal)'].shift(2)
                df['Arrivals_MA3'] = df['Arrivals (Tonnes)'].rolling(window=3).mean()
                df['Price_MA3'] = df['Modal Price (Rs./Quintal)'].rolling(window=3).mean()
                df['Price_Volatility'] = df['Modal Price (Rs./Quintal)'].rolling(window=5).std()
                df['Month'] = df['Reported Date'].dt.month
                df['Quarter'] = df['Reported Date'].dt.quarter

                df_clean = df.dropna()

                if len(df_clean) < 30:
                    print(f"Insufficient clean data for {market} - skipping ML.")
                    continue

                # Prepare features
                feature_cols = ['Arrivals (Tonnes)', 'Arrivals_Lag1', 'Price_Lag1', 'Price_Lag2',
                               'Arrivals_MA3', 'Price_MA3', 'Price_Volatility', 'Month', 'Quarter']

                X = df_clean[feature_cols].copy()
                # For classification
                y_class = df_clean['Price_Up']
                # For regression
                y_reg = df_clean['Modal Price (Rs./Quintal)']

                # Split data (same split for both)
                X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg = train_test_split(
                    X, y_class, y_reg, test_size=0.3, random_state=42, stratify=y_class
                )

                # Scale features for models that need it
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

                # Logistic Regression (Classification)
                print("🔵 Logistic Regression (Classification)")
                log_reg = LogisticRegression(random_state=42, max_iter=1000)
                log_reg.fit(X_train_scaled, y_train_class)

                y_pred_log = log_reg.predict(X_test_scaled)
                log_accuracy = accuracy_score(y_test_class, y_pred_log)
                log_cv_scores = cross_val_score(log_reg, X_train_scaled, y_train_class, cv=5)

                self.results['ml_models']['logistic_regression'][market] = {
                    'model': log_reg,
                    'scaler': scaler,
                    'accuracy': log_accuracy,
                    'cv_mean': log_cv_scores.mean(),
                    'cv_std': log_cv_scores.std(),
                    'coefficients': log_reg.coef_[0],
                    'feature_names': feature_cols,
                    'task': 'classification'
                }

                print(f"  Accuracy: {log_accuracy:.4f}")

                # Random Forest (Classification)
                print("🌲 Random Forest (Classification)")
                rf_model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
                rf_model.fit(X_train, y_train_class)

                y_pred_rf = rf_model.predict(X_test)
                rf_accuracy = accuracy_score(y_test_class, y_pred_rf)
                rf_cv_scores = cross_val_score(rf_model, X_train, y_train_class, cv=5)

                self.results['ml_models']['random_forest'][market] = {
                    'model': rf_model,
                    'accuracy': rf_accuracy,
                    'cv_mean': rf_cv_scores.mean(),
                    'cv_std': rf_cv_scores.std(),
                    'feature_importance': rf_model.feature_importances_,
                    'feature_names': feature_cols,
                    'task': 'classification'
                }

                print(f"  Accuracy: {rf_accuracy:.4f}")

                # Linear Regression (Regression)
                print("📈 Linear Regression (Regression)")
                lin_reg = LinearRegression()
                lin_reg.fit(X_train, y_train_reg)  # No scaling for linear reg, but could add if needed

                y_pred_lin = lin_reg.predict(X_test)
                lin_r2 = r2_score(y_test_reg, y_pred_lin)
                lin_cv_scores = cross_val_score(lin_reg, X_train, y_train_reg, cv=5, scoring='r2')

                self.results['ml_models']['linear_regression'][market] = {
                    'model': lin_reg,
                    'r2_score': lin_r2,
                    'cv_mean': lin_cv_scores.mean(),
                    'cv_std': lin_cv_scores.std(),
                    'coefficients': lin_reg.coef_,
                    'intercept': lin_reg.intercept_,
                    'feature_names': feature_cols,
                    'task': 'regression'
                }

                print(f"  R² Score: {lin_r2:.4f}")

                # Model comparison for classification
                class_comp = [
                    ('Logistic Regression', log_accuracy, log_cv_scores.mean()),
                    ('Random Forest', rf_accuracy, rf_cv_scores.mean())
                ]
                class_comp.sort(key=lambda x: x[1], reverse=True)

                self.results['model_comparisons'][market] = {
                    'ranking': class_comp,
                    'best_model': class_comp[0][0],
                    'best_accuracy': class_comp[0][1]
                }

                # Model comparison for regression (just linear for now)
                reg_comp = [
                    ('Linear Regression', lin_r2, lin_cv_scores.mean())
                ]

                self.results['regression_comparisons'][market] = {
                    'ranking': reg_comp,
                    'best_model': reg_comp[0][0],
                    'best_r2': reg_comp[0][1]
                }

                print(f"  Best Classification Model: {class_comp[0][0]} ({class_comp[0][1]:.4f})")
                print(f"  Best Regression Model: {reg_comp[0][0]} (R²: {reg_comp[0][1]:.4f})")
            else:
                print(f"{market}: No data - skipping ML.")

    def save_enhanced_results(self):
        """Write the report, a JSON summary and a pickle of all results.

        Files created in the current working directory:

        * ``enhanced_soybean_analysis_report.txt`` - text returned by
          ``generate_comprehensive_report``;
        * ``enhanced_analysis_results.json`` - JSON-friendly view of
          ``self.results`` (fitted model objects are left out);
        * ``enhanced_analysis_results.pkl`` - the complete ``self.results``
          dictionary, including the fitted models.

        Text files are written as UTF-8 because the report contains characters
        such as the rupee sign that platform default encodings may not be able
        to represent.
        """
        import pickle
        import json

        def _json_default(obj):
            """Serialise numpy/pandas containers as lists; anything else as text."""
            if hasattr(obj, 'tolist'):
                return obj.tolist()
            return str(obj)

        def _plain(value):
            """Convert array-like values to plain Python; leave the rest as is."""
            return value.tolist() if hasattr(value, 'tolist') else value

        # Generate comprehensive report
        report = self.generate_comprehensive_report()
        with open('enhanced_soybean_analysis_report.txt', 'w', encoding='utf-8') as f:
            f.write(report)

        # Metric fields exported per model family (model objects are skipped).
        ml_fields = {
            'logistic_regression': ('accuracy', 'cv_mean', 'cv_std', 'coefficients'),
            'random_forest': ('accuracy', 'cv_mean', 'cv_std', 'feature_importance'),
            'linear_regression': ('r2_score', 'cv_mean', 'cv_std', 'coefficients', 'intercept'),
        }

        json_results = {}
        for key, value in self.results.items():
            if key == 'arima_models':
                json_results[key] = {}
                for market, model_info in value.items():
                    forecast = model_info['forecast']
                    json_results[key][market] = {
                        'best_params': model_info['best_params'],
                        'aic': model_info['aic'],
                        'bic': model_info['bic'],
                        'log_likelihood': model_info['log_likelihood'],
                        'parameters_count': model_info['parameters_count'],
                        'forecast': forecast.tolist() if hasattr(forecast, 'tolist') else [],
                        'models_tested': [
                            {
                                'order': m['order'],
                                'AIC': m['AIC'],
                                'BIC': m['BIC'],
                                'log_likelihood': m['log_likelihood'],
                                'parameters': m['parameters']
                            } for m in model_info.get('models_tested', [])
                        ]
                    }
            elif key == 'ml_models':
                json_results[key] = {}
                for model_type, markets in value.items():
                    json_results[key][model_type] = {}
                    for market, model_info in markets.items():
                        if model_info is None:
                            continue
                        entry = {
                            'feature_names': model_info['feature_names'],
                            'task': model_info['task']
                        }
                        for field in ml_fields.get(model_type, ()):
                            entry[field] = _plain(model_info[field])
                        json_results[key][model_type][market] = entry
            else:
                json_results[key] = value

        # Add comparisons to JSON
        json_results['model_comparisons'] = self.results['model_comparisons']
        json_results['regression_comparisons'] = self.results['regression_comparisons']

        with open('enhanced_analysis_results.json', 'w', encoding='utf-8') as f:
            # Arrays (e.g. the Johansen statistics) are written as real JSON
            # lists instead of their lossy printed representation.
            json.dump(json_results, f, indent=2, default=_json_default)

        # Save pickled results
        with open('enhanced_analysis_results.pkl', 'wb') as f:
            pickle.dump(self.results, f)

        print("✓ All enhanced results saved successfully")

    def generate_comprehensive_report(self):
        """Generate comprehensive report"""
        report = []
        report.append("="*80)
        report.append("ENHANCED SOYBEAN MARKET ANALYSIS REPORT")
        report.append("Multiple ML Models (Classification & Regression) and Comprehensive Cointegration Analysis")
        report.append("="*80)

        # Executive Summary
        report.append("\nEXECUTIVE SUMMARY")
        report.append("-" * 40)
        total_records = sum(len(self.data[market]) for market in self.data if len(self.data[market]) > 0)
        loaded_markets = [m for m in self.markets if len(self.data[m]) > 0]
        report.append(f"• Total Records: {total_records:,}")
        report.append(f"• Markets Loaded: {len(loaded_markets)} ({', '.join(loaded_markets)})")
        report.append(f"• ML Models: Logistic Regression, Random Forest (Classification); Linear Regression (Regression)")

        # Descriptive Statistics
        if self.results['descriptive_stats']:
            report.append("\n\nDESCRIPTIVE STATISTICS")
            report.append("-" * 40)

            for market, stats in self.results['descriptive_stats'].items():
                report.append(f"\n{market}:")
                report.append(f"  • Records: {stats['Count']:,}")
                report.append(f"  • Avg Price: ₹{stats['Mean_Price']:.2f}")
                report.append(f"  • Volatility (CV): {stats['CV_Price']:.1f}%")
                report.append(f"  • Skewness: {stats['Skewness_Price']:.3f}")

        # Cointegration Results
        if 'cointegration_tables' in self.results:
            coint_tables = self.results['cointegration_tables']
            summary = coint_tables['summary_stats']

            report.append("\n\nCOINTEGRATION ANALYSIS")
            report.append("-" * 40)
            report.append(f"• Markets: {', '.join(summary['Markets_Analyzed'])}")
            report.append(f"• Variables: {summary['Number_of_Variables']}")
            report.append(f"• Cointegrating Relations: {summary['Number_of_Cointegrating_Relations']}")

            interpretation = coint_tables['interpretation']
            report.append(f"• Conclusion: {interpretation['conclusion']}")
            report.append(f"• Economic Meaning: {interpretation['meaning']}")

            # Trace Table
            report.append("\nTrace Test Table:")
            report.append("Null Hypothesis | Alternative | Trace Statistic | Critical Value (5%) | Result")
            report.append("-" * 80)
            for row in coint_tables['trace_table']:
                report.append(f"{row['Null_Hypothesis']} | {row['Alternative']} | {row['Trace_Statistic']:.2f} | {row['Critical_Value_5']:.2f} | {row['Result_5']}")

        # ARIMA Results
        if self.results['arima_models']:
            report.append("\n\nARIMA FORECASTING MODELS")
            report.append("-" * 40)

            for market, model_info in self.results['arima_models'].items():
                report.append(f"\n{market}:")
                report.append(f"  • Model: ARIMA{model_info['best_params']}")
                report.append(f"  • AIC: {model_info['aic']:.2f}")
                report.append(f"  • Parameters: {model_info['parameters_count']}")

                if market in self.results['arima_explanations']:
                    explanation = self.results['arima_explanations'][market]
                    report.append(f"  • Complexity: {explanation['model_complexity']}")
                    report.append(f"  • Models Tested: {explanation['total_models_tested']}")

        # ML Results (Classification)
        if self.results['ml_models']:
            report.append("\n\nMACHINE LEARNING MODELS - CLASSIFICATION")
            report.append("-" * 40)

            for market in self.markets:
                if market in self.results['model_comparisons']:
                    comp = self.results['model_comparisons'][market]
                    report.append(f"\n{market}:")
                    report.append(f"  • Best Model: {comp['best_model']}")
                    report.append(f"  • Accuracy: {comp['best_accuracy']:.4f}")

                    for model_name, accuracy, cv_score in comp['ranking']:
                        report.append(f"    - {model_name}: {accuracy:.4f} (CV: {cv_score:.4f})")

        # ML Results (Regression)
        report.append("\n\nMACHINE LEARNING MODELS - REGRESSION")
        report.append("-" * 40)

        for market in self.markets:
            if market in self.results['regression_comparisons']:
                comp = self.results['regression_comparisons'][market]
                report.append(f"\n{market}:")
                report.append(f"  • Best Model: {comp['best_model']}")
                report.append(f"  • R² Score: {comp['best_r2']:.4f}")

                for model_name, r2, cv_score in comp['ranking']:
                    report.append(f"    - {model_name}: {r2:.4f} (CV: {cv_score:.4f})")

        # Recommendations
        report.append("\n\nKEY FINDINGS & RECOMMENDATIONS")
        report.append("-" * 40)

        if self.results['descriptive_stats']:
            best_market = max(self.results['descriptive_stats'].items(), key=lambda x: x[1]['Mean_Price'])[0]
            most_stable = min(self.results['descriptive_stats'].items(), key=lambda x: x[1]['CV_Price'])[0]

            report.append(f"• Highest Prices: {best_market}")
            report.append(f"• Most Stable Market: {most_stable}")

        if self.results['arima_models']:
            best_arima = min(self.results['arima_models'].items(), key=lambda x: x[1]['aic'])
            report.append(f"• Best ARIMA: {best_arima[0]} - ARIMA{best_arima[1]['best_params']} (AIC: {best_arima[1]['aic']:.2f})")

        if self.results['model_comparisons']:
            accuracies = [comp['best_accuracy'] for comp in self.results['model_comparisons'].values() if comp]
            if accuracies:
                avg_accuracy = np.mean(accuracies)
                report.append(f"• Average Classification Accuracy: {avg_accuracy:.1%}")

        if self.results['regression_comparisons']:
            r2s = [comp['best_r2'] for comp in self.results['regression_comparisons'].values() if comp]
            if r2s:
                avg_r2 = np.mean(r2s)
                report.append(f"• Average Regression R²: {avg_r2:.3f}")

        report.append("\nStrategic Recommendations:")
        report.append("• Use ARIMA for medium-term forecasting")
        report.append("• Apply classification ML models for daily direction predictions")
        report.append("• Use Linear Regression for price level predictions")
        report.append("• Consider market integration in strategies")
        report.append("• Implement risk management based on volatility")

        return "\n".join(report)

    def run_complete_analysis(self):
        """Execute the full analysis workflow and return the results dict.

        Calls ``load_real_data`` first. If none of the entries in
        ``self.data`` has any rows, a warning is printed and
        ``self.results`` is returned without running any analysis stage.
        Otherwise the descriptive, cointegration, ARIMA and ML stages are
        called in that order, followed by ``save_enhanced_results``.

        Returns:
            The ``self.results`` attribute.
        """
        rule = "=" * 80
        print("🚀 Starting Enhanced Soybean Market Analysis")
        print(rule)

        # Load the market data; self.data is inspected right afterwards
        self.load_real_data()

        # Bail out early when no dataset contains a single row
        row_counts = [len(frame) for frame in self.data.values()]
        if not any(count > 0 for count in row_counts):
            print("❌ No data loaded! Please check file paths and formats.")
            return self.results

        # Analysis stages, executed strictly in this order
        self.objective_1_descriptive_analysis()
        self.objective_2_comprehensive_cointegration_analysis()
        self.objective_3_enhanced_arima_forecasting()
        self.enhanced_ml_analysis()

        # Hand the accumulated results to the save step
        self.save_enhanced_results()

        # Closing banner plus the list of output file names
        closing_lines = [
            "\n" + rule,
            "✅ ENHANCED ANALYSIS COMPLETE!",
            rule,
            "\nGenerated Files:",
            "• enhanced_soybean_analysis_report.txt",
            "• enhanced_analysis_results.json",
            "• enhanced_analysis_results.pkl",
        ]
        print("\n".join(closing_lines))

        return self.results


def main():
    """Build the analyzer, run the whole pipeline, and hand both back.

    Returns:
        tuple: ``(analyzer, results)`` — the analyzer instance and the value
        returned by its ``run_complete_analysis`` call.
    """
    soybean_analyzer = SimplifiedEnhancedSoybeanAnalysis()
    # Tuple elements evaluate left to right, so the instance exists before
    # the pipeline runs on it.
    return soybean_analyzer, soybean_analyzer.run_complete_analysis()


# Script entry point: run the pipeline only when executed directly (or as a
# notebook cell), and keep the analyzer and its results bound at module level
# so they remain available for inspection afterwards.
if __name__ == "__main__":
    analyzer, results = main()

🚀 Starting Enhanced Soybean Market Analysis
Loading real data from Excel files in /content/...
✓ Loaded Haveri data from /content/haveri.xlsx: 841 Soyabeen records (date range: 2017-01-02 to 2025-05-28)
✓ Loaded Kalagategi data from /content/kalagategi.xlsx: 1137 Soyabeen records (date range: 2017-01-01 to 2025-06-30)
✓ Loaded Bidar data from /content/Bidar.xlsx: 1797 Soyabeen records (date range: 2017-01-02 to 2025-02-13)
✓ Loaded Kalaburgi data from /content/kalaburgi.xlsx: 1596 Soyabeen records (date range: 2017-01-02 to 2025-04-15)
✓ Loaded Bailhongal data from /content/bailhongal.xlsx: 504 Soyabeen records (date range: 2017-07-06 to 2025-03-01)

OBJECTIVE 1: ENHANCED DESCRIPTIVE STATISTICS

Haveri Market Summary:
  Records: 841
  Price (Rs/Qt): 4183.79 ± 1364.51
  CV: 32.6%
  Skewness: 1.031

Kalagategi Market Summary:
  Records: 1,137
  Price (Rs/Qt): 4195.18 ± 1399.86
  CV: 33.4%
  Skewness: 1.162

Bidar Market Summary:
  Records: 1,797
  Price (Rs/Qt): 4287.94 ± 1281.96
  CV: 2


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g

✓ Best Model: ARIMA(2, 1, 2), AIC: 4289.70

----------------------------------------
ARIMA MODELING FOR KALAGATEGI
----------------------------------------



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g

✓ Best Model: ARIMA(2, 1, 2), AIC: 4259.05

----------------------------------------
ARIMA MODELING FOR BIDAR
----------------------------------------



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g

✓ Best Model: ARIMA(2, 1, 2), AIC: 5376.28

----------------------------------------
ARIMA MODELING FOR KALABURGI
----------------------------------------



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g

✓ Best Model: ARIMA(2, 1, 3), AIC: 5576.32

----------------------------------------
ARIMA MODELING FOR BAILHONGAL
----------------------------------------



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g

✓ Best Model: ARIMA(3, 1, 3), AIC: 1817.48

ENHANCED ML ANALYSIS: LOGISTIC REGRESSION, RANDOM FOREST & LINEAR REGRESSION

----------------------------------------
ML MODELS FOR HAVERI
----------------------------------------
🔵 Logistic Regression (Classification)
  Accuracy: 0.7460
🌲 Random Forest (Classification)
  Accuracy: 0.6230
📈 Linear Regression (Regression)
  R² Score: 1.0000
  Best Classification Model: Logistic Regression (0.7460)
  Best Regression Model: Linear Regression (R²: 1.0000)

----------------------------------------
ML MODELS FOR KALAGATEGI
----------------------------------------
🔵 Logistic Regression (Classification)
  Accuracy: 0.6529
🌲 Random Forest (Classification)
  Accuracy: 0.5853
📈 Linear Regression (Regression)
  R² Score: 1.0000
  Best Classification Model: Logistic Regression (0.6529)
  Best Regression Model: Linear Regression (R²: 1.0000)

----------------------------------------
ML MODELS FOR BIDAR
----------------------------------------
🔵 Logistic R