# Model Training

In [1]:
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

def train_crop_recommendation_model():
    """
    Loads the farmer dataset, preprocesses the data, trains an XGBoost classifier,
    and evaluates its performance.
    """
    try:
        # 1. Load the Dataset
        print("Loading dataset...")
        # Use the correct path provided by the environment for uploaded files.
        df = pd.read_csv('/content/modelA_detailed_no_farmer_info.csv')
        print("Dataset loaded successfully.")
        print(f"Dataset shape: {df.shape}")
        print("-" * 30)

        # 2. Define Features (X) and Target (y)
        X = df.drop('recommended_crop', axis=1)
        y = df['recommended_crop']

        # 3. Encode the Target Variable
        # XGBoost requires the target variable to be integers (0, 1, 2, ...)
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        # Store the mapping from encoded label to crop name for later use
        crop_names = label_encoder.classes_
        num_classes = len(crop_names)
        print(f"Target variable 'recommended_crop' encoded into {num_classes} classes.")
        print("-" * 30)

        # 4. Identify Categorical and Numerical Features
        categorical_features = X.select_dtypes(include=['object']).columns
        numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

        print("Identified Feature Types:")
        print(f"Categorical features: {list(categorical_features)}")
        print(f"Numerical features: {list(numerical_features)}")
        print("-" * 30)

        # 5. Create a Preprocessing Pipeline for Features
        # We use a ColumnTransformer to apply different transformations to different columns.
        # - OneHotEncoder for categorical features to convert them into a numerical format.
        # - 'passthrough' for numerical features to leave them as they are.
        preprocessor = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
            ],
            remainder='passthrough'
        )

        # 6. Split Data into Training and Testing Sets
        print("Splitting data into training and testing sets (80/20 split)...")
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
        print(f"Training set size: {X_train.shape[0]} samples")
        print(f"Testing set size: {X_test.shape[0]} samples")
        print("-" * 30)

        # 7. Define the XGBoost Model and create the full pipeline
        # The pipeline will first preprocess the data and then feed it to the classifier.
        model = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', xgb.XGBClassifier(
                objective='multi:softmax',  # Objective for multi-class classification
                num_class=num_classes,      # Specify the number of unique crops
                eval_metric='mlogloss',     # Logarithmic loss metric for evaluation
                use_label_encoder=False,    # Suppress deprecation warning
                random_state=42
            ))
        ])

        # 8. Train the Model
        print("Training the XGBoost model... This may take a moment.")
        model.fit(X_train, y_train)
        print("Model training completed.")
        print("-" * 30)

        # 9. Evaluate the Model
        print("Evaluating the model on the test set...")
        y_pred = model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\nModel Accuracy: {accuracy:.4f} ({accuracy:.2%})")

        # Display a detailed classification report
        print("\nClassification Report:")
        # We use the original string labels for better readability in the report
        report = classification_report(
            label_encoder.inverse_transform(y_test),
            label_encoder.inverse_transform(y_pred)
        )
        print(report)

        # 10. Save the Trained Model and Label Encoder
        print("-" * 30)
        print("Saving the trained model and label encoder...")
        model_filename = 'crop_recommendation_model.joblib'
        label_encoder_filename = 'crop_label_encoder.joblib'

        joblib.dump(model, model_filename)
        joblib.dump(label_encoder, label_encoder_filename)

        print(f"Model saved successfully to '{model_filename}'")
        print(f"Label encoder saved successfully to '{label_encoder_filename}'")
        print("-" * 30)

    except FileNotFoundError:
        print("Error: The file 'modelA_detailed_no_farmer_info.csv' was not found.")
        print("Please ensure the file is uploaded correctly before running the script.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == '__main__':
    train_crop_recommendation_model()



Loading dataset...
Dataset loaded successfully.
Dataset shape: (6328, 14)
------------------------------
Target variable 'recommended_crop' encoded into 16 classes.
------------------------------
Identified Feature Types:
Categorical features: ['fertilizer_affordability', 'loan_access', 'soil_type', 'region', 'irrigation', 'financial_goal']
Numerical features: ['farming_experience', 'soil_ph', 'rainfall', 'temperature', 'market_distance', 'income', 'land_size']
------------------------------
Splitting data into training and testing sets (80/20 split)...
Training set size: 5062 samples
Testing set size: 1266 samples
------------------------------
Training the XGBoost model... This may take a moment.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model training completed.
------------------------------
Evaluating the model on the test set...

Model Accuracy: 0.1130 (11.30%)

Classification Report:
              precision    recall  f1-score   support

      coffee       0.00      0.00      0.00        27
      cotton       0.14      0.20      0.17       145
   groundnut       0.08      0.07      0.08        95
        jute       0.26      0.14      0.18        50
       maize       0.13      0.13      0.13        55
     millets       0.16      0.16      0.16       122
      pulses       0.11      0.10      0.11       125
        rice       0.10      0.10      0.10        97
     sorghum       0.07      0.05      0.06        41
     soybean       0.13      0.13      0.13        86
      spices       0.07      0.05      0.06        56
   sugarcane       0.09      0.12      0.11       139
         tea       0.19      0.13      0.16        52
  vegetables       0.02      0.01      0.02        67
  watermelon       0.00      0.00  