In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import torch # Imported as TabNet is built on PyTorch

# --- 1. Load the Dataset ---
# "Dataset 1.csv" is expected in the notebook's working directory.
try:
    df = pd.read_csv("Dataset 1.csv")
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): in a notebook, exit() ends the kernel
    # session rather than reporting the problem with a normal traceback.
    raise FileNotFoundError(
        "Error: 'Dataset 1.csv' not found. Please ensure the file is in the correct path."
    ) from err

print("Dataset loaded successfully.")
print("\nFirst 5 rows of the dataset:")
print(df.head())

# --- 2. Separate Features (X) and Target (y) ---
# 'Crop' is the target column; every other column is used as a feature.
X = df.drop('Crop', axis=1)
y = df['Crop']

# --- 3. Preprocessing the Target Variable (Label Encoding) ---
# The string crop names are mapped to integer class ids; `class_names[i]` is
# the original label for id `i`.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
class_names = label_encoder.classes_

print(f"\nOriginal classes: {class_names}")
print(f"Total number of features: {X.shape[1]}")
print(f"Total number of samples: {X.shape[0]}")

# --- 4. Split Data into Training and Testing Sets ---
# 80% train / 20% test, stratified on the target, fixed random_state for reproducibility.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Impute missing feature values AFTER the split, using medians learned from the
# training rows only, so that no statistic of the test rows leaks into training.
if X.isnull().any().any():
    print("\nWarning: Missing values detected. Filling with median for simplicity.")
    train_medians = X_train_raw.median()
    X_train_raw = X_train_raw.fillna(train_medians)
    X_test_raw = X_test_raw.fillna(train_medians)

# --- 5. Feature Scaling (Crucial for Neural Networks like TabNet) ---
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Convert data to NumPy arrays with float32 features / int64 targets for TabNet
X_train = X_train_scaled.astype(np.float32)
X_test = X_test_scaled.astype(np.float32)
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)

print(f"\nTraining set size: {X_train.shape[0]} samples (Scaled and converted to float32)")
print(f"Test set size: {X_test.shape[0]} samples (Scaled and converted to float32)")


# --- 6. Initialize and Train the TabNet Model ---
# TabNetClassifier is used for classification tasks.
model = TabNetClassifier(
    n_steps=3,             # Number of steps in the architecture (boosted trees analogy)
    gamma=1.5,             # Multiplicative factor for attention mechanism
    n_d=8,                 # Dimension of the prediction layer (width)
    n_a=8,                 # Dimension of the attention layer (width)
    seed=42,
    verbose=0              # Set to 1 to see epoch-by-epoch training logs
)

# Hold out 10% of the training data as the validation set that drives early stopping;
# the test set is never seen during fitting.
X_train_fit, X_val, y_train_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

print("\nStarting TabNet model training with early stopping...")
model.fit(
    X_train_fit, y_train_fit,
    eval_set=[(X_val, y_val)],
    eval_metric=['accuracy'],
    max_epochs=100,             # Upper bound on the number of epochs
    patience=10,                # Stop if the validation metric does not improve for 10 epochs
    batch_size=1024,
    virtual_batch_size=128      # 1024 is a multiple of 128
)
print("Model training complete.")

# --- 7. Model Prediction and Evaluation ---

# Predict on the test set. TabNet's predict returns the class index.
y_pred_encoded = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_encoded)
print("\n--- Model Evaluation Results ---")
print(f"Accuracy on Test Set: {accuracy * 100:.2f}%")

# Generate a detailed classification report.
# `labels` is passed explicitly so that `target_names` always lines up with the
# encoded class ids, even if some class is absent from the test split/predictions
# (without it, classification_report raises on a length mismatch).
report = classification_report(
    y_test,
    y_pred_encoded,
    labels=list(range(len(class_names))),
    target_names=class_names, # Use the original class names for readability
    zero_division=0
)

print("\nClassification Report:")
print(report)

# --- Example: Making a Single Prediction ---
# The model must be fed SCALED features, so the sample is taken from X_test
# (already scaled, float32) and reshaped to a batch of one row.
sample_data = X_test[0, :]
sample_data_input = sample_data.reshape(1, -1)

single_prediction_encoded = model.predict(sample_data_input)[0]
single_prediction_label = label_encoder.inverse_transform([single_prediction_encoded])[0]
true_label = label_encoder.inverse_transform([y_test[0]])[0]

print("\n--- Single Prediction Example ---")
# To show the original (unscaled) input values, we use X_test_raw
print(f"Input Features (Unscaled):\n{X_test_raw.iloc[0].to_dict()}")
print(f"Predicted Crop: {single_prediction_label}")
print(f"True Crop (for comparison): {true_label}")


Dataset loaded successfully.

First 5 rows of the dataset:
   Temperature   Humidity    Rainfall        PH  Nitrogen  Phosphorous  \
0    20.879744  82.002744  202.935536  6.502985     69.30     79.50000   
1    21.770462  80.319644  226.655537  7.038096     72.02    141.82400   
2    23.004459  82.320763  263.964248  7.633568     77.77     59.39000   
3    26.491096  80.283629  242.864034  6.980401     78.65    147.45895   
4    20.280071  81.604873  262.717340  7.628473     73.98     68.95000   

   Potassium  Crop  
0    94.4400  rice  
1   141.6978  rice  
2    81.8900  rice  
3   142.9430  rice  
4    95.7400  rice  

Original classes: ['Adzuki Beans' 'Black gram' 'Chickpea' 'Coconut' 'Coffee' 'Cotton'
 'Ground Nut' 'Jute' 'Kidney Beans' 'Lentil' 'Moth Beans' 'Mung Bean'
 'Peas' 'Pigeon Peas' 'Rubber' 'Sugarcane' 'Tea' 'Tobacco' 'apple'
 'banana' 'grapes' 'maize' 'mango' 'millet' 'muskmelon' 'orange' 'papaya'
 'pomegranate' 'rice' 'watermelon' 'wheat']
Total number of features: 7




In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier # Swapped XGBoost for CatBoost
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# --- 1. Load the Dataset ---
# "Dataset 1.csv" is expected in the notebook's working directory.
try:
    df = pd.read_csv("Dataset 1.csv")
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): in a notebook, exit() ends the kernel
    # session rather than reporting the problem with a normal traceback.
    raise FileNotFoundError(
        "Error: 'Dataset 1.csv' not found. Please ensure the file is in the correct path."
    ) from err

print("Dataset loaded successfully.")
print("\nFirst 5 rows of the dataset:")
print(df.head())

# --- 2. Separate Features (X) and Target (y) ---
# 'Crop' is the target column; every other column is used as a feature.
X = df.drop('Crop', axis=1)
y = df['Crop']

# --- 3. Preprocessing the Target Variable (Label Encoding) ---
# The string crop names are mapped to integer class ids; `class_names[i]` is
# the original label for id `i`.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
class_names = label_encoder.classes_

print(f"\nOriginal classes: {class_names}")
print(f"Total number of features: {X.shape[1]}")
print(f"Total number of samples: {X.shape[0]}")

# --- 4. Split Data into Training and Testing Sets ---
# 80% train / 20% test, stratified on the target, fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Impute missing feature values AFTER the split, using medians learned from the
# training rows only, so that no statistic of the test rows leaks into training.
if X.isnull().any().any():
    print("\nWarning: Missing values detected. Filling with median for simplicity.")
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# --- 5. Initialize and Train the CatBoost Model ---
# CatBoostClassifier is used for classification tasks.
model = CatBoostClassifier(
    iterations=100,             # Number of boosting iterations (trees)
    learning_rate=0.1,
    loss_function='MultiClass', # Multi-class objective
    random_seed=42,
    verbose=0,                  # Suppress training output for cleaner execution
    allow_writing_files=False   # Prevent CatBoost from writing auxiliary files
)

print("\nStarting CatBoost model training...")
# Note: the features are passed unscaled; no scaler is applied in this cell.
model.fit(X_train, y_train)
print("Model training complete.")

# --- 6. Model Prediction and Evaluation ---

# Predict on the test set.
# The prediction array is flattened to 1D so it lines up with y_test.
y_pred_encoded = model.predict(X_test)
y_pred_encoded = y_pred_encoded.flatten()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_encoded)
print("\n--- Model Evaluation Results ---")
print(f"Accuracy on Test Set: {accuracy * 100:.2f}%")

# Generate a detailed classification report.
# `labels` is passed explicitly so that `target_names` always lines up with the
# encoded class ids, even if some class is absent from the test split/predictions
# (without it, classification_report raises on a length mismatch).
report = classification_report(
    y_test,
    y_pred_encoded,
    labels=list(range(len(class_names))),
    target_names=class_names, # Use the original class names for readability
    zero_division=0
)

print("\nClassification Report:")
print(report)

# --- Example: Making a Single Prediction ---
# Use the first row of the test split as a stand-in for "new" data.
sample_data = X_test.iloc[0]
sample_data_df = pd.DataFrame([sample_data], columns=X.columns)

# The prediction for a one-row frame is indexed as [row][0] to get the scalar class id.
single_prediction_encoded = model.predict(sample_data_df)[0][0]
single_prediction_label = label_encoder.inverse_transform([single_prediction_encoded])[0]
true_label = label_encoder.inverse_transform([y_test[0]])[0]

print("\n--- Single Prediction Example ---")
print(f"Input Features:\n{sample_data.to_dict()}")
print(f"Predicted Crop: {single_prediction_label}")
print(f"True Crop (for comparison): {true_label}")

# --- Feature Importance Visualization (Text-based) ---
# Rank the features by the importance scores reported by the fitted model,
# highest first, and print at most the top five.
feature_importances = model.get_feature_importance()
feature_names = X.columns
sorted_idx = np.argsort(feature_importances)[::-1]

print("\n--- Feature Importances (Top 5) ---")
for i in range(min(5, len(feature_names))):
    feature = feature_names[sorted_idx[i]]
    importance = feature_importances[sorted_idx[i]]
    print(f"  {i+1}. {feature}: {importance:.4f}")


ModuleNotFoundError: No module named 'catboost'

In [None]:
pip install catboost