In [None]:
# Cell 1: Import libraries
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
# Cell 2: Generate synthetic dataset
# The global NumPy RNG is seeded once; every draw below consumes that single
# stream, so the order of the np.random calls determines the generated data.
np.random.seed(42)

n_samples = 5000
crops = ['wheat', 'rice', 'maize', 'cotton', 'sugarcane', 'potato', 'tomato', 'onion']


def _pick_crop(soil_ph, soil_n, irrigation):
    """Draw a crop label for one field using the simplified agronomic rules.

    Exactly one ``np.random.choice`` call is made per invocation:
      * acidic soil (pH < 6.0)                    -> potato / tomato
      * no irrigation (and pH >= 6.0)             -> wheat / cotton
      * irrigated, alkaline soil (pH > 7.5)       -> any crop, uniformly
      * irrigated, pH in [6.0, 7.5], N > 60       -> rice / wheat / maize
      * irrigated, pH in [6.0, 7.5], N <= 60      -> cotton / sugarcane
    """
    if soil_ph < 6.0:
        return np.random.choice(['potato', 'tomato'], p=[0.5, 0.5])
    if irrigation == 0:
        return np.random.choice(['wheat', 'cotton'], p=[0.6, 0.4])
    if soil_ph > 7.5:
        return np.random.choice(crops)
    if soil_n > 60:
        return np.random.choice(['rice', 'wheat', 'maize'], p=[0.4, 0.3, 0.3])
    return np.random.choice(['cotton', 'sugarcane'], p=[0.6, 0.4])


def _draw_field():
    """Sample one synthetic field record (location, soil, area, irrigation, crop)."""
    # Location, drawn from a bounding box roughly covering India
    lat = np.random.uniform(8.0, 37.0)
    lon = np.random.uniform(68.0, 97.0)

    # Soil chemistry: pH plus N / P / K levels
    soil_ph = np.random.uniform(5.0, 8.5)
    soil_n = np.random.uniform(20.0, 100.0)
    soil_p = np.random.uniform(10.0, 60.0)
    soil_k = np.random.uniform(20.0, 80.0)

    # Field area
    area = np.random.uniform(0.5, 10.0)

    # Irrigation flag: 1 with probability 0.6, otherwise 0
    irrigation = np.random.choice([0, 1], p=[0.4, 0.6])

    # The crop is drawn last so the RNG stream is consumed in a fixed order.
    return {
        'latitude': lat,
        'longitude': lon,
        'area': area,
        'soil_ph': soil_ph,
        'soil_n': soil_n,
        'soil_p': soil_p,
        'soil_k': soil_k,
        'irrigation_available': irrigation,
        'crop': _pick_crop(soil_ph, soil_n, irrigation),
    }


data = [_draw_field() for _ in range(n_samples)]

df = pd.DataFrame(data)
print(f"Dataset shape: {df.shape}")
print(f"\nCrop distribution:\n{df['crop'].value_counts()}")


In [None]:
# Cell 3: Feature engineering
# Model inputs are every generated column except the target ('crop').
feature_columns = [
    'latitude',
    'longitude',
    'area',
    'soil_ph',
    'soil_n',
    'soil_p',
    'soil_k',
    'irrigation_available',
]
X = df[feature_columns]
y = df['crop']

# Hold out 20% of the rows for testing; stratifying on the target keeps the
# crop proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise features. The scaler learns its statistics from the training
# partition only and is then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


In [None]:
# Cell 4: Train XGBoost model
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric='mlogloss'
)

model.fit(
    X_train_scaled, y_train,
    eval_set=[(X_test_scaled, y_test)],
    verbose=False
)


In [None]:
# Cell 5: Evaluate model
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Cell 6: Save model and scaler
model_dir = Path("../models")
model_dir.mkdir(exist_ok=True)

joblib.dump(model, model_dir / "crop_recommender.pkl")
joblib.dump(scaler, model_dir / "scaler.pkl")

print(f"\nModel saved to {model_dir / 'crop_recommender.pkl'}")
print(f"Scaler saved to {model_dir / 'scaler.pkl'}")


In [None]:
# Cell 7: Test prediction
sample = np.array([[28.6, 77.2, 2.5, 7.0, 55.0, 35.0, 45.0, 1.0]])
sample_scaled = scaler.transform(sample)
prediction = model.predict(sample_scaled)
probabilities = model.predict_proba(sample_scaled)

print(f"\nSample prediction:")
print(f"Input: Latitude=28.6, Longitude=77.2, Area=2.5, pH=7.0, N=55, P=35, K=45, Irrigation=Yes")
print(f"Predicted crop: {prediction[0]}")
print(f"\nProbabilities:")
for i, crop in enumerate(model.classes_):
    print(f"  {crop}: {probabilities[0][i]:.4f}")


In [None]:
# Cell 8: Save dataset
# Write the generated table as CSV (without the index column) into the same
# directory as the saved model artifacts.
dataset_path = model_dir / "synthetic_dataset.csv"
df.to_csv(dataset_path, index=False)
print(f"\nDataset saved to {dataset_path}")
