In [21]:
# %pip installs into the environment of the running kernel; `!pip` runs whichever
# pip is first on PATH, which may belong to a different Python.
%pip install xgboost



In [22]:
# %pip installs into the environment of the running kernel; `!pip` runs whichever
# pip is first on PATH, which may belong to a different Python.
%pip install tensorflow



In [23]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [24]:
# Load the pivoted crop dataset
df = pd.read_csv(
    filepath_or_buffer="../data/processed/crop_data_pivot.csv",
)

# Create the model output directory if it is not there yet
os.makedirs(name="../models", exist_ok=True)

# Target columns: each "Element" is modelled separately
elements = [
    "Production",
    "Area harvested",
    "Yield",
]

# Feature columns are not fixed here: the training loop builds them per target.
# (An earlier draft used a single fixed feature list containing only "Year".)

In [25]:
# Quick schema check: list every column name of the loaded frame
print(list(df.columns))

['Area', 'Item', 'Year', 'Area harvested', 'Production', 'Yield']


Model Definitions

In [26]:
# Disabled draft of get_models(): the whole definition is wrapped in a bare
# triple-quoted string, so nothing is defined by this cell — it only echoes the
# string back as its output.
# NOTE(review): the draft reads `feature_cols` when called; the module-level
# assignment in the setup cell is commented out, so that name would have to exist
# before this could be re-enabled. Delete the cell if the draft is no longer needed.
"""
def get_models():
    return {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
        "ANN": Sequential([
            Dense(64, input_dim=len(feature_cols), activation='relu'),
            Dense(32, activation='relu'),
            Dense(1)
        ])
    }
    """

'\ndef get_models():\n    return {\n        "LinearRegression": LinearRegression(),\n        "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),\n        "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),\n        "ANN": Sequential([\n            Dense(64, input_dim=len(feature_cols), activation=\'relu\'),\n            Dense(32, activation=\'relu\'),\n            Dense(1)\n        ])\n    }\n    '

In [27]:
# Helper function: train ANN (first draft, disabled).
# The definition below is a bare string literal, so this cell does not create any
# function; it only echoes the string as its output. The opening delimiter has four
# quote characters, which is why the echoed string starts with a stray `"`.
# The active train_ann is defined in a later cell.
""""
def train_ann(X_train, y_train, X_test, y_test, model_path):
    model = Sequential()
    model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss=MeanSquaredError())
    model.fit(X_train, y_train, epochs=50, batch_size=64, verbose=0)
    y_pred = model.predict(X_test).flatten()
    model.save(f"{model_path}.h5")
    return y_pred, model
    """

'"\ndef train_ann(X_train, y_train, X_test, y_test, model_path):\n    model = Sequential()\n    model.add(Dense(32, input_dim=X_train.shape[1], activation=\'relu\'))\n    model.add(Dense(16, activation=\'relu\'))\n    model.add(Dense(1))\n    model.compile(optimizer=\'adam\', loss=MeanSquaredError())\n    model.fit(X_train, y_train, epochs=50, batch_size=64, verbose=0)\n    y_pred = model.predict(X_test).flatten()\n    model.save(f"{model_path}.h5")\n    return y_pred, model\n    '

In [28]:

# Imports that the disabled draft below would need (commented out together with it).
#from tensorflow.keras.callbacks import EarlyStopping
#from tensorflow.keras.layers import Dropout, LeakyReLU

#from sklearn.preprocessing import PolynomialFeatures
# Second disabled draft of train_ann (Sequential model with Dropout and early
# stopping). It is a bare string literal, so this cell defines nothing and only
# echoes the string as output; the four-quote opener leaves a stray `"` at the start
# of that string. The active train_ann is defined in the next cell.
""""
def train_ann(X_train, y_train, X_test, y_test, model_path):
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.1))

    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.1))

    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.1))

    model.add(Dense(1))
    
    model.compile(optimizer='adam', loss=MeanSquaredError())
    
    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train, y_train, 
              validation_split=0.2, 
              epochs=50, 
              batch_size=64, 
              callbacks=[es], 
              verbose=0)
    
    y_pred = model.predict(X_test).flatten()
    model.save(f"{model_path}.h5")
    return y_pred, model
    """

'"\ndef train_ann(X_train, y_train, X_test, y_test, model_path):\n    model = Sequential()\n    model.add(Dense(64, input_dim=X_train.shape[1], activation=\'relu\'))\n    model.add(Dropout(0.1))\n\n    model.add(Dense(32, activation=\'relu\'))\n    model.add(Dropout(0.1))\n\n    model.add(Dense(16, activation=\'relu\'))\n    model.add(Dropout(0.1))\n\n    model.add(Dense(1))\n    \n    model.compile(optimizer=\'adam\', loss=MeanSquaredError())\n    \n    es = EarlyStopping(monitor=\'val_loss\', patience=10, restore_best_weights=True)\n\n    model.fit(X_train, y_train, \n              validation_split=0.2, \n              epochs=50, \n              batch_size=64, \n              callbacks=[es], \n              verbose=0)\n    \n    y_pred = model.predict(X_test).flatten()\n    model.save(f"{model_path}.h5")\n    return y_pred, model\n    '

In [29]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import MeanSquaredError

def train_ann(X_train, y_train, X_test, y_test, model_path,
              epochs=50, batch_size=64, validation_split=0.2,
              patience=10, dropout_rate=0.1):
    """Train a small feed-forward regression network, save it, and predict on X_test.

    Architecture: Dense(64) -> Dense(32) -> Dense(16) -> Dense(1), with ReLU on the
    hidden layers and a Dropout layer after each of them. The model is compiled
    with the Adam optimizer and mean-squared-error loss and trained with early
    stopping on ``val_loss`` (best weights are restored).

    Parameters
    ----------
    X_train : 2-D array-like
        Training features; ``X_train.shape[1]`` sets the input width.
    y_train : array-like
        Training targets.
    X_test : 2-D array-like
        Features to predict on once training has finished.
    y_test : array-like
        Unused; kept so existing callers do not have to change.
    model_path : str
        Output path *without* extension; the model is written to
        ``f"{model_path}.h5"``. Missing parent directories are created.
    epochs, batch_size : int
        Passed to ``model.fit``.
    validation_split : float
        Fraction of the training data held out by Keras for validation. Early
        stopping monitors ``val_loss``, so this should stay above 0.
    patience : int
        Early-stopping patience, in epochs.
    dropout_rate : float
        Rate used by every Dropout layer.

    The keyword defaults reproduce the values that used to be hard-coded.

    Returns
    -------
    tuple
        ``(y_pred, model)``: flattened predictions for ``X_test`` and the fitted
        Keras model.
    """
    # Make sure the target directory exists *before* training, so a missing folder
    # fails (or is fixed) immediately instead of after the whole fit.
    save_path = f"{model_path}.h5"
    parent_dir = os.path.dirname(save_path)
    if parent_dir:  # empty when model_path has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    inputs = Input(shape=(X_train.shape[1],))
    hidden = inputs
    for units in (64, 32, 16):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(dropout_rate)(hidden)
    output = Dense(1)(hidden)

    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss=MeanSquaredError())

    early_stop = EarlyStopping(monitor='val_loss', patience=patience,
                               restore_best_weights=True)

    model.fit(X_train, y_train,
              validation_split=validation_split,
              epochs=epochs,
              batch_size=batch_size,
              callbacks=[early_stop],
              verbose=0)

    y_pred = model.predict(X_test).flatten()

    # NOTE(review): the .h5 (HDF5) file name is kept unchanged because loaders
    # elsewhere presumably look for it — confirm before switching formats.
    model.save(save_path)
    return y_pred, model

Training Loop

In [None]:
# Train and persist every model type (ANN, Random Forest, Linear Regression,
# XGBoost) once per target element.
#
# NOTE(review): the setup cell creates "../models", but every artifact below is
# written under "models/" relative to the working directory. The write paths are
# left unchanged so existing loaders keep working — confirm which location is
# intended.
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)  # the save calls below do not create it

for element in elements:
    print(f"\n📌 Training models for: {element}")

    # Features for this target: the other two elements plus Year
    feature_cols = [col for col in elements if col != element] + ["Year"]

    # Drop rows where *either* the target OR any feature is missing
    data = df.dropna(subset=[element] + feature_cols).copy()
    X = data[feature_cols]
    y = data[element]

    # Split BEFORE scaling so the scaler never sees the hold-out rows
    # (fitting it on the full data leaks test-set statistics into training).
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Normalize features: fit on the training rows only, then apply to both parts
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # === Train ANN ===
    # train_ann saves the network itself; the scaler is stored next to it.
    print("🔹 Training ANN...")
    y_pred_ann, model_ann = train_ann(
        X_train, y_train, X_test, y_test, f"{MODEL_DIR}/{element}_ANN"
    )
    joblib.dump(scaler, f"{MODEL_DIR}/{element}_ANN_scaler.pkl", compress=3)

    # === Train Random Forest ===
    # Each scikit-learn / XGBoost model is bundled with its scaler in one pickle.
    print("🔹 Training Random Forest...")
    rf = RandomForestRegressor(n_estimators=50, random_state=42)
    rf.fit(X_train, y_train)
    joblib.dump({"model": rf, "scaler": scaler},
                f"{MODEL_DIR}/{element}_RandomForest.pkl", compress=3)

    # === Train Linear Regression ===
    print("🔹 Training Linear Regression...")
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    joblib.dump({"model": lr, "scaler": scaler},
                f"{MODEL_DIR}/{element}_LinearRegression.pkl")

    # === Train XGBoost ===
    print("🔹 Training XGBoost...")
    xg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
    # Alternative, more heavily tuned configuration (not active):
    #   xgb.XGBRegressor(objective="reg:squarederror", n_estimators=500,
    #                    learning_rate=0.05, max_depth=6, subsample=0.8,
    #                    colsample_bytree=0.8, random_state=42)
    xg.fit(X_train, y_train)
    joblib.dump({"model": xg, "scaler": scaler},
                f"{MODEL_DIR}/{element}_XGBoost.pkl")

    print("✅ All models for", element, "saved.")


📌 Training models for: Production
🔹 Training ANN...
[1m3599/3599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 474us/step




🔹 Training Random Forest...
🔹 Training Linear Regression...
🔹 Training XGBoost...
✅ All models for Production saved.

📌 Training models for: Area harvested
🔹 Training ANN...
[1m3599/3599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 502us/step




🔹 Training Random Forest...
🔹 Training Linear Regression...
🔹 Training XGBoost...
✅ All models for Area harvested saved.

📌 Training models for: Yield
🔹 Training ANN...
[1m3599/3599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 480us/step




🔹 Training Random Forest...
🔹 Training Linear Regression...
🔹 Training XGBoost...
✅ All models for Yield saved.
