In [1]:
# === Imports ===
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# ANN: TensorFlow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import MeanSquaredError

2025-07-19 00:18:10.584784: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-19 00:18:16.255219: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-19 00:18:18.834037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752884303.469193    5535 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752884304.570964    5535 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752884313.218295    5535 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [3]:
# === Create directories ===
os.makedirs("../models/improve", exist_ok=True)

In [4]:
# === Load cleaned & log-transformed data ===
#df = pd.read_csv("../data/improve/crop_data_pivot_log.csv")

url = "https://huggingface.co/datasets/syazayacob/crop_data_pivot_log/resolve/main/crop_data_pivot_log.csv"
df = pd.read_csv(url)

In [5]:
# === Define target variables ===
targets = ["Production", "Area harvested", "Yield"]

In [6]:
# === Define ANN training function ===
def train_ann(X_train, y_train, X_test, y_test, model_path):
    input_layer = Input(shape=(X_train.shape[1],))
    x = Dense(64, activation='relu')(input_layer)
    x = Dropout(0.1)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(16, activation='relu')(x)
    x = Dropout(0.1)(x)
    output = Dense(1)(x)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss=MeanSquaredError())

    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train, y_train,
              validation_split=0.2,
              epochs=50,
              batch_size=32,
              callbacks=[es],
              verbose=0)

    y_pred = model.predict(X_test).flatten()
    model.save(f"{model_path}.h5")
    return y_pred, model

In [7]:
# === Training loop for each target ===
results = []

for target in targets:
    print(f"\n📌 Training models to predict: {target}")

    # Features = other targets + 'Year'
    feature_cols = [col for col in targets if col != target] + ["Year"]

    # Drop rows with missing target/features
    data = df.dropna(subset=[target] + feature_cols).copy()
    X = data[feature_cols]
    y = data[target]

    # Scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # === Train ANN ===
    print("🔹 Training ANN...")
    y_pred_ann, model_ann = train_ann(X_train, y_train, X_test, y_test, f"../models/{target}_ANN")
    joblib.dump(scaler, f"../models/{target}_ANN_scaler.pkl", compress=3)

    mse_ann = mean_squared_error(y_test, y_pred_ann)
    r2_ann = r2_score(y_test, y_pred_ann)

    results.append({"Target": target, "Model": "ANN", "MSE": mse_ann, "R2": r2_ann})

    # === Train Random Forest ===
    print("🔹 Training Random Forest...")
    rf = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    joblib.dump({"model": rf, "scaler": scaler}, f"../models/{target}_RandomForest.pkl", compress=3)

    results.append({"Target": target, "Model": "RandomForest", "MSE": mean_squared_error(y_test, y_pred_rf), "R2": r2_score(y_test, y_pred_rf)})

    # === Train Linear Regression ===
    print("🔹 Training Linear Regression...")
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)
    joblib.dump({"model": lr, "scaler": scaler}, f"../models/{target}_LinearRegression.pkl")

    results.append({"Target": target, "Model": "LinearRegression", "MSE": mean_squared_error(y_test, y_pred_lr), "R2": r2_score(y_test, y_pred_lr)})

    # === Train XGBoost ===
    print("🔹 Training XGBoost...")
    xg = XGBRegressor(n_estimators=50, learning_rate=0.1, random_state=42)
    xg.fit(X_train, y_train)
    y_pred_xg = xg.predict(X_test)
    joblib.dump({"model": xg, "scaler": scaler}, f"../models/{target}_XGBoost.pkl")

    results.append({"Target": target, "Model": "XGBoost", "MSE": mean_squared_error(y_test, y_pred_xg), "R2": r2_score(y_test, y_pred_xg)})

print("\n✅ All models trained and saved.")


📌 Training models to predict: Production
🔹 Training ANN...


2025-07-19 00:18:52.358558: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m3599/3599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 692us/step




🔹 Training Random Forest...
🔹 Training Linear Regression...
🔹 Training XGBoost...

📌 Training models to predict: Area harvested
🔹 Training ANN...
[1m3599/3599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 690us/step




🔹 Training Random Forest...
🔹 Training Linear Regression...
🔹 Training XGBoost...

📌 Training models to predict: Yield
🔹 Training ANN...
[1m3599/3599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 649us/step




🔹 Training Random Forest...
🔹 Training Linear Regression...
🔹 Training XGBoost...

✅ All models trained and saved.


In [8]:
# === Summary table ===
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=["Target", "R2"], ascending=[True, False])
print("\n📊 Model Evaluation Results:")
print(results_df)


📊 Model Evaluation Results:
            Target             Model       MSE        R2
5   Area harvested      RandomForest  0.015408  0.998346
7   Area harvested           XGBoost  0.020844  0.997762
4   Area harvested               ANN  0.032210  0.996542
6   Area harvested  LinearRegression  0.042174  0.995473
1       Production      RandomForest  0.005827  0.999393
3       Production           XGBoost  0.010073  0.998951
2       Production  LinearRegression  0.041392  0.995688
0       Production               ANN  0.061595  0.993583
9            Yield      RandomForest  0.026414  0.986472
8            Yield               ANN  0.031133  0.984056
11           Yield           XGBoost  0.031472  0.983882
10           Yield  LinearRegression  0.042631  0.978167


In [9]:
from huggingface_hub import HfApi, upload_file, create_repo
import os
from dotenv import load_dotenv
load_dotenv()

import os
HF_TOKEN = os.getenv("HF_TOKEN")

create_repo("syazayacob/crop_models", repo_type="model", token=HF_TOKEN)

REPO_ID = "syazayacob/crop_models"

# Corrected create_repo call
api = HfApi()
try:
    api.create_repo(repo_id=REPO_ID, token=HF_TOKEN, repo_type="model", exist_ok=True)
    print(f"✅ Repository created or already exists: {REPO_ID}")
except Exception as e:
    print(f"❌ Failed to create repo: {e}")

# Adjust this path to match where your files are stored
model_folder = "../models"

model_files = [
    "Area harvested_ANN.h5",
    "Area harvested_ANN_scaler.pkl",
    "Area harvested_LinearRegression.pkl",
    "Area harvested_RandomForest.pkl",
    "Area harvested_XGBoost.pkl",
    "Production_ANN.h5",
    "Production_ANN_scaler.pkl",
    "Production_LinearRegression.pkl",
    "Production_RandomForest.pkl",
    "Production_XGBoost.pkl",
    "Yield_ANN.h5",
    "Yield_ANN_scaler.pkl",
    "Yield_LinearRegression.pkl",
    "Yield_RandomForest.pkl",
    "Yield_XGBoost.pkl"
]

# Upload with corrected file paths
for filename in model_files:
    full_path = os.path.join(model_folder, filename)
    if os.path.exists(full_path):
        try:
            upload_file(
                path_or_fileobj=full_path,
                path_in_repo=filename,
                repo_id=REPO_ID,
                repo_type="model",
                token=HF_TOKEN
            )
            print(f"✅ Uploaded: {filename}")
        except Exception as e:
            print(f"❌ Failed to upload {filename}: {e}")
    else:
        print(f"❌ File not found: {full_path}")

  from .autonotebook import tqdm as notebook_tqdm


✅ Repository created or already exists: syazayacob/crop_models
✅ Uploaded: Area harvested_ANN.h5
✅ Uploaded: Area harvested_ANN_scaler.pkl
✅ Uploaded: Area harvested_LinearRegression.pkl
✅ Uploaded: Area harvested_RandomForest.pkl
✅ Uploaded: Area harvested_XGBoost.pkl
✅ Uploaded: Production_ANN.h5
✅ Uploaded: Production_ANN_scaler.pkl
✅ Uploaded: Production_LinearRegression.pkl
✅ Uploaded: Production_RandomForest.pkl
✅ Uploaded: Production_XGBoost.pkl
✅ Uploaded: Yield_ANN.h5
✅ Uploaded: Yield_ANN_scaler.pkl
✅ Uploaded: Yield_LinearRegression.pkl
✅ Uploaded: Yield_RandomForest.pkl
✅ Uploaded: Yield_XGBoost.pkl


In [None]:
#