In [8]:
# -------------------------------
# 1️⃣ Install Required Libraries
# -------------------------------
!pip install scikit-learn==1.0.2 joblib==1.1.0 --quiet

# -------------------------------
# 2️⃣ Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import joblib
import tarfile
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import get_execution_role
import boto3
import os

# -------------------------------
# 3️⃣ Initialize SageMaker Session
# -------------------------------
# IAM role of the notebook; handed to SKLearnModel in the deployment step.
role = get_execution_role()
# S3 bucket that receives the packaged model artifact.
bucket = "mlpredictagri"    # <-- change if different
# SageMaker SDK session created with default settings.
session = sagemaker.Session()

# -------------------------------
# 4️⃣ Load Dataset
# -------------------------------
df = pd.read_csv("final_real_dataset_complete.csv")

# Fail fast with a readable message if the CSV schema differs from what the
# labelling and feature-selection steps index by name; otherwise the problem
# only surfaces later as a KeyError raised from inside a lambda.
_required_columns = [
    'Ambient_Temperature', 'Humidity', 'Soil_Moisture',
    'Light_Intensity', 'Rainfall', 'Annual CO₂ emissions (tonnes )',
]
_missing_columns = [name for name in _required_columns if name not in df.columns]
if _missing_columns:
    raise ValueError(f"Missing required column(s) in CSV: {_missing_columns}")

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())

# -------------------------------
# 5️⃣ Add Target Column
# -------------------------------
import warnings  # local import: only this step needs it

# Rule-based label: 1 when Soil_Moisture < 15 AND Ambient_Temperature > 30 AND
# Rainfall < 5, else 0. Built as a boolean mask instead of a row-wise apply;
# comparisons against NaN evaluate to False, so such rows are labelled 0.
df['irrigation_needed'] = (
    (df['Soil_Moisture'] < 15)
    & (df['Ambient_Temperature'] > 30)
    & (df['Rainfall'] < 5)
).astype(int)

# A single-class target makes the classifier a constant predictor and every
# metric trivially perfect, so surface that loudly instead of training silently.
if df['irrigation_needed'].nunique() < 2:
    warnings.warn(
        "irrigation_needed contains a single class; the model will always predict "
        "it and accuracy will be trivially 1.0. Revisit the labelling thresholds.",
        UserWarning,
    )

# -------------------------------
# 6️⃣ Features & Target
# -------------------------------
# Model inputs, in the order predict.py builds its feature vector.
feature_columns = ['Ambient_Temperature', 'Humidity', 'Soil_Moisture',
                   'Light_Intensity', 'Rainfall', 'Annual CO₂ emissions (tonnes )']
X = df[feature_columns]
y = df['irrigation_needed']

# Split data. Stratify on the label so a rare positive class is represented in
# both splits; stratification needs at least two rows per class, so fall back
# to a plain random split when that does not hold.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# -------------------------------
# 7️⃣ Train RandomForest with Class Weight
# -------------------------------
# Forest hyper-parameters, kept together so they are easy to tweak.
forest_params = {
    "n_estimators": 100,
    "random_state": 42,
    "class_weight": 'balanced',
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# Evaluate on the held-out split and summarise what was predicted.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
predicted_counts = pd.Series(y_pred).value_counts()
print(" Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)
print("Predicted counts:\n", predicted_counts)

# -------------------------------
# 8️⃣ Save Model (named model.joblib)
# -------------------------------
# Persist the fitted forest under the file name that predict.py's model_fn loads.
model_path = "model.joblib"
joblib.dump(value=model, filename=model_path, protocol=2)
print(" Model saved as model.joblib")

# -------------------------------
# 9️⃣ Create predict.py for Inference
# -------------------------------
# Source of the SageMaker entry-point script (written to predict.py below).
# input_fn validates the request instead of passing None values to the model.
predict_code = '''
"""SageMaker inference handlers for the irrigation RandomForest model."""
import joblib
import json
import os

# Model inputs in training order; each entry is (canonical key, accepted alias).
FEATURE_KEYS = [
    ("Ambient_Temperature", "ambient_temperature"),
    ("Humidity", "humidity"),
    ("Soil_Moisture", "soil_moisture"),
    ("Light_Intensity", "light_intensity"),
    ("Rainfall", "rainfall"),
    ("Annual CO₂ emissions (tonnes )", "annual_co2_ppm"),
]

def model_fn(model_dir):
    """Load the trained estimator from <model_dir>/model.joblib."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))

def input_fn(request_body, request_content_type):
    """Parse a JSON object into a dict of floats keyed by canonical feature name.

    Raises ValueError for an unsupported content type, a non-object payload,
    a missing feature or a non-numeric feature value.
    """
    if request_content_type != "application/json":
        raise ValueError(f"Unsupported content type: {request_content_type}")

    data = json.loads(request_body)
    if not isinstance(data, dict):
        raise ValueError("JSON payload must be an object keyed by feature name")

    parsed, missing = {}, []
    for name, alias in FEATURE_KEYS:
        value = data.get(name)
        if value is None:
            value = data.get(alias)
        if value is None:
            missing.append(name)
            continue
        try:
            parsed[name] = float(value)
        except (TypeError, ValueError):
            raise ValueError(f"Feature {name!r} must be numeric, got {value!r}")

    # Report every absent feature at once rather than failing inside the model.
    if missing:
        raise ValueError("Missing required feature(s): " + ", ".join(missing))
    return parsed

def predict_fn(input_data, model):
    """Score one observation; returns {"irrigation_needed": 0 or 1}."""
    features = [input_data[name] for name, _alias in FEATURE_KEYS]
    prediction = model.predict([features])
    return {"irrigation_needed": int(prediction[0])}

def output_fn(prediction, content_type):
    """Serialise the prediction dict as JSON."""
    return json.dumps(prediction)
'''

# The script contains non-ASCII feature names ("CO₂"). Python reads source
# files as UTF-8 by default, so write it as UTF-8 explicitly instead of relying
# on the locale's default encoding.
with open("predict.py", "w", encoding="utf-8") as f:
    f.write(predict_code)

# -------------------------------
# 🔟 Package Model Artifacts
# -------------------------------
import shutil  # local import: replaces the `!cp` shell calls

# Stage the two artifacts in model/ and archive exactly those files.
# Pure-Python copy/tar raises on failure (a failed `!cp` or `!tar` would let the
# notebook continue and upload a stale or empty archive), and listing the files
# explicitly keeps leftovers from earlier runs in model/ out of the tarball.
_artifacts = ("model.joblib", "predict.py")
os.makedirs("model", exist_ok=True)
for _name in _artifacts:
    shutil.copy(_name, os.path.join("model", _name))

with tarfile.open("model.tar.gz", "w:gz") as _archive:
    for _name in _artifacts:
        # arcname places each file at the root of the archive.
        _archive.add(os.path.join("model", _name), arcname=_name)

# -------------------------------
# 1️⃣1️⃣ Upload to S3
# -------------------------------
# Object key of the artifact inside the bucket.
model_key = "models/model_real.tar.gz"

s3 = boto3.client("s3")
s3.upload_file(Filename="model.tar.gz", Bucket=bucket, Key=model_key)

# S3 URI consumed by SKLearnModel(model_data=...).
s3_path = "s3://{}/{}".format(bucket, model_key)
print(" Uploaded model to:", s3_path)

# -------------------------------
# 1️⃣2️⃣ Deploy Model to Endpoint
# -------------------------------
# Describe the model: packaged artifact, inference script and scikit-learn
# container version. Reuse the `session` created during setup so the model and
# endpoint are created through the same SageMaker session as the rest of the
# notebook instead of an implicitly constructed one.
sk_model = SKLearnModel(
    model_data=s3_path,
    role=role,
    entry_point="predict.py",
    framework_version="1.0-1",
    py_version="py3",
    sagemaker_session=session,
)

# Create a real-time endpoint backed by a single instance.
predictor = sk_model.deploy(
    instance_type="ml.t2.medium",
    initial_instance_count=1
)

print(" Model deployed at endpoint:", predictor.endpoint_name)


Dataset Shape: (1200, 7)
Columns: ['Date', 'Ambient_Temperature', 'Humidity', 'Rainfall', 'Light_Intensity', 'Soil_Moisture', 'Annual CO₂ emissions (tonnes )']


Unnamed: 0,Date,Ambient_Temperature,Humidity,Rainfall,Light_Intensity,Soil_Moisture,Annual CO₂ emissions (tonnes )
0,2024-10-03,22.240245,55.291904,0.0,556.172805,27.521109,413092654.5
1,2024-10-03,21.706763,63.949181,0.0,596.136721,14.835566,413092654.5
2,2024-10-03,21.180946,67.837956,0.0,591.124627,17.086362,413092654.5
3,2024-10-04,22.593302,58.190811,9.6,241.412476,15.336156,413092654.5
4,2024-10-04,28.929001,63.772036,9.6,444.49383,39.822216,413092654.5


✅ Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       240

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240

Predicted counts:
 0    240
dtype: int64
✅ Model saved as model.joblib
✅ Uploaded model to: s3://mlpredictagri/models/model_real.tar.gz
--------------!✅ Model deployed at endpoint: sagemaker-scikit-learn-2025-08-10-17-59-37-428


In [9]:
import boto3
import json

# Create SageMaker runtime client
# Endpoint to call; replace with your actual endpoint name from the deployment step.
ENDPOINT_NAME = "sagemaker-scikit-learn-2025-08-10-17-59-37-428"

# One sample observation keyed by the training column names.
payload = {
    "Ambient_Temperature": 29.3,
    "Humidity": 57.1,
    "Soil_Moisture": 47.2,
    "Light_Intensity": 591.1,
    "Rainfall": 0,
    "Annual CO₂ emissions (tonnes )": 478.8
}

# Build the request up front, then send it through the SageMaker runtime client.
runtime = boto3.client("sagemaker-runtime")
request_kwargs = {
    "EndpointName": ENDPOINT_NAME,
    "ContentType": "application/json",
    "Body": json.dumps(payload),
}
response = runtime.invoke_endpoint(**request_kwargs)

# The response body is a stream; read it once and decode the JSON document.
raw_body = response["Body"].read()
result = json.loads(raw_body)
print("✅ Prediction Response:", result)


✅ Prediction Response: {'irrigation_needed': 0}


In [10]:
# ================================
# 1) Install exact runtime libs
# ================================
!pip install --quiet scikit-learn==1.0.2 joblib==1.1.0 sagemaker==2.* boto3==1.* pandas numpy

# ================================
# 2) Imports
# ================================
import os, json, tarfile, time, boto3
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.model import SKLearnModel

# ================================
# 3) Config
# ================================
REGION = boto3.Session().region_name or "us-east-1"
BUCKET = "mlpredictagri"            # <-- change if you use a different bucket
MODEL_KEY_PREFIX = "models"
ENTRY_POINT_NAME = "predict.py"
FRAMEWORK_VERSION = "1.0-1"         # sklearn 1.0 container
INSTANCE_TYPE = "ml.t2.medium"

sm_session = sagemaker.Session()
s3 = boto3.client("s3", region_name=REGION)

# Role (works in SageMaker Studio/Notebook). When it cannot be resolved, fall
# back to the SAGEMAKER_ROLE_ARN environment variable, and stop here with a
# clear message if that is unset too: an empty role would otherwise only fail
# much later, at deployment time.
try:
    ROLE = get_execution_role()
except Exception as role_error:
    ROLE = os.environ.get("SAGEMAKER_ROLE_ARN", "")
    if not ROLE:
        raise RuntimeError(
            "Could not determine a SageMaker execution role; set the "
            "SAGEMAKER_ROLE_ARN environment variable to an IAM role ARN."
        ) from role_error

print("Region:", REGION)
print("Bucket:", BUCKET)
# NOTE(review): the role ARN embeds the AWS account id; consider clearing this
# output before sharing the notebook.
print("Role:", ROLE)

# ================================
# 4) Load dataset
# ================================
CSV_PATH = "final_real_dataset_complete.csv"  # must be in working directory
df = pd.read_csv(CSV_PATH)

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())

# Sanity: every model input must be present (rename if your CSV uses different names).
required_cols = [
    "Ambient_Temperature",
    "Humidity",
    "Soil_Moisture",
    "Light_Intensity",
    "Rainfall",
    "Annual CO₂ emissions (tonnes )"
]
# Stop at the first required column the CSV lacks, in the order listed above.
first_missing = next((name for name in required_cols if name not in df.columns), None)
if first_missing is not None:
    raise ValueError(f"Missing required column in CSV: {first_missing}")

# ================================
# 5) Create target: irrigation_needed (rule-of-thumb)
#    You can replace with your true labels if you have them
# ================================
import warnings  # local import: only this step needs it

# Derive the label only when the CSV does not already provide one.
# Rule: 1 when Soil_Moisture < 15 AND Ambient_Temperature > 30 AND Rainfall < 5,
# else 0. Built as a boolean mask instead of a row-wise apply; comparisons
# against NaN evaluate to False, so such rows are labelled 0.
if "irrigation_needed" not in df.columns:
    df["irrigation_needed"] = (
        (df["Soil_Moisture"] < 15)
        & (df["Ambient_Temperature"] > 30)
        & (df["Rainfall"] < 5)
    ).astype(int)

# A single-class target makes the classifier a constant predictor and every
# metric trivially perfect, so surface that loudly instead of training silently.
if df["irrigation_needed"].nunique() < 2:
    warnings.warn(
        "irrigation_needed contains a single class; the model will always predict "
        "it and accuracy will be trivially 1.0. Revisit the labelling thresholds "
        "or supply real labels.",
        UserWarning,
    )

# ================================
# 6) Train / Test split
# ================================
TARGET = "irrigation_needed"
# Model inputs; predict.py builds its feature vector in this same order.
FEATURES = [
    "Ambient_Temperature",
    "Humidity",
    "Soil_Moisture",
    "Light_Intensity",
    "Rainfall",
    "Annual CO₂ emissions (tonnes )"
]

# Work on copies so later steps never write back into df.
X, y = df[FEATURES].copy(), df[TARGET].astype(int).copy()

# 80/20 split, seeded and stratified on the label.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ================================
# 7) Train model
# ================================
# Forest hyper-parameters, kept together so they are easy to tweak.
rf_settings = {
    "n_estimators": 200,
    "random_state": 42,
    "class_weight": "balanced",
    "n_jobs": -1,
}
model = RandomForestClassifier(**rf_settings)
model.fit(X_train, y_train)

# ================================
# 8) Evaluate
# ================================
# Compute every summary first, then print them in one place.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred, digits=4)
matrix = confusion_matrix(y_test, y_pred)
predicted_counts = pd.Series(y_pred).value_counts()

print(f"✅ Accuracy: {acc:.4f}")
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", matrix)
print("\nPredicted counts:\n", predicted_counts)

# ================================
# 9) Save model.joblib (top-level)
# ================================
# File name that predict.py's model_fn loads from the model directory.
MODEL_FILE = "model.joblib"
joblib.dump(value=model, filename=MODEL_FILE)
print("✅ Saved:", MODEL_FILE)

# ================================
# 10) Create robust predict.py
#     Accepts CSV or JSON. 6 features in training order.
# ================================
# Source of the SageMaker entry-point script (written to ENTRY_POINT_NAME below).
predict_py = r'''
"""SageMaker inference handlers for the irrigation RandomForest model.

Accepted requests:
  * text/csv          - six comma-separated values in training order
  * application/json  - a flat object keyed by feature name,
                        {"instances": [...]} / {"inputs": [...]},
                        a list of six values, or a list of rows.
Only the first row of a multi-row payload is scored.
"""
import json
import os
import joblib
import numpy as np

# 6 features order (as trained):
# [Ambient_Temperature, Humidity, Soil_Moisture, Light_Intensity, Rainfall, Annual CO2]
FEATURES = 6

# Accepted request keys per feature, in training order; the first key that is
# present with a non-null value wins.
_ALIASES = (
    ("Ambient_Temperature", "ambient_temperature", "temperature"),
    ("Humidity", "humidity"),
    ("Soil_Moisture", "soil_moisture", "soilMoisture"),
    ("Light_Intensity", "light_intensity", "light"),
    ("Rainfall", "rainfall", "rain"),
    ("Annual CO₂ emissions (tonnes )", "annual_co2_ppm", "annual_co2_emissions", "co2"),
)

def model_fn(model_dir):
    """Load the trained estimator from <model_dir>/model.joblib."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))

def _to_float(x, default=0.0):
    """Best-effort float conversion; missing or unparseable values become default."""
    try:
        return float(x)
    except Exception:
        return float(default)

def _first_present(d, keys):
    """Return the first non-null value among keys, or None.

    Uses an explicit None test so that a legitimate 0 is not treated as missing.
    """
    for key in keys:
        value = d.get(key)
        if value is not None:
            return value
    return None

def _map_from_dict(d):
    """Build the feature vector (training order) from a keyed object."""
    return [_to_float(_first_present(d, keys), 0.0) for keys in _ALIASES]

def _row_values(rows, error):
    """Extract one feature vector from a dict, a flat list or a list of rows.

    Raises ValueError(error) when fewer than FEATURES values are available.
    """
    if isinstance(rows, dict):
        return _map_from_dict(rows)
    if not isinstance(rows, (list, tuple)):
        raise ValueError(error)
    row = rows[0] if (rows and isinstance(rows[0], (list, dict))) else rows
    if isinstance(row, dict):
        return _map_from_dict(row)
    vals = [_to_float(v, 0.0) for v in row]
    if len(vals) < FEATURES:
        raise ValueError(error)
    return vals[:FEATURES]

def input_fn(request_body, request_content_type):
    """Turn a CSV or JSON request into a (1, FEATURES) float array.

    Raises ValueError for unsupported content types and malformed payloads.
    """
    ct = (request_content_type or "").lower()
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode("utf-8")

    if ct.startswith("text/csv"):
        # Join lines with a comma, not an empty string: dropping the newline
        # fused the last value of one row with the first value of the next.
        flat = ",".join(request_body.splitlines())
        parts = [p.strip() for p in flat.split(",") if p.strip() != ""]
        vals = [_to_float(p, 0.0) for p in parts]
        if len(vals) < FEATURES:
            raise ValueError(f"CSV expects {FEATURES} values; got {len(vals)}")
        return np.array([vals[:FEATURES]], dtype=float)

    if ct.startswith("application/json"):
        data = json.loads(request_body)

        if isinstance(data, dict):
            # {"instances":[[...]]} or {"inputs":[[...]]}
            if "instances" in data or "inputs" in data:
                rows = data.get("instances") or data.get("inputs")
                if not rows:
                    # Previously fell through and scored an all-zero row.
                    raise ValueError("instances/inputs must contain at least one row")
                vals = _row_values(rows, f"instances row must have {FEATURES} values")
            else:
                # flat dict
                vals = _map_from_dict(data)
            return np.array([vals], dtype=float)

        # list or list-of-lists
        if isinstance(data, list):
            vals = _row_values(data, f"JSON list must have {FEATURES} values")
            return np.array([vals], dtype=float)

        raise ValueError("JSON payload must be an object or an array")

    raise ValueError(f"Unsupported content type: {request_content_type}")

def predict_fn(input_data, model):
    """Score the single parsed row; returns {"prediction": 0 or 1}."""
    pred = model.predict(input_data)
    return {"prediction": int(pred[0])}

def output_fn(prediction, content_type):
    """Serialise the prediction dict as JSON."""
    return json.dumps(prediction)
'''
# The script contains non-ASCII feature names ("CO₂"). Python reads source
# files as UTF-8 by default, so write it as UTF-8 explicitly instead of relying
# on the locale's default encoding.
with open(ENTRY_POINT_NAME, "w", encoding="utf-8") as f:
    f.write(predict_py)
print("✅ Wrote:", ENTRY_POINT_NAME)

# ================================
# 11) Package model for SageMaker
#     SKLearnModel expects model.tar.gz with model.joblib at root.
# ================================
TAR_NAME = "model.tar.gz"
# The archive holds a single member, stored at its root as model.joblib.
with tarfile.open(name=TAR_NAME, mode="w:gz") as archive:
    archive.add(name=MODEL_FILE, arcname="model.joblib")
print("✅ Packed:", TAR_NAME)

# ================================
# 12) Upload model artifact to S3
# ================================
# Timestamped key so each run uploads a new object.
timestamp = time.strftime("%Y%m%d-%H%M%S")
s3_key = "/".join([MODEL_KEY_PREFIX, f"model-{timestamp}.tar.gz"])
s3.upload_file(Filename=TAR_NAME, Bucket=BUCKET, Key=s3_key)
model_data_s3 = "s3://" + BUCKET + "/" + s3_key
print("✅ Uploaded to:", model_data_s3)

# ================================
# 13) Deploy endpoint
# ================================
# Everything SageMaker needs to build the model: artifact, role, inference
# script and container version.
model_settings = {
    "model_data": model_data_s3,
    "role": ROLE,
    "entry_point": ENTRY_POINT_NAME,     # our robust I/O script
    "framework_version": FRAMEWORK_VERSION,
    "py_version": "py3",
    "sagemaker_session": sm_session,
}
sk_model = SKLearnModel(**model_settings)

# Single-instance real-time endpoint.
predictor = sk_model.deploy(instance_type=INSTANCE_TYPE, initial_instance_count=1)
endpoint_name = predictor.endpoint_name
print("✅ Deployed endpoint:", endpoint_name)




Region: us-east-1
Bucket: mlpredictagri
Role: arn:aws:iam::196936075057:role/service-role/AmazonSageMaker-ExecutionRole-20250728T224446
Dataset shape: (1200, 7)
Columns: ['Date', 'Ambient_Temperature', 'Humidity', 'Rainfall', 'Light_Intensity', 'Soil_Moisture', 'Annual CO₂ emissions (tonnes )']


Unnamed: 0,Date,Ambient_Temperature,Humidity,Rainfall,Light_Intensity,Soil_Moisture,Annual CO₂ emissions (tonnes )
0,2024-10-03,22.240245,55.291904,0.0,556.172805,27.521109,413092654.5
1,2024-10-03,21.706763,63.949181,0.0,596.136721,14.835566,413092654.5
2,2024-10-03,21.180946,67.837956,0.0,591.124627,17.086362,413092654.5
3,2024-10-04,22.593302,58.190811,9.6,241.412476,15.336156,413092654.5
4,2024-10-04,28.929001,63.772036,9.6,444.49383,39.822216,413092654.5


✅ Accuracy: 1.0000

Classification Report:
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       240

    accuracy                         1.0000       240
   macro avg     1.0000    1.0000    1.0000       240
weighted avg     1.0000    1.0000    1.0000       240


Confusion Matrix:
 [[240]]

Predicted counts:
 0    240
dtype: int64
✅ Saved: model.joblib
✅ Wrote: predict.py
✅ Packed: model.tar.gz
✅ Uploaded to: s3://mlpredictagri/models/model-20250810-182126.tar.gz
-------------!✅ Deployed endpoint: sagemaker-scikit-learn-2025-08-10-18-21-27-649


In [11]:
import boto3, json

# Endpoint under test and a runtime client for its region.
endpoint = "sagemaker-scikit-learn-2025-08-10-18-21-27-649"
smrt = boto3.client("sagemaker-runtime", region_name="us-east-1")

# One sample observation keyed by the training column names.
json_payload = {
    "Ambient_Temperature": 29.3,
    "Humidity": 57.1,
    "Soil_Moisture": 47.2,
    "Light_Intensity": 591.1,
    "Rainfall": 0,
    "Annual CO₂ emissions (tonnes )": 478.8
}

# Send the observation as JSON and ask for a JSON reply.
request_kwargs = {
    "EndpointName": endpoint,
    "ContentType": "application/json",
    "Accept": "application/json",
    "Body": json.dumps(json_payload),
}
response = smrt.invoke_endpoint(**request_kwargs)

reply_text = response["Body"].read().decode()
print("JSON result:", reply_text)


JSON result: {"prediction": 0}


In [12]:
import boto3

# Endpoint under test and a runtime client for its region.
endpoint = "sagemaker-scikit-learn-2025-08-10-18-21-27-649"
smrt = boto3.client("sagemaker-runtime", region_name="us-east-1")

# Six values in training order: temperature, humidity, soil moisture, light,
# rainfall, annual CO2.
csv_payload = "29.3,57.1,47.2,591.1,0,478.8\n"

# Send the row as CSV and ask for a JSON reply.
request_kwargs = {
    "EndpointName": endpoint,
    "ContentType": "text/csv",
    "Accept": "application/json",
    "Body": csv_payload,
}
response = smrt.invoke_endpoint(**request_kwargs)

reply_text = response["Body"].read().decode()
print("CSV result:", reply_text)


CSV result: {"prediction": 0}
