In [None]:
# -------------------------------
# 1️⃣ Install Required Libraries
# -------------------------------
!pip install scikit-learn==1.0.2 joblib==1.1.0 --quiet

# -------------------------------
# 2️⃣ Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import joblib
import tarfile
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import get_execution_role
import boto3

# -------------------------------
# 3️⃣ Initialize SageMaker Session
# -------------------------------
# Destination bucket for the packaged model artifact (replace with your own).
bucket = "mlpredictagri"

# Execution role of this notebook; handed to SageMaker when the model is deployed.
role = get_execution_role()

# SageMaker session created with the SDK's default configuration.
session = sagemaker.Session()

# -------------------------------
# 4️⃣ Load Your Real Dataset
# -------------------------------
# Read the raw CSV from the notebook's working directory.
df = pd.read_csv("final_real_dataset_complete.csv")

# Display dataset info
print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
# A bare `df.head()` is rendered only when it is the last expression of a
# cell; in the middle of a cell it is evaluated and silently discarded.
# `display` (provided by IPython in notebook kernels) shows it explicitly.
display(df.head())

# -------------------------------
# 5️⃣ Prepare Data
# -------------------------------
# Rule-based target 'irrigation_needed': 1 when ALL three conditions hold,
# otherwise 0:
#   soil moisture < 15  AND  ambient temperature > 30  AND  rainfall < 5
SOIL_MOISTURE_MAX = 15
AMBIENT_TEMPERATURE_MIN = 30
RAINFALL_MAX = 5

# Vectorized column comparisons instead of a row-wise df.apply(..., axis=1),
# which calls a Python lambda once per row. Missing values compare as False
# in both forms, so such rows are labelled 0 either way.
needs_irrigation = (
    (df['Soil_Moisture'] < SOIL_MOISTURE_MAX)
    & (df['Ambient_Temperature'] > AMBIENT_TEMPERATURE_MIN)
    & (df['Rainfall'] < RAINFALL_MAX)
)
df['irrigation_needed'] = needs_irrigation.astype('int64')

# NOTE(review): the target is a deterministic function of three of the input
# features below, so a classifier can reproduce the rule exactly. Perfect test
# accuracy therefore shows the rule was recovered, not that real irrigation
# outcomes can be predicted.

# Features and target
feature_columns = [
    'Ambient_Temperature',
    'Humidity',
    'Soil_Moisture',
    'Light_Intensity',
    'Rainfall',
    'Annual CO₂ emissions (tonnes )',
]
X = df[feature_columns]
y = df['irrigation_needed']

# Split dataset: 80% train / 20% test, seeded for reproducibility.
# NOTE(review): the classes are imbalanced; consider passing stratify=y so
# both splits keep the same class ratio (this would change the exact split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------------
# 6️⃣ Train Model with class_weight
# -------------------------------
# 100-tree random forest with a fixed seed. class_weight='balanced' asks
# scikit-learn to weight classes inversely to their frequency in y_train.
# `fit` returns the estimator itself, so the chained call binds the fitted model.
model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split and compute every summary before printing.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
predicted_counts = pd.Series(y_pred).value_counts()

print("✅ Accuracy:", accuracy)
print("Classification Report:\n", report)

# How many samples were assigned to each class (1 = irrigation needed).
print("Predicted counts:", predicted_counts)


# -------------------------------
# 7️⃣ Save Model
# -------------------------------
# Serialize the fitted classifier into the working directory using pickle
# protocol 2; the packaging step copies this file into the model archive.
joblib.dump(value=model, filename="irrigation_model.joblib", protocol=2)
print("✅ Model trained and saved")

# -------------------------------
# 8️⃣ Package Model (with predict.py)
# -------------------------------
!mkdir -p model
!cp irrigation_model.joblib model/

# Create a predict.py file for inference
with open("predict.py", "w") as f:
    f.write('''
import joblib
import json
import os

def model_fn(model_dir):
    model = joblib.load(os.path.join(model_dir, "irrigation_model.joblib"))
    return model

def input_fn(request_body, request_content_type):
    if request_content_type == "application/json":
        return json.loads(request_body)
    raise ValueError("Unsupported content type: " + request_content_type)

def predict_fn(input_data, model):
    data = [input_data['Ambient_Temperature'],
            input_data['Humidity'],
            input_data['Soil_Moisture'],
            input_data['Light_Intensity'],
            input_data['Rainfall'],
            input_data['Annual CO₂ emissions (tonnes )']]
    prediction = model.predict([data])
    return {"irrigation_needed": int(prediction[0])}

def output_fn(prediction, content_type):
    return json.dumps(prediction)
''')

!cp predict.py model/
!tar -czf model.tar.gz -C model .

# -------------------------------
# 9️⃣ Upload Model to S3
# -------------------------------
# Object key of the model archive inside the bucket.
model_key = "models/model_real.tar.gz"

s3 = boto3.client('s3')
s3.upload_file(Filename="model.tar.gz", Bucket=bucket, Key=model_key)

# Full S3 URI of the uploaded archive, used as model_data at deployment.
s3_path = f"s3://{bucket}/{model_key}"
print("✅ Model uploaded to:", s3_path)

# -------------------------------
# 🔟 Deploy Model to SageMaker Endpoint
# -------------------------------
# NOTE: this rebinds `model`. From here on the name refers to the SageMaker
# model wrapper, no longer to the fitted RandomForestClassifier.
model = SKLearnModel(
    entry_point="predict.py",      # inference handlers written during packaging
    model_data=s3_path,            # archive uploaded in the previous step
    role=role,
    py_version="py3",
    framework_version="1.0-1",
)

# Create a real-time endpoint backed by a single ml.t2.medium instance.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
)
print(" Model deployed successfully to SageMaker Endpoint!")


In [159]:
import json

from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer

# Use the deployed endpoint.
# NOTE(review): this name is hard-coded from an earlier deployment; a fresh
# Restart & Run All creates an endpoint with a different name — confirm it
# still exists, or use the name of the predictor returned by deploy().
endpoint_name = "sagemaker-scikit-learn-2025-08-07-15-40-15-026"

# In SageMaker Python SDK v2 the request content type comes from the
# predictor's serializer, so it is configured at construction rather than by
# assigning `predictor.content_type`. JSONSerializer sends the payload as
# "application/json", the only content type the endpoint's input_fn accepts.
predictor = Predictor(endpoint_name=endpoint_name, serializer=JSONSerializer())

# Example test input (replace with real values from your dataset range).
# The keys must match the feature names read by predict_fn in predict.py.
test_input = {
    "Ambient_Temperature": 35.0,
    "Humidity": 60.0,
    "Soil_Moisture": 12.0,
    "Light_Intensity": 700.0,
    "Rainfall": 2.0,
    "Annual CO₂ emissions (tonnes )": 4.0
}

# Invoke the endpoint. The serializer JSON-encodes the dict itself, so the
# dict is passed directly; the default deserializer returns the raw response
# bytes, which are decoded for printing.
response = predictor.predict(test_input)
print("🔍 Prediction Response:", response.decode('utf-8'))


🔍 Prediction Response: {"irrigation_needed": 1}


In [160]:
# Re-display the test-set accuracy; `accuracy` is computed in the training cell
# via accuracy_score(y_test, y_pred), so that cell must have been run first.
print("✅ Accuracy:", accuracy)


✅ Accuracy: 1.0


In [161]:
# Compare the class balance of the true test labels with that of the
# predictions (y_pred is wrapped in a Series to get value_counts()).
actual_counts = y_test.value_counts()
predicted_label_counts = pd.Series(y_pred).value_counts()

print("Test Set Distribution:\n", actual_counts)
print("Prediction Distribution:\n", predicted_label_counts)


Test Set Distribution:
 irrigation_needed
0    221
1     19
Name: count, dtype: int64
Prediction Distribution:
 0    221
1     19
Name: count, dtype: int64


In [162]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test split, called with true labels first and
# predictions second.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)


[[221   0]
 [  0  19]]


In [163]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the test split.
test_report = classification_report(y_test, y_pred)
print(test_report)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       221
           1       1.00      1.00      1.00        19

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240

