<a href="https://colab.research.google.com/github/vinileodido/MVP_PucRio_ML/blob/main/IoT_Agriculture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import numpy as np


# Kaggle dataset coordinates, named so the data source is easy to spot and change.
DATASET_HANDLE = "wisam1985/iot-agriculture-2024"
DATASET_FILE = "IoTProcessed_Data.csv"

# Load the CSV from the Kaggle dataset through the pandas adapter.
df_iot = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, DATASET_HANDLE, DATASET_FILE)

# Preview the first rows as a sanity check on the load.
display(df_iot.head())

In [None]:
# Build the cleaned frame as a NEW object: dropna() returns a copy, so the raw
# frame `df_iot` is left untouched. (Aliasing `df = df_iot` and dropping in
# place would also modify `df_iot`, making any earlier preview of it stale.)
# Rows with a missing value in any column are removed; this is assumed to be a
# small fraction of the data -- verify with `df_iot.isna().mean()`.
df = df_iot.dropna()

# The task is to predict whether the fan actuator is on or off.
# Inputs are the environmental sensor readings and the nutrient levels.
# NOTE(review): 'tempreature' is spelled this way on purpose -- presumably it
# matches the CSV column header; confirm before "fixing" the spelling.
features = ['tempreature', 'humidity', 'water_level', 'N', 'P', 'K']
target = 'Fan_actuator_ON'

X = df[features]
y = df[target]

# Hold out 20% of the rows for the final test; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Both candidate models use the same two-stage layout: standardize the features,
# then fit the classifier. The step names ('scaler', 'classifier') are part of
# the pipelines' parameter namespace (e.g. 'classifier__C'), so keep them stable.

# Logistic Regression candidate
lr_steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42)),
]
pipeline_lr = Pipeline(steps=lr_steps)

# Random Forest candidate
rf_steps = [
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
]
pipeline_rf = Pipeline(steps=rf_steps)

# 5-fold cross-validation of each pipeline on the training split only.
# No `scoring` is passed, so each estimator's default scorer is used
# (reported below as accuracy).
cv_scores_lr = cross_val_score(pipeline_lr, X_train, y_train, cv=5)
cv_scores_rf = cross_val_score(pipeline_rf, X_train, y_train, cv=5)

separator = "-" * 50

print("Resultados da Validação Cruzada (Acurácia):")
print(separator)
# One report per model: mean and standard deviation, then the per-fold scores.
for pipeline_label, fold_scores in (
    ("Pipeline de Regressão Logística", cv_scores_lr),
    ("Pipeline de Random Forest", cv_scores_rf),
):
    print(f"{pipeline_label}: {fold_scores.mean():.4f} (Média) ± {fold_scores.std():.4f} (Desvio Padrão)")
    print(f"Scores por fold: {fold_scores}")
    print(separator)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Hyperparameter search spaces. Keys follow the "<pipeline step>__<parameter>"
# convention, so each entry targets the pipeline step named 'classifier'.

# Logistic Regression: regularization strength C and the solver.
param_grid_lr = dict(
    classifier__C=[0.01, 0.1, 1, 10, 100],
    classifier__solver=['liblinear', 'lbfgs'],
)

# Random Forest: number of trees, maximum tree depth, minimum samples per leaf.
param_grid_rf = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[5, 10, None],
    classifier__min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search with 5-fold CV, scored by accuracy, on the training split.
# n_jobs is deliberately left at its default (no n_jobs=-1): it was removed to
# avoid a pickling error.

# Logistic Regression search
grid_search_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

# Random Forest search
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

# Keep the best pipeline found by each search.
best_lr = grid_search_lr.best_estimator_
best_rf = grid_search_rf.best_estimator_

print("Melhores Hiperparâmetros encontrados:")
print(f"Regressão Logística: {grid_search_lr.best_params_}")
print(f"Random Forest: {grid_search_rf.best_params_}")
print("-" * 50)

def _report_test_performance(model_label, y_true, y_pred, figure_path):
    """Print test-set metrics for one model and plot/save its confusion matrix.

    Parameters
    ----------
    model_label : str
        Human-readable model name, used in the printed header and the plot title.
    y_true : array-like
        Ground-truth labels of the test set.
    y_pred : array-like
        Labels predicted by the model for the same rows.
    figure_path : str
        File the confusion-matrix figure is saved to.

    Returns
    -------
    tuple
        ``(accuracy, cm)``: the accuracy score and the confusion matrix.
    """
    accuracy = accuracy_score(y_true, y_pred)

    print(f"\nAvaliação do modelo de {model_label} no conjunto de teste:")
    print(f"Acurácia: {accuracy:.4f}")
    print("\nRelatório de Classificação:")
    print(classification_report(y_true, y_pred))

    cm = confusion_matrix(y_true, y_pred)

    # Explicit figure/axes so the plot does not depend on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(8, 6))
    # NOTE(review): the tick labels assume exactly two classes, ordered OFF then ON
    # -- confirm against the encoding of the target column.
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['OFF', 'ON'],
        yticklabels=['OFF', 'ON'],
        ax=ax,
    )
    ax.set_title(f'Matriz de Confusão - {model_label}')
    ax.set_xlabel('Predito')
    ax.set_ylabel('Verdadeiro')
    fig.savefig(figure_path)
    plt.show()

    return accuracy, cm


# Predict on the held-out test set with the tuned pipelines.
y_pred_lr = best_lr.predict(X_test)
y_pred_rf = best_rf.predict(X_test)

# Same report for each model: metrics, classification report, confusion matrix.
accuracy_lr, cm_lr = _report_test_performance(
    "Regressão Logística", y_test, y_pred_lr, 'confusion_matrix_lr.png'
)
accuracy_rf, cm_rf = _report_test_performance(
    "Random Forest", y_test, y_pred_rf, 'confusion_matrix_rf.png'
)

# Side-by-side summary, reusing the accuracies computed above.
print("-" * 50)
print("Comparação Final de Modelos:")
print(f"Acurácia no Teste - Regressão Logística: {accuracy_lr:.4f}")
print(f"Acurácia no Teste - Random Forest: {accuracy_rf:.4f}")
