In [1]:
from google.colab import drive
import os

# Folder on Google Drive that holds the Project 4 preprocessed data.
# Adjust this if the preprocessed data was saved somewhere else.
PROJECT4_DRIVE_FOLDER = '/content/drive/MyDrive/Project4_Processed_Data'

# Optional main project path, kept here for reference only:
# PROJECT_DRIVE_PATH = '/content/drive/MyDrive/Colab_Projects/CropProductionPrediction_Project'

# Attach Google Drive so the files stored there become reachable from the notebook.
_drive_mount_point = '/content/drive'
drive.mount(_drive_mount_point)
print("Google Drive mounted successfully.")
print(f"Project 4 data will be accessed from: {PROJECT4_DRIVE_FOLDER}")

Mounted at /content/drive
Google Drive mounted successfully.
Project 4 data will be accessed from: /content/drive/MyDrive/Project4_Processed_Data


In [2]:
import pandas as pd

# Full path of the preprocessed CSV inside the Project 4 Drive folder
processed_data_path = os.path.join(PROJECT4_DRIVE_FOLDER, 'project4_preprocessed_data.csv')

# Read the CSV from Google Drive. If the file is missing, print a hint about
# where it is expected and re-raise so the notebook stops here.
try:
    df_encoded = pd.read_csv(processed_data_path)
except FileNotFoundError:
    print(f"Error: {processed_data_path} not found. Please ensure your preprocessed data file is saved in the correct location on Google Drive.")
    raise
else:
    print("Preprocessed data loaded successfully.")

# Quick sanity check: preview the first rows and report the frame's dimensions
first_rows = df_encoded.head()
print("\nFirst 5 rows of the preprocessed data:")
print(first_rows)
print(f"\nDataset shape: {df_encoded.shape}")

Preprocessed data loaded successfully.

First 5 rows of the preprocessed data:
   Crop_Index  Production_Value  Area_Value  Yield_Value  Production_Quantity  \
0       112.0             140.3       115.2        122.8                 40.1   
1       112.0             140.3       115.2        122.8                 40.0   
2       112.0             140.3       115.2        122.8                 40.8   
3       112.0             140.3       115.2        122.8                 42.4   
4       112.0             140.3       115.2        122.8                 43.9   

   Cost of Cultivation (`/Hectare) A2+FL  Cost of Cultivation (`/Hectare) C2  \
0                                    NaN                                 NaN   
1                                    NaN                                 NaN   
2                                    NaN                                 NaN   
3                                    NaN                                 NaN   
4                                 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib # For saving the model

# Separate features (X) from the target (y).
# 'Production_Quantity' is the target; every other column is used as a feature.
# NOTE(review): the preview of df_encoded shows NaN values in some feature
# columns. Older scikit-learn releases reject NaN inputs to RandomForestRegressor;
# confirm the installed version handles them or impute before training.
X = df_encoded.drop(columns=['Production_Quantity'])
y = df_encoded['Production_Quantity']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor (200 trees, all CPU cores)
model_final = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
print("\nTraining the model... This might take a moment.")
model_final.fit(X_train, y_train)
print("Model training complete.")

# Predict on the held-out test set
y_pred = model_final.predict(X_test)

# Evaluate the model's performance on the test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # reuse the MSE computed above instead of recomputing it
r2 = r2_score(y_test, y_pred)

print("\n--- Final Model Performance ---")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# Persist the trained model next to the preprocessed data on Google Drive
model_save_path = os.path.join(PROJECT4_DRIVE_FOLDER, 'random_forest_model.joblib')
joblib.dump(model_final, model_save_path)
print(f"\nTrained model saved to: {model_save_path}")


Training the model... This might take a moment.
Model training complete.

--- Final Model Performance ---
Mean Absolute Error (MAE): 1279.16
Mean Squared Error (MSE): 9205142.80
Root Mean Squared Error (RMSE): 3034.00
R-squared (R2): 0.72

Trained model saved to: /content/drive/MyDrive/Project4_Processed_Data/random_forest_model.joblib


In [4]:
import joblib # For loading models
import pandas as pd
import os

# Path of the model file written by the training cell
model_save_path = os.path.join(PROJECT4_DRIVE_FOLDER, 'random_forest_model.joblib')

# Load the trained model. If the file is missing, print a hint and re-raise.
# joblib.load unpickles the file, so only load model files you trust.
try:
    loaded_model = joblib.load(model_save_path)
    print("Trained model loaded successfully for final testing.")
except FileNotFoundError:
    print(f"Error: {model_save_path} not found. Please ensure the model was trained and saved.")
    raise

# Pick one sample from the held-out test set (X_test / y_test come from the
# training cell). The position is zero-based, so 5 selects the 6th row.
# A single constant keeps the features and the actual value aligned.
SAMPLE_POSITION = 5
sample_features = X_test.iloc[[SAMPLE_POSITION]]  # double brackets keep a one-row DataFrame
actual_value = y_test.iloc[SAMPLE_POSITION]

# Make a prediction; predict() returns an array with one value per input row
predicted_quantity = loaded_model.predict(sample_features)

print("\n--- Final Model Prediction on a Sample ---")
print(f"Predicted Production Quantity: {predicted_quantity[0]:.2f}")
print(f"Actual Production Quantity:    {actual_value:.2f}")
print(f"Difference (Absolute Error):   {abs(predicted_quantity[0] - actual_value):.2f}")

Trained model loaded successfully for final testing.

--- Final Model Prediction on a Sample ---
Predicted Production Quantity: 2264.51
Actual Production Quantity:    2802.00
Difference (Absolute Error):   537.49
