In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
import joblib

In [2]:
# Load the dataset
df = pd.read_csv('seasonal_coconut_yield_data_4.csv')
df.head(2)

Unnamed: 0,Date,Soil Moisture (10 cm) (%),Soil Moisture (20 cm) (%),Soil Moisture (30 cm) (%),Plant Age (years),Temperature (°C),Humidity (%),Rainfall (mm),Rain Status (0/1),Coconut Count
0,1930-05-31,19.176048,25.028576,26.2835,5,28.567321,70.670239,11.748676,1,329
1,1930-06-30,26.159216,45.926485,44.534909,4,27.697496,61.141737,8.014426,1,298


In [3]:
X = df.drop(['Date','Rain Status (0/1)','Coconut Count'], axis=1) 
y = df['Coconut Count']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Initialize models
models = {
   'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
   'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
   'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.1),
   'ANN': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
   'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

In [6]:
from sklearn.metrics import mean_squared_error

def calculate_accuracy(y_true, y_pred):
   mse = mean_squared_error(y_true, y_pred)
   rmse = np.sqrt(mse)
   accuracy = 100 * (1 - rmse / np.mean(y_true))
   return max(0, accuracy)

In [7]:
results = {}
best_score = float('-inf')
best_model = None

for name, model in models.items():
   model.fit(X_train, y_train)
   y_pred = model.predict(X_test)
   
   accuracy = calculate_accuracy(y_test, y_pred)
   r2 = r2_score(y_test, y_pred) * 100  # Convert to percentage
   cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
   cv_percentage = cv_scores.mean() * 100
   
   results[name] = {
       'Accuracy': accuracy,
       'R2_Percentage': r2,
       'CV_Accuracy': cv_percentage,
       'CV_std': cv_scores.std() * 100
   }
   
   if cv_scores.mean() > best_score:
       best_score = cv_scores.mean()
       best_model = model

In [8]:
# Print results
print("Model Comparison Results:")
for name, metrics in results.items():
    print(f"\n{name}:")
    print(f"Accuracy: {metrics['Accuracy']:.2f}%")
    print(f"R2 Score: {metrics['R2_Percentage']:.2f}%")
    print(f"Cross-val R2: {metrics['CV_Accuracy']:.2f}% (+/- {metrics['CV_std']*2:.2f}%)")

print("\nBest Model Selected:", 
      [name for name, model in models.items() if model == best_model][0])

Model Comparison Results:

Random Forest:
Accuracy: 88.24%
R2 Score: -2.66%
Cross-val R2: -6.17% (+/- 12.32%)

XGBoost:
Accuracy: 87.24%
R2 Score: -20.83%
Cross-val R2: -25.20% (+/- 26.09%)

SVR:
Accuracy: 88.36%
R2 Score: -0.50%
Cross-val R2: -0.47% (+/- 0.33%)

ANN:
Accuracy: 87.02%
R2 Score: -25.12%
Cross-val R2: -24.22% (+/- 12.67%)

Gradient Boosting:
Accuracy: 87.97%
R2 Score: -7.42%
Cross-val R2: -7.84% (+/- 7.34%)

Best Model Selected: SVR


In [9]:
# Save the model to a file
joblib.dump(best_model, 'best_yield_predictor.pkl')

['best_yield_predictor.pkl']

In [12]:
# Load the model from the file
loaded_model = joblib.load('best_yield_predictor.pkl')

# Example new input data (as a DataFrame)
new_data = pd.DataFrame({
    'Soil Moisture (10 cm) (%)': [19.89],
    'Soil Moisture (20 cm) (%)': [41.67],
    'Soil Moisture (30 cm) (%)': [34.82],
    'Plant Age (years)':[3],
    'Temperature (°C)':[32.68],
    'Humidity (%)':[60.87],
    'Rainfall (mm)':[0],  # Corrected feature name
})

# Predict the oil yield for new data
predicted_yield = loaded_model.predict(new_data)
print("\nPredicted Coconut Yield:", predicted_yield[0])


Predicted Coconut Yield: 298.6559923103624
