In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
pd.set_option('display.max_columns', 10)
data = pd.read_csv('fin_int.csv')
data

Unnamed: 0,Year,District,Latitude,Longitude,Precipitation(mm),...,Crop_Millet,Crop_Paddy,Crop_Wheat,Yield(kg/ha),Production(mt)
0,1982,Achham,29.11,81.3,665.15,...,0,0,0,800.0,400.0
1,1982,Achham,29.11,81.3,665.15,...,0,0,0,1200.0,3420.0
2,1982,Achham,29.11,81.3,665.15,...,1,0,0,1106.0,940.0
3,1982,Achham,29.11,81.3,665.15,...,0,1,0,1399.0,2700.0
4,1982,Achham,29.11,81.3,665.15,...,0,0,1,1050.0,3360.0
...,...,...,...,...,...,...,...,...,...,...,...
11620,2013,Udayapur,26.90,86.5,1091.43,...,0,0,0,1000.0,35.0
11621,2013,Udayapur,26.90,86.5,1091.43,...,0,0,0,2474.0,23500.0
11622,2013,Udayapur,26.90,86.5,1091.43,...,1,0,0,1938.0,5000.0
11623,2013,Udayapur,26.90,86.5,1091.43,...,0,1,0,3466.0,52850.0


In [3]:
data = data[data['Area(ha)'] != 0]
data = data[data['Yield(kg/ha)'] != 0]
data

Unnamed: 0,Year,District,Latitude,Longitude,Precipitation(mm),...,Crop_Millet,Crop_Paddy,Crop_Wheat,Yield(kg/ha),Production(mt)
0,1982,Achham,29.11,81.3,665.15,...,0,0,0,800.0,400.0
1,1982,Achham,29.11,81.3,665.15,...,0,0,0,1200.0,3420.0
2,1982,Achham,29.11,81.3,665.15,...,1,0,0,1106.0,940.0
3,1982,Achham,29.11,81.3,665.15,...,0,1,0,1399.0,2700.0
4,1982,Achham,29.11,81.3,665.15,...,0,0,1,1050.0,3360.0
...,...,...,...,...,...,...,...,...,...,...,...
11620,2013,Udayapur,26.90,86.5,1091.43,...,0,0,0,1000.0,35.0
11621,2013,Udayapur,26.90,86.5,1091.43,...,0,0,0,2474.0,23500.0
11622,2013,Udayapur,26.90,86.5,1091.43,...,1,0,0,1938.0,5000.0
11623,2013,Udayapur,26.90,86.5,1091.43,...,0,1,0,3466.0,52850.0


In [4]:
data = data.drop('District', axis=1)
data = data.drop('Yield(kg/ha)', axis=1)
# data = data.drop('Area(ha)', axis=1)
data

Unnamed: 0,Year,Latitude,Longitude,Precipitation(mm),Surface_Pressure(kPa),...,Crop_Maize,Crop_Millet,Crop_Paddy,Crop_Wheat,Production(mt)
0,1982,29.11,81.3,665.15,87.577863,...,0,0,0,0,400.0
1,1982,29.11,81.3,665.15,87.577863,...,1,0,0,0,3420.0
2,1982,29.11,81.3,665.15,87.577863,...,0,1,0,0,940.0
3,1982,29.11,81.3,665.15,87.577863,...,0,0,1,0,2700.0
4,1982,29.11,81.3,665.15,87.577863,...,0,0,0,1,3360.0
...,...,...,...,...,...,...,...,...,...,...,...
11620,2013,26.90,86.5,1091.43,94.958986,...,0,0,0,0,35.0
11621,2013,26.90,86.5,1091.43,94.958986,...,1,0,0,0,23500.0
11622,2013,26.90,86.5,1091.43,94.958986,...,0,1,0,0,5000.0
11623,2013,26.90,86.5,1091.43,94.958986,...,0,0,1,0,52850.0


In [5]:
selected_features = ['Year', 'Latitude', 'Longitude', 'Area(ha)', 'Precipitation(mm)', 'Temp_Range(C)',
#                      'Wet_Bulb_Temp(C)',
                     'Relative_Humidity(%)', 'Surface_Pressure(kPa)', 'Specific_Humidity(g/kg)', 'Crop_Barley', 'Crop_Maize', 'Crop_Millet', 'Crop_Paddy', 'Crop_Wheat', 'Production(mt)']
data_selected = data[selected_features].copy()
data_selected

Unnamed: 0,Year,Latitude,Longitude,Area(ha),Precipitation(mm),...,Crop_Maize,Crop_Millet,Crop_Paddy,Crop_Wheat,Production(mt)
0,1982,29.11,81.3,500.0,665.15,...,0,0,0,0,400.0
1,1982,29.11,81.3,2850.0,665.15,...,1,0,0,0,3420.0
2,1982,29.11,81.3,850.0,665.15,...,0,1,0,0,940.0
3,1982,29.11,81.3,1930.0,665.15,...,0,0,1,0,2700.0
4,1982,29.11,81.3,3200.0,665.15,...,0,0,0,1,3360.0
...,...,...,...,...,...,...,...,...,...,...,...
11620,2013,26.90,86.5,35.0,1091.43,...,0,0,0,0,35.0
11621,2013,26.90,86.5,9500.0,1091.43,...,1,0,0,0,23500.0
11622,2013,26.90,86.5,2580.0,1091.43,...,0,1,0,0,5000.0
11623,2013,26.90,86.5,15250.0,1091.43,...,0,0,1,0,52850.0


In [6]:
X = data_selected.drop("Production(mt)", axis=1)
y = data_selected["Production(mt)"]

In [7]:
split_index = int(len(data) * 0.9)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train_scaled, y_train)

In [10]:
y_pred = rf_model.predict(X_test_scaled)

In [11]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error, explained_variance_score

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculate Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root Mean Squared Error (RMSE):", rmse)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

# Calculate R-squared Score (R2 Score)
r2 = r2_score(y_test, y_pred)
print("R-squared Score (R2 Score):", r2)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(y_test, y_pred)
print("Mean Absolute Percentage Error (MAPE):", mape)

# Calculate Explained Variance Score
evs = explained_variance_score(y_test, y_pred)
print("Explained Variance Score:", evs)

Mean Squared Error: 192097020.09901652
Root Mean Squared Error (RMSE): 13859.906929666466
Mean Absolute Error (MAE): 5227.104746614035
R-squared Score (R2 Score): 0.9026670510311811
Mean Absolute Percentage Error (MAPE): 0.4197331110184956
Explained Variance Score: 0.9078499445645871


In [16]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4]
}

In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error, explained_variance_score

tscv = TimeSeriesSplit(n_splits=5)
rf_model = RandomForestRegressor()
grid_search = GridSearchCV(rf_model, param_grid, cv=tscv, verbose=2, scoring="neg_mean_squared_error")
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   6.1s
[CV] END ma

[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=100; total time=   3.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   2.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   3.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   5.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   6.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=300; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=300; total time=   3.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estimators=300; total time=   5.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=8, n_estim

[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=4, n_estimators=300; total time=   0.9s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=4, n_estimators=300; total time=   1.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=4, n_estimators=300; total time=   2.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=4, n_estimators=300; total time=   3.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=4, n_estimators=300; total time=   4.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=8, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=8, n_estimators=100; total time=   1.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=8, n_estimators=100; total time=   1.4s


[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=100; total time=   0.8s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=100; total time=   1.1s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=100; total time=   1.4s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   0.6s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   1.1s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   1.7s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   2.2s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   2.8s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=300; total time=   0.9s
[CV] END max_depth=5, min_samples_leaf=4, min_samples_split=4, n_estimators=300; total time=   1.7s


[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   5.2s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   1.5s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   3.0s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   4.6s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   6.1s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   7.8s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   1.0s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   1.5s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total tim

In [18]:
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 200}


In [20]:
y_pred = best_model.predict(X_test_scaled)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)

print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared Score (R2 Score):", r2)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("Explained Variance Score:", evs)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Root Mean Squared Error (RMSE): 13843.656994214603
Mean Absolute Error (MAE): 5256.752545988143
R-squared Score (R2 Score): 0.9028951516883242
Mean Absolute Percentage Error (MAPE): 0.41880579003577423
Explained Variance Score: 0.9083429621845474
Mean Squared Error: 191646838.9734669
