In [20]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('data/global_wf_175_crops_average_2010_2019.csv', skiprows=3)
print(df.head())
# Drop rows with missing values (or use df.fillna() if you prefer)
df = df.dropna()

# Features and target
X = df.drop(columns=['wf_tot_m3_t'])
y = np.log1p(df['wf_tot_m3_t'])

# Identify categorical and numerical features
categorical_features = ['crop_name', 'crop_group']
numerical_features = ['production_t', 'wfg_m3_t', 'wfb_cr_m3_t', 'wfb_i_m3_t']

# Preprocessing: encode categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # Keep numerical features as is
)

# Pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R^2 Score:", r2_score(y_test, y_pred))

joblib.dump(model, 'models/water_footprint_predictor.pkl')
print("Model saved as water_footprint_predictor.pkl")

   crop_code     crop_name crop_group  production_t     wfg_m3_t  wfb_cr_m3_t  \
0         56  Maize (corn)    Cereals  1.024541e+09   657.166427     7.484206   
1        236    Soya beans  Oil crops  3.038541e+08  1548.750728    30.277155   
2         15         Wheat    Cereals  7.255114e+08   829.653601    29.621919   
3         27          Rice    Cereals  7.348570e+08   580.321894    57.548323   
4         83       Sorghum    Cereals  6.235406e+07  2583.405432     6.080188   

   wfb_i_m3_t  wf_tot_m3_t  
0   49.916426   714.567059  
1   21.518670  1600.546553  
2  154.042699  1013.318220  
3  308.904081   946.774298  
4  148.205587  2737.691206  
Mean Squared Error: 0.15576925170473016
R^2 Score: 0.9503771186678883
Model saved as water_footprint_predictor.pkl


In [15]:
df['crop_group'].unique()

array(['Cereals', 'Oil crops', 'Fibres', 'Roots', 'Pulses', 'Sugar crops',
       'Vegetables', 'Fruits', 'Stimulants', 'Others', 'Nuts', 'Spices',
       'Fodder crops'], dtype=object)