In [1]:
import pandas as pd
import numpy as np
import mlflow
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Read the crop-yield dataset from the Data folder (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='Data/crop_yield.csv')

# Preview the first five rows as the cell's displayed value.
df.head(n=5)

Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,North,Sandy,Soybean,986.866331,16.64419,False,True,Rainy,146,6.517573
4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251


In [4]:
# Print a summary of the loaded frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Region                  1000000 non-null  object 
 1   Soil_Type               1000000 non-null  object 
 2   Crop                    1000000 non-null  object 
 3   Rainfall_mm             1000000 non-null  float64
 4   Temperature_Celsius     1000000 non-null  float64
 5   Fertilizer_Used         1000000 non-null  bool   
 6   Irrigation_Used         1000000 non-null  bool   
 7   Weather_Condition       1000000 non-null  object 
 8   Days_to_Harvest         1000000 non-null  int64  
 9   Yield_tons_per_hectare  1000000 non-null  float64
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 62.9+ MB


In [5]:
# Separate the regression target from the predictor columns.
y = df['Yield_tons_per_hectare']
X = df.drop(columns=['Yield_tons_per_hectare'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [15]:
# Cast boolean flag columns to integers so that dtype-based selection below
# treats them as numeric features.
#
# The train/test split is created before this cell, so casting X alone would
# leave X_train / X_test with their original bool columns while
# numeric_features claims they are numeric. The same cast is therefore applied
# to the full frame and to both splits. astype() returns new frames, so no
# frame is mutated in place.
bool_features = X.select_dtypes(include=['bool']).columns
bool_to_int = {column: 'int' for column in bool_features}

X = X.astype(bool_to_int)
X_train = X_train.astype(bool_to_int)
X_test = X_test.astype(bool_to_int)

# Column groups consumed by the preprocessing ColumnTransformer.
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object']).columns
print('Numeric features:', numeric_features)
print('Categorical features:', categorical_features)


Numeric features: Index(['Rainfall_mm', 'Temperature_Celsius', 'Fertilizer_Used',
       'Irrigation_Used', 'Days_to_Harvest'],
      dtype='object')
Categorical features: Index(['Region', 'Soil_Type', 'Crop', 'Weather_Condition'], dtype='object')


In [18]:
# Select the MLflow experiment that the runs in this notebook are recorded under
# (MLflow creates it if it does not exist yet, as the log output below shows).
mlflow.set_experiment('Crop Yield Prediction')
# Autologging is left disabled: run_experiment() logs params, metrics and the model explicitly.
# mlflow.sklearn.autolog()

2024/10/12 21:07:43 INFO mlflow.tracking.fluent: Experiment with name 'Crop Yield Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///d:/Vegeta/Projects/ML%20projects/Crop%20yield%20pred/mlruns/449104772617806357', creation_time=1728747463745, experiment_id='449104772617806357', last_update_time=1728747463745, lifecycle_stage='active', name='Crop Yield Prediction', tags={}>

In [19]:
def get_preprocessor(numeric_features, categorical_features, scaler, encoder):
    """Build the column-wise preprocessing step for a model pipeline.

    Args:
        numeric_features: Column selection routed to ``scaler``.
        categorical_features: Column selection routed to ``encoder``.
        scaler: Transformer applied to the numeric columns.
        encoder: Transformer applied to the categorical columns.

    Returns:
        An unfitted ``ColumnTransformer`` with two branches, ``'num'`` and
        ``'cat'``, each wrapping its transformer in a single-step ``Pipeline``.
        No ``remainder`` is passed, so the ColumnTransformer default applies
        to columns that appear in neither selection.
    """
    column_branches = [
        ('num', Pipeline(steps=[('scaler', scaler)]), numeric_features),
        ('cat', Pipeline(steps=[('encoder', encoder)]), categorical_features),
    ]
    return ColumnTransformer(transformers=column_branches)


In [20]:
def get_trained_model(model, preprocessor, X_train, y_train, n_components=0.95):
    """Assemble the preprocessing -> PCA -> model pipeline and fit it.

    Args:
        model: Estimator placed as the final ``'model'`` step.
        preprocessor: Transformer placed as the first ``'preprocessor'`` step.
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training target passed to ``Pipeline.fit``.
        n_components: Forwarded to ``PCA(n_components=...)``. Defaults to
            ``0.95``, the value that was previously hard-coded, so existing
            callers get the same pipeline as before.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'``, ``'pca'`` and
        ``'model'``.
    """
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        # Dimensionality reduction between preprocessing and the estimator.
        ('pca', PCA(n_components=n_components)),
        ('model', model)
    ])

    model_pipeline.fit(X_train, y_train)

    return model_pipeline

def evaluate_model(pipe, X_test, y_test):
    """Score a fitted pipeline on held-out data.

    Args:
        pipe: Object exposing ``predict``; called once with ``X_test``.
        X_test: Features to predict on.
        y_test: True target values compared against the predictions.

    Returns:
        A tuple ``(mse, mae, r2)`` computed with ``mean_squared_error``,
        ``mean_absolute_error`` and ``r2_score`` respectively.
    """
    predictions = pipe.predict(X_test)
    metric_functions = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(metric(y_test, predictions) for metric in metric_functions)

In [21]:
def run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, run_name):
    """Train, evaluate and log one model configuration as a single MLflow run.

    The column groups are not parameters: this function reads the
    module-level ``numeric_features`` and ``categorical_features``, so the
    cell defining them must have been executed first.

    Args:
        model: Estimator to train; must expose ``get_params()``.
        scaler: Transformer for the numeric columns.
        encoder: Transformer for the categorical columns.
        X_train, y_train: Data the pipeline is fitted on.
        X_test, y_test: Data the fitted pipeline is scored on.
        run_name: Name given to the MLflow run.

    Returns:
        ``(run_id, model_pipeline)``: the MLflow run id and the fitted pipeline.
    """
    with mlflow.start_run(run_name=run_name, log_system_metrics=True) as run:

        preprocessor = get_preprocessor(numeric_features, categorical_features, scaler, encoder)

        model_pipeline = get_trained_model(model, preprocessor, X_train, y_train)

        mse, mae, r2 = evaluate_model(model_pipeline, X_test, y_test)

        print('MSE: ', mse)
        print('MAE: ', mae)
        print('R2: ', r2)

        # Record which components were used, by class name.
        mlflow.log_params({
            'model': model.__class__.__name__,
            'scaler': scaler.__class__.__name__,
            'encoder': encoder.__class__.__name__,
        })

        # Record the estimator's own hyperparameters alongside the component names.
        mlflow.log_params(model.get_params())

        mlflow.log_metrics({
            'mse': mse,
            'mae': mae,
            'r2': r2
        })

        mlflow.sklearn.log_model(model_pipeline, 'model')

    # No explicit mlflow.end_run(): leaving the `with` block closes the run.
    return run.info.run_id, model_pipeline

In [22]:
from sklearn.linear_model import LinearRegression

# Linear regression with standardized numerics and one-hot encoded categoricals.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = LinearRegression()

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='Linear Regression Std Scaler OHE',
)


2024/10/12 21:07:53 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.2507778567822869
MAE:  0.39955657851757254
R2:  0.9130133587627195


2024/10/12 21:08:07 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 21:08:07 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [23]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree, seeded for reproducibility, with the same preprocessing setup.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = DecisionTreeRegressor(random_state=42)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='Decision Tree Std Scaler OHE',
)


2024/10/12 21:08:10 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.5281797551154109
MAE:  0.5801357313881056
R2:  0.8167917077826142


2024/10/12 21:09:02 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 21:09:02 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [24]:
from sklearn.svm import SVR

# Support vector regression; the solver is capped at 1000 iterations.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = SVR(max_iter=1000)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='SVR Std Scaler OHE',
)

2024/10/12 21:09:14 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.8565650249822847
MAE:  0.765128808902882
R2:  0.7028855917322001


2024/10/12 21:11:49 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 21:11:49 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [25]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 50 trees, seeded for reproducibility.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = RandomForestRegressor(n_estimators=50, random_state=42)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='Random Forest Std Scaler OHE',
)

2024/10/12 21:11:50 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.2692433867989182
MAE:  0.4139207056914438
R2:  0.9066082699904384


2024/10/12 21:47:46 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 21:47:46 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [26]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with n_iter_no_change=5 set, seeded for reproducibility.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = GradientBoostingRegressor(n_iter_no_change=5, random_state=42)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='Gradient Boosting Std Scaler OHE',
)

2024/10/12 21:47:47 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.26991574649034444
MAE:  0.4144749686260899
R2:  0.9063750503911845


2024/10/12 22:14:02 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 22:14:02 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [27]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with library-default settings apart from the fixed seed.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = AdaBoostRegressor(random_state=42)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='AdaBoost Std Scaler OHE',
)


2024/10/12 22:14:03 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.3519805757142988
MAE:  0.4730805039290763
R2:  0.8779094436207265


2024/10/12 22:23:29 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 22:23:29 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [28]:
from sklearn.linear_model import HuberRegressor

# Huber regression. NOTE: unlike the other runs in this notebook, the encoder here
# uses drop='if_binary' and handle_unknown='error' rather than drop='first' / 'ignore'.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='if_binary', handle_unknown='error')
model = HuberRegressor(max_iter=1000)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='HuberRegressor Std Scaler OHE',
)

2024/10/12 22:23:29 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.2507810141452784
MAE:  0.3995563053210456
R2:  0.9130122635767037


2024/10/12 22:23:46 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 22:23:46 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [29]:
from sklearn.neural_network import MLPRegressor

# Two-hidden-layer MLP (100 and 50 units) with early stopping enabled.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    max_iter=1000,
    early_stopping=True,
    n_iter_no_change=5,
    random_state=42,
)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='MLPRegressor Std Scaler OHE',
)


2024/10/12 22:23:46 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.2520021517955861
MAE:  0.40053497381459774
R2:  0.9125886908416482


2024/10/12 22:30:42 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 22:30:42 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [30]:
from xgboost import XGBRegressor

# XGBoost regressor with 50 estimators, seeded for reproducibility.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = XGBRegressor(n_estimators=50, random_state=42)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='XGBRegressor Std Scaler OHE',
)

2024/10/12 22:30:43 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  0.2646346994174059
MAE:  0.410474859532941
R2:  0.9082068729969965


2024/10/12 22:30:59 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 22:31:00 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [31]:
from lightgbm import LGBMRegressor

# LightGBM regressor with library-default settings apart from the fixed seed.
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
model = LGBMRegressor(random_state=42)

run_id, model_pipeline = run_experiment(
    model=model,
    scaler=scaler,
    encoder=encoder,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name='LGBMRegressor Std Scaler OHE',
)

2024/10/12 22:31:02 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.119423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 16
[LightGBM] [Info] Start training from score 4.649019
MSE:  0.25795331513791236
MAE:  0.4053661653648108
R2:  0.9105244268063557


2024/10/12 22:31:23 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 22:31:23 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
