In [1]:
import pandas as pd
import numpy as np
import mlflow
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
df = pd.read_csv('Data/crop_yield.csv')
df.head()

Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,North,Sandy,Soybean,986.866331,16.64419,False,True,Rainy,146,6.517573
4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Region                  1000000 non-null  object 
 1   Soil_Type               1000000 non-null  object 
 2   Crop                    1000000 non-null  object 
 3   Rainfall_mm             1000000 non-null  float64
 4   Temperature_Celsius     1000000 non-null  float64
 5   Fertilizer_Used         1000000 non-null  bool   
 6   Irrigation_Used         1000000 non-null  bool   
 7   Weather_Condition       1000000 non-null  object 
 8   Days_to_Harvest         1000000 non-null  int64  
 9   Yield_tons_per_hectare  1000000 non-null  float64
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 62.9+ MB


In [5]:
X = df.drop('Yield_tons_per_hectare', axis=1)
y = df['Yield_tons_per_hectare']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [7]:
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [8]:
mlflow.set_experiment('Crop Yield Pred')
# mlflow.sklearn.autolog()

<Experiment: artifact_location='file:///d:/Vegeta/Projects/ML%20projects/Crop%20yield%20pred/mlruns/951545069533513088', creation_time=1728662699529, experiment_id='951545069533513088', last_update_time=1728662699529, lifecycle_stage='active', name='Crop Yield Pred', tags={}>

In [9]:
def get_preprocessor( numeric_features, categorical_features, scaler, encoder):
    
    numeric_transformer = Pipeline(steps=[
        ('scaler', scaler)
    ])

    categorical_transformer = Pipeline(steps=[
        ('encoder', encoder)
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    return preprocessor


In [10]:
def get_trained_model(model, preprocessor, X_train, y_train):
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('model', model)
    ])
    
    model_pipeline.fit(X_train, y_train)
    
    return model_pipeline

def evaluate_model(pipe, X_test, y_test):
    y_pred = pipe.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return mse, mae, r2

In [11]:
def run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, run_name):
    with mlflow.start_run(run_name=run_name, log_system_metrics=True) as run:
        
        preprocessor = get_preprocessor(numeric_features, categorical_features, scaler, encoder)
        
        model_pipeline = get_trained_model(model, preprocessor, X_train, y_train)
        
        mse, mae, r2 = evaluate_model(model_pipeline, X_test, y_test)
        
        print('MSE: ', mse)
        print('MAE: ', mae)
        print('R2: ', r2)
        
        mlflow.log_params({
            'model': model.__class__.__name__,
            'scaler': scaler.__class__.__name__,
            'encoder': encoder.__class__.__name__,
            
        })
        
        mlflow.log_params(model.get_params())
        
        mlflow.log_metrics({
            'mse': mse,
            'mae': mae,
            'r2': r2
        })
        
        mlflow.sklearn.log_model(model_pipeline, 'model')
        
        mlflow.end_run()
        
        return run.info.run_id, model_pipeline

In [12]:
from sklearn.linear_model import LinearRegression

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = LinearRegression()

run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'Linear Regression Std Scaler OHE')


2024/10/12 14:29:38 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.1797099256765398
MAE:  0.8869317540171922
R2:  0.5907971884536327


2024/10/12 14:30:03 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 14:30:03 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [13]:
from sklearn.tree import DecisionTreeRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = DecisionTreeRegressor(random_state=42)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'Decision Tree Std Scaler OHE')


2024/10/12 14:30:03 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  2.4248933619620496
MAE:  1.259562242972337
R2:  0.15888375623698758


2024/10/12 14:30:49 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 14:30:49 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [None]:
from sklearn.svm import SVR

scaler =  StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = SVR(max_iter=1000)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'SVR Std Scaler OHE')

In [14]:
from sklearn.ensemble import RandomForestRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = RandomForestRegressor(random_state=42, n_estimators=50)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'Random Forest Std Scaler OHE')

2024/10/12 14:30:49 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.2703895369468479
MAE:  0.9202575268920458
R2:  0.5593433953862716


2024/10/12 14:54:53 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 14:54:53 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [17]:
from sklearn.ensemble import GradientBoostingRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = GradientBoostingRegressor(random_state=42, n_iter_no_change=5 )
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'Gradient Boosting Std Scaler OHE')

2024/10/12 15:01:51 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.1874534136996333
MAE:  0.8899772080536943
R2:  0.5881112255730472


2024/10/12 15:13:37 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 15:13:37 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [18]:
from sklearn.ensemble import AdaBoostRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = AdaBoostRegressor(random_state=42)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'AdaBoost Std Scaler OHE')


2024/10/12 15:18:18 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.2456484316214975
MAE:  0.9116664871006566
R2:  0.5679252761007975


2024/10/12 15:21:56 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 15:21:56 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [25]:
from sklearn.linear_model import ARDRegression

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = ARDRegression(max_iter=1000)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'ARDRegression Std Scaler OHE')

2024/10/12 15:26:28 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.17968007763191
MAE:  0.8869193512541889
R2:  0.5908075417646592


2024/10/12 15:26:34 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 15:26:34 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [26]:
from sklearn.linear_model import BayesianRidge

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = BayesianRidge(max_iter=1000)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'BayesianRidge Std Scaler OHE')

2024/10/12 15:26:40 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.1797098751974235
MAE:  0.8869317378409378
R2:  0.5907972059631881


2024/10/12 15:26:47 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 15:26:47 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [30]:
from sklearn.linear_model import HuberRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='error', drop='if_binary')
model = HuberRegressor(max_iter=1000)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'HuberRegressor Std Scaler OHE')

2024/10/12 15:28:38 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.1797202434185972
MAE:  0.8869361019973662
R2:  0.5907936095661732


2024/10/12 15:28:47 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 15:28:47 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [28]:
from sklearn.linear_model import Lasso

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = Lasso(max_iter=1000)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'Lasso Std Scaler OHE')

2024/10/12 15:27:42 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  2.8829519189029607
MAE:  1.3896345286340903
R2:  -1.7844145416834323e-06


2024/10/12 15:27:48 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 15:27:48 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [31]:
from sklearn.linear_model import Ridge

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = Ridge(max_iter=1000)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'Ridge Std Scaler OHE')

2024/10/12 15:58:21 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.1797099204156143
MAE:  0.8869317523386885
R2:  0.5907971902784758


2024/10/12 15:58:30 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 15:58:30 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [32]:
from sklearn.neural_network import MLPRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = MLPRegressor(max_iter=1000, random_state=42, early_stopping=True, n_iter_no_change=5, hidden_layer_sizes=(100, 50))
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'MLPRegressor Std Scaler OHE')


2024/10/12 15:59:41 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.1815895619274002
MAE:  0.8878108069759338
R2:  0.590145204078664


2024/10/12 16:06:04 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 16:06:04 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [34]:
from xgboost import XGBRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = XGBRegressor(random_state=42, n_estimators=50)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'XGBRegressor Std Scaler OHE')

2024/10/12 16:19:29 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.1851684574186658
MAE:  0.8890296746746182
R2:  0.588903802217595


2024/10/12 16:19:45 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 16:19:45 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [36]:
from lightgbm import LGBMRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = LGBMRegressor(random_state=42)
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'LGBMRegressor Std Scaler OHE')

2024/10/12 16:20:08 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 14
[LightGBM] [Info] Start training from score 4.649019
MSE:  1.1822725608210551
MAE:  0.8879077126369004
R2:  0.5899082940879251


2024/10/12 16:20:23 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 16:20:23 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [37]:
from sklearn.neural_network import MLPRegressor

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
model = MLPRegressor(max_iter=1000, random_state=42, early_stopping=True, n_iter_no_change=5, hidden_layer_sizes=(100, 200, 50))
run_id, model_pipeline = run_experiment(model, scaler, encoder, X_train, y_train, X_test, y_test, 'MLPRegressor 100-200-50 Std Scaler OHE')


2024/10/12 16:21:03 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


MSE:  1.181920571120471
MAE:  0.8877847194370275
R2:  0.5900303878094234


2024/10/12 16:27:52 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/12 16:27:52 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
