In [2]:
import pandas as pd
import numpy as np
data = pd.read_csv("GRAPE_QUALITY.csv")

In [3]:
data.dtypes

sample_id                  int64
variety                   object
region                    object
quality_score            float64
quality_category          object
sugar_content_brix       float64
acidity_ph               float64
cluster_weight_g         float64
berry_size_mm            float64
harvest_date              object
sun_exposure_hours       float64
soil_moisture_percent    float64
rainfall_mm              float64
dtype: object

In [4]:
data.isnull().sum()

sample_id                0
variety                  0
region                   0
quality_score            0
quality_category         0
sugar_content_brix       0
acidity_ph               0
cluster_weight_g         0
berry_size_mm            0
harvest_date             0
sun_exposure_hours       0
soil_moisture_percent    0
rainfall_mm              0
dtype: int64

In [5]:
# Dropping non-predictive columns (e.g., 'sample_id', 'harvest_date', 'quality_category')
data_cleaned = data.drop(columns=['sample_id', 'harvest_date', 'quality_category'])



In [6]:
from pycaret.regression import *

# Initialize PyCaret setup
regression_setup = setup(
    data=data_cleaned,
    target='quality_score',             # Target variable
    session_id=123,                     # For reproducibility
    normalize=True,                     # Normalize numeric features
    categorical_features=['variety', 'region']  # Specify categorical columns
)




Unnamed: 0,Description,Value
0,Session id,123
1,Target,quality_score
2,Target type,Regression
3,Original data shape,"(1000, 10)"
4,Transformed data shape,"(1000, 24)"
5,Transformed train set shape,"(700, 24)"
6,Transformed test set shape,"(300, 24)"
7,Numeric features,7
8,Categorical features,2
9,Preprocess,True


In [7]:
best_model = compare_models(sort='RMSE')


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.0465,0.0036,0.0595,0.987,0.0181,0.0201,0.108
lightgbm,Light Gradient Boosting Machine,0.0474,0.0038,0.0612,0.9863,0.0187,0.0206,0.168
lar,Least Angle Regression,0.0522,0.0042,0.0646,0.9845,0.0188,0.0219,0.035
br,Bayesian Ridge,0.0522,0.0042,0.0646,0.9845,0.0188,0.0219,0.029
ridge,Ridge Regression,0.0522,0.0042,0.0646,0.9845,0.0188,0.0219,0.033
huber,Huber Regressor,0.0523,0.0042,0.065,0.9843,0.0188,0.0219,0.037
lr,Linear Regression,0.0529,0.0043,0.0654,0.984,0.019,0.0222,1.207
par,Passive Aggressive Regressor,0.0545,0.0045,0.067,0.9833,0.0196,0.0229,0.033
et,Extra Trees Regressor,0.0623,0.0065,0.0801,0.9766,0.0246,0.0273,0.203
rf,Random Forest Regressor,0.0747,0.0089,0.0939,0.9679,0.0286,0.0326,0.218


In [21]:
tuned_model = tune_model(best_model)

type(tuned_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0555,0.0052,0.0723,0.985,0.0216,0.0235
1,0.0683,0.0072,0.0846,0.9739,0.0245,0.028
2,0.0571,0.0053,0.0731,0.9808,0.0212,0.0235
3,0.0687,0.0069,0.0829,0.9738,0.0256,0.0309
4,0.063,0.0061,0.0781,0.9796,0.0222,0.0259
5,0.0719,0.0075,0.0865,0.975,0.0257,0.0299
6,0.0468,0.0038,0.0616,0.9842,0.0193,0.0209
7,0.063,0.0063,0.0795,0.9699,0.0231,0.0263
8,0.0517,0.0043,0.0654,0.9811,0.0188,0.0217
9,0.0717,0.009,0.0947,0.9736,0.0323,0.0352


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


sklearn.ensemble._gb.GradientBoostingRegressor

In [9]:
evaluate_model(tuned_model)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [10]:
#Make predictions

predictions = predict_model(tuned_model)
print(predictions)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.0444,0.0033,0.0577,0.9877,0.0167,0.0183


                variety          region  sugar_content_brix  acidity_ph  \
131     Sauvignon Blanc          Sonoma           19.920000        2.82   
203  Cabernet Sauvignon          Sonoma           16.469999        2.86   
50             Riesling          Sonoma           27.490000        3.44   
585            Riesling          Sonoma           12.730000        3.67   
138            Riesling         Tuscany           19.610001        3.06   
..                  ...             ...                 ...         ...   
556           Zinfandel          Sonoma           22.219999        3.53   
598          Pinot Noir           Rioja           11.710000        3.20   
458              Merlot  Barossa Valley           14.310000        3.48   
586               Syrah     Napa Valley           23.900000        2.90   
178  Cabernet Sauvignon    Loire Valley           25.219999        3.77   

     cluster_weight_g  berry_size_mm  sun_exposure_hours  \
131        296.899994      13.270000   

In [11]:
# Prepare new data in the same format as the training data and use
#Define the new_data

import pandas as pd

# Example of new data (replace with actual data)
new_data = pd.DataFrame({
    'sugar_content_brix': [22.5, 24.3],
    'acidity_ph': [3.5, 3.7],
    'cluster_weight_g': [120, 140],
    'berry_size_mm': [15, 18],
    'sun_exposure_hours': [8, 10],
    'soil_moisture_percent': [25, 30],
    'rainfall_mm': [200, 180],
    'variety': ['Cabernet Sauvignon', 'Merlot'],
    'region': ['Napa Valley', 'Sonoma']
})

new_data_predictions = predict_model(tuned_model, data=new_data)
print(new_data_predictions)


   sugar_content_brix  acidity_ph  cluster_weight_g  berry_size_mm  \
0           22.500000         3.5               120             15   
1           24.299999         3.7               140             18   

   sun_exposure_hours  soil_moisture_percent  rainfall_mm             variety  \
0                   8                     25          200  Cabernet Sauvignon   
1                  10                     30          180              Merlot   

        region  prediction_label  
0  Napa Valley          2.416202  
1       Sonoma          2.977378  


In [22]:
import joblib
joblib.dump(tuned_model, 'best_grape_quality_model.pkl')


['best_grape_quality_model.pkl']

GradientBoostingRegressor(random_state=123)


In [15]:
from sklearn.ensemble import GradientBoostingRegressor
import joblib

model = GradientBoostingRegressor(random_state=123)
# Fit your model
# model.fit(X_train, y_train)

# Save the model
joblib.dump(tuned_model, 'gradient_boosting_model.pkl')


['gradient_boosting_model.pkl']

In [16]:
print(type(tuned_model))

<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>


In [18]:

import sklearn
print(sklearn.__version__)



1.4.2


In [19]:
import joblib
print(joblib.__version__)


1.3.2


In [20]:
from sklearn.ensemble import GradientBoostingRegressor
import joblib

# Training and saving the model
tuned_model = GradientBoostingRegressor(random_state=123)
# model.fit(X_train, y_train)
joblib.dump(tuned_model, 'tuned_gradient_boosting_model.pkl')

# Loading the model
tuned_model = joblib.load('tuned_gradient_boosting_model.pkl')
print(type(tuned_model))  # Should output <class 'sklearn.ensemble._gb.GradientBoostingRegressor'>


<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>


In [23]:
tuned_model.feature_names_in_


array(['variety_Cabernet Sauvignon', 'variety_Merlot', 'variety_Riesling',
       'variety_Syrah', 'variety_Pinot Noir', 'variety_Zinfandel',
       'variety_Chardonnay', 'variety_Sauvignon Blanc', 'region_Bordeaux',
       'region_Loire Valley', 'region_Barossa Valley', 'region_Tuscany',
       'region_Mendoza', 'region_Rioja', 'region_Sonoma',
       'region_Napa Valley', 'sugar_content_brix', 'acidity_ph',
       'cluster_weight_g', 'berry_size_mm', 'sun_exposure_hours',
       'soil_moisture_percent', 'rainfall_mm'], dtype=object)