In [1]:
!pip install kaggle
!kaggle datasets download mrmars1010/grape-quality -p /home/ubuntu/machine-learning-zoomcamp/MyMachineLearningRepo/Capstone1/data --unzip
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np
import joblib
import pickle

# Load the grape-quality table from the local data folder.
data = pd.read_csv("../data/GRAPE_QUALITY.csv")

# Drop the sample_id column, then separate the features (X) from the
# quality_category target (y).
data = data.drop('sample_id', axis = 1)
X = data.drop('quality_category', axis = 1)
y = data['quality_category']

# Split data into train/val/test in the ratio 60/20/20:
# first hold out 40% of the rows (test_size=0.4 leaves 60% for training),
# then cut that hold-out in half to get 20% validation and 20% test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=25)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=25)

# Data preprocessing -- list the categorical and numerical feature columns so
# they can be inspected before the transformers are configured.
#
# select_dtypes(include='number') matches every numeric width (int32, float32, ...),
# whereas comparing a dtype against the 'int' / 'float' aliases only matches the
# platform-default integer and float types and would silently skip the others.
cat_column = X_train.select_dtypes(include='object').columns.tolist()
num_column = X_train.select_dtypes(include='number').columns.tolist()

print('Categorical Columns: ', cat_column)
print('Numerical Columns: ', num_column)



# Feature groups that are routed through the preprocessing branches below.
# harvest_date and quality_score exist in X_train but are intentionally not
# listed in either group; presumably ColumnTransformer's default remainder
# handling drops them -- confirm against the scikit-learn documentation.
categorical_features = ['region', 'variety']
numerical_features = [
    'sugar_content_brix',
    'acidity_ph',
    'cluster_weight_g',
    'berry_size_mm',
    'sun_exposure_hours',
    'soil_moisture_percent',
    'rainfall_mm',
]

# Categorical branch: fill missing values with the mode, then one-hot encode
# (handle_unknown='ignore' is passed so unknown categories are ignored).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Combine both branches, each applied to its own group of columns.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
])

# Hyper-parameters for the gradient-boosting classifier, gathered in one
# mapping so they can be read (and tuned) at a glance.
gbc_params = {
    'n_estimators': 500,
    'min_samples_split': 5,
    'min_samples_leaf': 5,
    'learning_rate': 0.5,
    'random_state': 2,
}
gbc = GradientBoostingClassifier(**gbc_params)

# End-to-end estimator: the preprocessing step runs first, then the classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', gbc),
])


# Train the preprocessing + classifier pipeline on the training split.
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to the notebook's working directory so the
# prediction cell can reload it without retraining.
model_path = 'gradient_boosting_pipeline.pkl'
joblib.dump(pipeline, model_path)

print("Model saved successfully!")


Dataset URL: https://www.kaggle.com/datasets/mrmars1010/grape-quality
License(s): apache-2.0
Downloading grape-quality.zip to /home/ubuntu/machine-learning-zoomcamp/MyMachineLearningRepo/Capstone1/data
  0%|                                               | 0.00/26.8k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 26.8k/26.8k [00:00<00:00, 18.4MB/s]
Categorical Columns:  ['variety', 'region', 'harvest_date']
Numerical Columns:  ['quality_score', 'sugar_content_brix', 'acidity_ph', 'cluster_weight_g', 'berry_size_mm', 'sun_exposure_hours', 'soil_moisture_percent', 'rainfall_mm']
Model saved successfully!


In [2]:
#load the model and make prediction on sample data
import joblib

# Restore the fitted pipeline from the file written by the training cell.
loaded_pipeline = joblib.load('gradient_boosting_pipeline.pkl')

print("Pipeline loaded successfully!")

# A single hand-written sample, one value per column.
# harvest_date and quality_score only carry None placeholders: the column
# lists given to the preprocessor in the training cell do not include them.
sample_features = {
    'region': ['Napa Valley'],
    'variety': ['Cabernet Sauvignon'],
    'sugar_content_brix': [22],
    'acidity_ph': [3.4],
    'cluster_weight_g': [150],
    'berry_size_mm': [8.5],
    'sun_exposure_hours': [180],
    'soil_moisture_percent': [20],
    'rainfall_mm': [120],
    'harvest_date': [None],  # Placeholder value
    'quality_score': [None],  # Placeholder value
}
new_data = pd.DataFrame(sample_features)

# Run the sample through the loaded pipeline and report the predicted class.
prediction = loaded_pipeline.predict(new_data)
print(f"Predicted wine quality: {prediction[0]}")


Pipeline loaded successfully!
Predicted wine quality: High
