In [1]:
!pip install kaggle
!kaggle datasets download mrmars1010/grape-quality -p /home/ubuntu/machine-learning-zoomcamp/MyMachineLearningRepo/Capstone1/data --unzip 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib

# Read the raw table and discard the identifier/date columns before any
# feature/target separation.
data = pd.read_csv("../data/GRAPE_QUALITY.csv")
new_data = data.drop(columns=['sample_id', 'harvest_date'])

# The two prediction targets; every remaining column is treated as a feature.
target_columns = ['quality_category', 'quality_score']
y = new_data[target_columns]
X = new_data.drop(columns=target_columns)

# 60/20/20 train/val/test: hold out 40% first, then cut the hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=25
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=25
)

# Data Preprocessing -- split the feature columns into categorical and
# numerical groups so each group can get its own transformer.

# Categorical: columns stored with the object dtype.
cat_column = X_train.select_dtypes(include='object').columns.tolist()

# Numerical: every numeric dtype. Comparing a dtype against the strings
# 'int' / 'float' only matches the platform-default widths, so columns such as
# int32 / float32 (or int64 where 'int' resolves to int32) would be skipped and
# then silently dropped by the ColumnTransformer, whose remainder defaults to
# 'drop'.
num_column = X_train.select_dtypes(include='number').columns.tolist()

print('Categorical Columns:', cat_column)
print('Numerical Columns:', num_column)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen while fitting are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical features: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group to its matching transformer.
column_routes = [
    ('cat', categorical_transformer, cat_column),
    ('num', numerical_transformer, num_column),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Custom GradientBoosting model that handles both classification and regression
class CombinedGradientBoosting:
    """Pair of estimators trained on the same features for two targets.

    One wrapped estimator predicts the ``quality_category`` target and the
    other predicts the ``quality_score`` target.

    Parameters
    ----------
    classification_model : object with ``fit(X, y)`` and ``predict(X)``
        Stored as ``self.classification``.
    regression_model : object with ``fit(X, y)`` and ``predict(X)``
        Stored as ``self.regression``.
    """

    def __init__(self, classification_model, regression_model):
        self.classification = classification_model
        self.regression = regression_model

    def fit(self, X, y):
        """Fit both wrapped models on ``X`` and return ``self``.

        ``y`` must support lookup by column name and provide the
        ``'quality_category'`` and ``'quality_score'`` columns.
        """
        model_targets = (
            (self.classification, 'quality_category'),
            (self.regression, 'quality_score'),
        )
        # Classifier first, then regressor, each on its own target column.
        for model, target_name in model_targets:
            model.fit(X, y[target_name])
        return self

    def predict(self, X):
        """Return a DataFrame holding one prediction column per target."""
        outputs = {}
        outputs['quality_category'] = self.classification.predict(X)
        outputs['quality_score'] = self.regression.predict(X)
        return pd.DataFrame(outputs)


# Both boosters share the same hyperparameters and random seed.
boosting_params = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'random_state': 25,
}

# One booster per target: a classifier and a regressor.
classification_model = GradientBoostingClassifier(**boosting_params)
regression_model = GradientBoostingRegressor(**boosting_params)

# Chain the preprocessor with the two-target model so raw feature frames can
# be passed straight to fit/predict.
combined_model = CombinedGradientBoosting(
    classification_model=classification_model,
    regression_model=regression_model,
)
combined_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('combined_model', combined_model),
])

# Train on the training split only.
combined_pipeline.fit(X_train, y_train)

# Persist the fitted pipeline next to the notebook.
joblib.dump(combined_pipeline, 'combined_gradient_boosting_pipeline.pkl')
print("Pipeline saved successfully!")


Dataset URL: https://www.kaggle.com/datasets/mrmars1010/grape-quality
License(s): apache-2.0
Downloading grape-quality.zip to /home/ubuntu/machine-learning-zoomcamp/MyMachineLearningRepo/Capstone1/data
  0%|                                               | 0.00/26.8k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 26.8k/26.8k [00:00<00:00, 19.3MB/s]
Categorical Columns: ['variety', 'region']
Numerical Columns: ['sugar_content_brix', 'acidity_ph', 'cluster_weight_g', 'berry_size_mm', 'sun_exposure_hours', 'soil_moisture_percent', 'rainfall_mm']
Pipeline saved successfully!


In [2]:
import pandas as pd
import joblib

# Load the saved pipeline.
# NOTE: unpickling needs the CombinedGradientBoosting class to be defined in
# the current session, because pickle stores custom classes by reference.
pipeline_path = 'combined_gradient_boosting_pipeline.pkl'
combined_pipeline = joblib.load(pipeline_path)

# Build a one-row sample with the same feature columns used for training.
# NOTE(review): 'Variety1' / 'Region1' look like placeholders. If they are not
# categories seen during training, the one-hot encoder (handle_unknown='ignore')
# encodes them as all zeros -- confirm and substitute real values for a
# meaningful check.
sample_data = pd.DataFrame({
    'variety': ['Variety1'],  # Example categorical data
    'region': ['Region1'],    # Example categorical data
    'sugar_content_brix': [18.5],
    'acidity_ph': [3.5],
    'cluster_weight_g': [500],
    'berry_size_mm': [7.5],
    'sun_exposure_hours': [8],
    'soil_moisture_percent': [45.0],
    'rainfall_mm': [300]
})

# Make predictions
predictions = combined_pipeline.predict(sample_data)

# predict() always returns a DataFrame with 'quality_category' and
# 'quality_score' columns, so it is printed directly; there is no list-shaped
# result to unpack.
print(predictions)


  quality_category  quality_score
0           Medium       1.984186


In [3]:
# Reload the persisted pipeline from disk.
saved_pipeline_file = 'combined_gradient_boosting_pipeline.pkl'
loaded_pipeline = joblib.load(saved_pipeline_file)

# Predict both targets for the validation features and preview the first rows.
predictions = loaded_pipeline.predict(X_val)
print(predictions.head(n=5))


  quality_category  quality_score
0             High       3.207554
1             High       2.818866
2           Medium       2.193050
3           Medium       2.684488
4           Medium       1.423621
