# PyCaret Clustering — Wine Quality (features only)

Dataset: `winequalityN.csv`. The `quality` column is dropped so that clustering runs on the remaining feature columns only.

## Environment
Using Python 3.11 with `pycaret==3.3.0` inside the shared `.venv`. The notebook assumes the data CSVs are present in `../data/`. No GPU is required: `setup` is called with `use_gpu=False`, so the experiment runs on CPU.

In [1]:
import pandas as pd
from pycaret.clustering import *

# Read the wine table and, in the same expression, remove the `quality`
# column (ignored if it is absent) so only feature columns remain in `df`.
csv_path = "../data/winequalityN.csv"
df = pd.read_csv(csv_path).drop(columns=['quality'], errors='ignore')

# Plain-text preview of the first rows.
print(df.head())

    type  fixed acidity  volatile acidity  citric acid  residual sugar  \
0  white            7.0              0.27         0.36            20.7   
1  white            6.3              0.30         0.34             1.6   
2  white            8.1              0.28         0.40             6.9   
3  white            7.2              0.23         0.32             8.5   
4  white            7.2              0.23         0.32             8.5   

   chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
0      0.045                 45.0                 170.0   1.0010  3.00   
1      0.049                 14.0                 132.0   0.9940  3.30   
2      0.050                 30.0                  97.0   0.9951  3.26   
3      0.058                 47.0                 186.0   0.9956  3.19   
4      0.058                 47.0                 186.0   0.9956  3.19   

   sulphates  alcohol  
0       0.45      8.8  
1       0.49      9.5  
2       0.44     10.1  
3       0.40  

In [2]:
# Initialise the PyCaret clustering experiment on the feature-only frame.
s = setup(
    data=df,
    # Preprocessing / execution options.
    normalize=True,
    use_gpu=False,
    # Fixed session id so repeated runs use the same value (42).
    session_id=42,
    # Experiment logging; the run is recorded under this experiment name.
    log_experiment=True,
    experiment_name='pycaret_clustering_wine',
)


Unnamed: 0,Description,Value
0,Session id,42
1,Original data shape,"(6497, 12)"
2,Transformed data shape,"(6497, 12)"
3,Numeric features,11
4,Categorical features,1
5,Rows with missing values,0.5%
6,Preprocess,True
7,Imputation type,simple
8,Numeric imputation,mean
9,Categorical imputation,mode


2025/12/09 03:03:29 INFO mlflow.tracking.fluent: Experiment with name 'pycaret_clustering_wine' does not exist. Creating a new experiment.


In [3]:
# Build the k-means model from the experiment configured by `setup`.
kmeans = create_model('kmeans')

# Produce the diagnostic plots one after another, in the same order as before;
# each call passes save=True.
for plot_kind in ('elbow', 'silhouette', 'tsne'):
    plot_model(kmeans, plot=plot_kind, save=True)

# Attach cluster labels to the data and preview the `Cluster` column.
clustered = assign_model(kmeans)
cluster_preview = clustered[['Cluster']].head()
print(cluster_preview)

# Persist the model; left as the final expression so its return value is
# still shown as the cell output.
save_model(kmeans, 'clustering_wine_model')

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1844,1723.8208,1.8371,0,0,0


     Cluster
0  Cluster 0
1  Cluster 3
2  Cluster 2
3  Cluster 0
4  Cluster 0
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['fixed acidity',
                                              'volatile acidity', 'citric acid',
                                              'residual sugar', 'chlorides',
                                              'free sulfur dioxide',
                                              'total sulfur dioxide', 'density',
                                              'pH', 'sulphates', 'alcohol'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['type'],
                                     transformer=Si...most_frequent'))),
                 ('ordinal_encoding',
                  TransformerWrapper(include=['type'],
                                     transformer=OrdinalEncoder(cols=['type'],
                                                                