In [2]:
import pandas as pd
df = pd.read_csv("D:\\Wine quality project\\winequality-red.csv",)


In [None]:

# Keep ALL features XGBoost pipeline

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


print("Columns used:")
print(list(df.columns))
print()


if "quality" not in df.columns:
    raise KeyError("No 'quality' column found in df. Check your earlier preprocessing.")

X = df.drop(columns=["quality"])
y_raw = df["quality"]


le = LabelEncoder()
y = le.fit_transform(y_raw)

print("Label mapping (original -> encoded):")
print(dict(zip(le.classes_, le.transform(le.classes_))))
print()


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("xgb", XGBClassifier(
        eval_metric="mlogloss",
        random_state=42,
        tree_method="hist"  # comment out if causes issues
    ))
])


pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)

print("Accuracy (keep all features):", accuracy_score(y_test, y_pred))
print("\nClassification Report (encoded labels):\n")
print(classification_report(y_test, y_pred))


y_pred_original = le.inverse_transform(y_pred)
print("\nFirst 10 predicted wine qualities (original scale):")
print(y_pred_original[:10])


  import pkg_resources


Columns used:
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

Label mapping (original -> encoded):
{np.int64(3): np.int64(0), np.int64(4): np.int64(1), np.int64(5): np.int64(2), np.int64(6): np.int64(3), np.int64(7): np.int64(4), np.int64(8): np.int64(5)}

Accuracy (keep all features): 0.6625

Classification Report (encoded labels):

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      0.09      0.15        11
           2       0.72      0.72      0.72       136
           3       0.62      0.69      0.65       128
           4       0.71      0.60      0.65        40
           5       0.33      0.33      0.33         3

    accuracy                           0.66       320
   macro avg       0.48      0.41      0.42       320
weighted avg       0.66      0.66      0

<span style="color:yellow; font-weight:bold;">
Removing all negative correlation features , based on heatmap
</span>


In [None]:

# Remove all negatively correlated features
# Train XGBoost with preprocessing

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report




neg_corr_features = [
    "volatile acidity",
    "chlorides",
    "density",
    "pH",
    "free sulfur dioxide",
    "total sulfur dioxide"
]

df2 = df.drop(columns=neg_corr_features, errors="ignore")


X = df2.drop(columns=["quality"])
y_raw = df2["quality"]


le = LabelEncoder()
y = le.fit_transform(y_raw)

print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("xgb", XGBClassifier(
        eval_metric="mlogloss",
        random_state=42,
        tree_method="hist"
    ))
])


pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


y_pred_original = le.inverse_transform(y_pred)
print("\nPredicted qualities (original scale):")
print(y_pred_original[:10])


Label mapping: {np.int64(3): np.int64(0), np.int64(4): np.int64(1), np.int64(5): np.int64(2), np.int64(6): np.int64(3), np.int64(7): np.int64(4), np.int64(8): np.int64(5)}
Accuracy: 0.659375

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      0.09      0.14        11
           2       0.69      0.78      0.73       136
           3       0.64      0.66      0.65       128
           4       0.68      0.47      0.56        40
           5       0.20      0.33      0.25         3

    accuracy                           0.66       320
   macro avg       0.42      0.39      0.39       320
weighted avg       0.65      0.66      0.65       320


Predicted qualities (original scale):
[7 6 5 6 6 5 6 5 5 8]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


<span style="color:red; font-weight:bold;">
XGBOOST and Imputer, 3 % Improvement
</span>


In [None]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report




cols_to_drop = ["volatile acidity", "chlorides", "density", "total sulfur dioxide"]
df2 = df.drop(columns=cols_to_drop, errors="ignore")


if "quality" not in df2.columns:
    raise KeyError("No 'quality' column found in df. Check your earlier preprocessing.")

X = df2.drop(columns=["quality"])
y_raw = df2["quality"]


le = LabelEncoder()
y = le.fit_transform(y_raw)

print("Label mapping (original -> encoded):")
print(dict(zip(le.classes_, le.transform(le.classes_))))
print()


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("xgb", XGBClassifier(
        eval_metric="mlogloss",
        random_state=42,
        tree_method="hist"  
    ))
])


pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report (encoded labels):\n")
print(classification_report(y_test, y_pred))


y_pred_original = le.inverse_transform(y_pred)
print("\nFirst 10 predicted wine qualities (original scale):")
print(y_pred_original[:10])


Label mapping (original -> encoded):
{np.int64(3): np.int64(0), np.int64(4): np.int64(1), np.int64(5): np.int64(2), np.int64(6): np.int64(3), np.int64(7): np.int64(4), np.int64(8): np.int64(5)}

Accuracy: 0.65

Classification Report (encoded labels):

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.40      0.18      0.25        11
           2       0.70      0.76      0.73       136
           3       0.63      0.63      0.63       128
           4       0.58      0.53      0.55        40
           5       0.50      0.33      0.40         3

    accuracy                           0.65       320
   macro avg       0.47      0.41      0.43       320
weighted avg       0.64      0.65      0.64       320


First 10 predicted wine qualities (original scale):
[7 5 5 6 5 6 6 5 6 8]


<span style="color:green; font-weight:bold;">
collapsing classes to 3 , low|medium|high . resulted in 20% improvement
</span>


Based on Kaggle website, it is advised to consider high and low classification for the dataset 

In [None]:
#  Collapse classification to 3 classes
df["quality_3class"] = df["quality"].apply(
    lambda q: 0 if q <= 4 else (1 if q <= 6 else 2)
)

print(df["quality_3class"].value_counts())


quality_3class
1    1319
2     217
0      63
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split

X = df.drop(["quality", "quality_3class"], axis=1)
y3 = df["quality_3class"]

X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X, y3,
    test_size=0.2,
    random_state=42,
    stratify=y3
)


In [None]:


import sys, subprocess


try:
    import xgboost
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    import xgboost


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier



csv_path = r"D:\Wine quality project\winequality-red.csv"
df = pd.read_csv(csv_path)

print("Data shape:", df.shape)
print("Columns:", df.columns.tolist())


X_full = df.drop("quality", axis=1)
y_full = df["quality"]


le = LabelEncoder()
y_full_enc = le.fit_transform(y_full)

X_train6, X_test6, y_train6, y_test6 = train_test_split(
    X_full, y_full_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_full_enc
)

print("\n6-class train/test shapes:", X_train6.shape, X_test6.shape)

#  GridSearchCV to find best XGB params on 6-class task
six_class_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", XGBClassifier(
        objective="multi:softmax",
        num_class=len(le.classes_),
        eval_metric="mlogloss",
        random_state=42
    ))
])

param_grid = {
    "model__n_estimators": [200, 400, 600],
    "model__max_depth": [4, 6, 8],
    "model__learning_rate": [0.03, 0.05],
    "model__subsample": [0.8, 1.0],
    "model__colsample_bytree": [0.8, 1.0],
}

grid = GridSearchCV(
    estimator=six_class_pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2
)

print("\nFitting 6-class GridSearchCV...")
grid.fit(X_train6, y_train6)

print("\nBest 6-class params:", grid.best_params_)
print("Best 6-class CV accuracy:", grid.best_score_)

# Clean params dictionary (remove 'model__' prefix)
best_params_clean = {k.replace("model__", ""): v for k, v in grid.best_params_.items()}
print("\nClean best params for reuse:", best_params_clean)

#  Build 3-class target (Low / Medium / High) 
# Low: quality 3-4 >> 0
# Medium: 5-6       >> 1
# High: 7-8         >> 2
df["quality_3class"] = df["quality"].apply(
    lambda q: 0 if q <= 4 else (1 if q <= 6 else 2)
)

print("\n3-class value counts:")
print(df["quality_3class"].value_counts())

X3 = df.drop(["quality", "quality_3class"], axis=1)
y3 = df["quality_3class"]

X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3, y3,
    test_size=0.2,
    random_state=42,
    stratify=y3
)

print("\n3-class train/test shapes:", X_train3.shape, X_test3.shape)

#  Train final 3-class XGBoost using best_params_clean ----
# Impute
imputer = SimpleImputer(strategy="median")
X_train3_imp = imputer.fit_transform(X_train3)
X_test3_imp  = imputer.transform(X_test3)

# Clean params so we can override objective/num_class/eval_metric/random_state
xgb3_params = best_params_clean.copy()
for k in ["objective", "num_class", "eval_metric", "random_state"]:
    xgb3_params.pop(k, None)

xgb3 = XGBClassifier(
    **xgb3_params,
    objective="multi:softmax",
    num_class=3,
    eval_metric="mlogloss",
    random_state=42
)

print("\nFitting final 3-class XGBoost...")
xgb3.fit(X_train3_imp, y_train3)

#  Evaluate 3-class model 
y3_pred = xgb3.predict(X_test3_imp)

print("\n=== 3-CLASS XGBOOST RESULTS (Low / Medium / High) ===")
print("3-class accuracy:", accuracy_score(y_test3, y3_pred))
print("\nClassification report:\n", classification_report(y_test3, y3_pred))


Data shape: (1599, 12)
Columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

6-class train/test shapes: (1279, 11) (320, 11)

Fitting 6-class GridSearchCV...
Fitting 3 folds for each of 72 candidates, totalling 216 fits

Best 6-class params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.03, 'model__max_depth': 6, 'model__n_estimators': 200, 'model__subsample': 1.0}
Best 6-class CV accuracy: 0.6622192169409903

Clean best params for reuse: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': 6, 'n_estimators': 200, 'subsample': 1.0}

3-class value counts:
quality_3class
1    1319
2     217
0      63
Name: count, dtype: int64

3-class train/test shapes: (1279, 11) (320, 11)

Fitting final 3-class XGBoost...

=== 3-CLASS XGBOOST RESULTS (Low / Medium / High) ===
3-class accuracy: 0.86875

Classification report:
               prec