In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [384]:
# Load data
df_white = pd.read_csv('winequality-white.csv', delimiter=';')
df_red = pd.read_csv('winequality-red.csv')

df = pd.concat([df_white, df_red])

# Create a quality label (low, medium, high) based on quality score
df['quality_label'] = df['quality'].apply(lambda x: 'Poor' if x < 5 else 'Average' if x <= 7 else 'High')

# Encode 'quality_label' to numerical values
label_enc = LabelEncoder()
df['quality_label'] = label_enc.fit_transform(df['quality_label'])

df = df.drop_duplicates()

# Separate the data into features and target
X = df.drop(['quality', 'quality_label'], axis=1).to_numpy()
y = df['quality_label'].to_numpy()

In [385]:
# RandomForest
model = RandomForestClassifier(min_samples_split=2, max_depth=20, n_estimators=100, random_state=42)
std_scaler = StandardScaler()
rb_scaler = RobustScaler() # robust works slightly better
sampler = SMOTE(random_state=42)

# Apply SMOTE
X_samp, y_samp = sampler.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.3, random_state=42)

# Scale the data after split to prevent information leak
X_train = rb_scaler.fit_transform(X_train)
X_test = rb_scaler.transform(X_test)

In [386]:
# GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [387]:
print('Encoded Quality Labels:\n', dict(zip(label_enc.classes_, label_enc.transform(label_enc.classes_))))
print('\n', classification_report(y_test, best_model.predict(X_test)))

Encoded Quality Labels:
 {'Average': 0, 'High': 1, 'Poor': 2}

               precision    recall  f1-score   support

           0       0.97      0.88      0.92      1501
           1       0.93      0.99      0.96      1491
           2       0.95      0.98      0.97      1445

    accuracy                           0.95      4437
   macro avg       0.95      0.95      0.95      4437
weighted avg       0.95      0.95      0.95      4437



In [388]:
cv_scores = cross_val_score(best_model, X_samp, y_samp, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [0.94421907 0.93914807 0.91274941 0.93473115 0.84173148]
Mean cross-validation score: 0.9145158358376759
