In [None]:
!pip install xgboost

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [2]:
# Load the white and red wine-quality datasets.
# NOTE(review): DATA_DIR is an absolute path on one machine; point it at your
# own copy of the Wine-Wizardry folder before running this notebook elsewhere.
DATA_DIR = 'C:/Users/jasmi/Projects/Wine-Wizardry'
white_wine_data = pd.read_csv(f'{DATA_DIR}/wine-quality White.csv')
red_wine_data = pd.read_csv(f'{DATA_DIR}/wine-quality Red.csv')

# Show the first few rows of both frames. A bare expression is only rendered
# when it is the last statement of a cell, so display() is called explicitly
# (display is provided by IPython inside a notebook kernel).
display(white_wine_data.head(), red_wine_data.head())

# Column count minus one, i.e. every column except the `quality` target.
print("Red Wine Dataset Features:", red_wine_data.shape[1] - 1)

Red Wine Dataset Features: 11


In [3]:
# Per-dataset design matrices and targets: `quality` is the label, every
# other column is a feature.
white_wine_features, white_wine_target = (
    white_wine_data.drop('quality', axis=1),
    white_wine_data['quality'],
)

red_wine_features, red_wine_target = (
    red_wine_data.drop('quality', axis=1),
    red_wine_data['quality'],
)





In [4]:
# Shared StandardScaler instance. It is fitted on the training split and then
# reused to transform the test split in a later cell.
scaler = StandardScaler()

# Disabled alternative (note addressed to Richard): standardize the white and
# red feature frames separately.
# NOTE(review): both lines call fit_transform on the same `scaler` object, so
# after running them the scaler would hold the statistics of the red data only.
# white_wine_scaled = pd.DataFrame(scaler.fit_transform(white_wine_features), columns=white_wine_features.columns)
# red_wine_scaled = pd.DataFrame(scaler.fit_transform(red_wine_features), columns=red_wine_features.columns)

In [5]:
# Tag every row with its origin so the combined model can tell the two
# datasets apart: 1 = white wine, 0 = red wine.
# Note: this adds the column to the loaded frames in place.
white_wine_data['wine_type'] = 1
red_wine_data['wine_type'] = 0

# Stack white on top of red. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it each source's own row labels are carried
# over, so the index contains duplicates and label-based lookups
# (e.g. wine_data.loc[0]) return more than one row.
wine_data = pd.concat([white_wine_data, red_wine_data], axis=0, ignore_index=True)

# Features are every column except the target; `wine_type` is kept as a feature.
X = wine_data.drop(columns=['quality'])
y = wine_data['quality']

# # separate features and target
# X = white_wine_data.drop(columns=['quality'])
# y = white_wine_data['quality']

# X = red_wine_data.drop(columns=['quality'])
# y = red_wine_data['quality']
# y_adjusted = y - 3



In [6]:
# Hold out 20% of the combined data for testing; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Disabled variant that splits on the 0-based target instead:
# X_train, X_test, y_train, y_test = train_test_split(X, y_adjusted, test_size=0.2, random_state=42)

In [7]:
# Learn the per-column mean and standard deviation from the training split
# only, then apply that same transformation to both splits so no test-set
# statistics leak into preprocessing.
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [8]:
# Reduce the standardized features to 10 principal components.
# The projection is fitted on the training split only and then applied
# unchanged to the test split.
pca = PCA(n_components=10)  
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
# Fraction of the total variance captured by each retained component
# (about 0.99 in total in the recorded run).
print(pca.explained_variance_ratio_)

[0.31805485 0.20946518 0.13017146 0.08002337 0.0604672  0.05163174
 0.04487686 0.04256978 0.02956127 0.02148887]


In [9]:
# Base estimator handed to the grid search. A fixed random_state makes the
# bootstrap sampling and feature sub-sampling repeatable, so the selected
# hyper-parameters and the reported metrics can be reproduced on a re-run.
rf_model = RandomForestClassifier(random_state=42)
# Disabled alternative that re-weights classes to counter the imbalance:
# rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

In [10]:
# Hyper-parameter search space: 3 * 3 * 3 * 3 * 2 = 162 candidate settings.
param_grid = dict(
    n_estimators=[200, 300, 500],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Exhaustive 5-fold cross-validated search over the grid, using every
# available core; verbose=2 logs progress for each fit.
grid_search = GridSearchCV(rf_model, param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train_pca, y_train)

# Best candidate found by the search, used for prediction in the next cells.
best_rf_model = grid_search.best_estimator_

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [11]:


# Disabled alternative: train a fixed 200-tree forest directly instead of
# using the estimator selected by the grid search.
# rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
# rf_model.fit(X_train_pca, y_train)

# Predict quality for the held-out test split (in PCA space) with the tuned model.
y_pred = best_rf_model.predict(X_test_pca)
# y_pred = rf_model.predict(X_test_pca)

In [12]:
# Overall accuracy and the per-class confusion matrix on the test split
# (rows = true quality, columns = predicted quality).
accuracy, conf_matrix = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
)

# Last expression of the cell: rendered as an (accuracy, matrix) tuple.
(accuracy, conf_matrix)

(0.6907692307692308,
 array([[  0,   0,   2,   0,   0,   0],
        [  0,   6,  25,  15,   0,   0],
        [  1,   1, 314, 103,   1,   0],
        [  0,   0,  89, 459,  31,   0],
        [  0,   0,   5, 104, 111,   1],
        [  0,   0,   0,  12,  12,   8]], dtype=int64))

In [None]:
# Importance of each model input.
# The forest was trained on the PCA projection (X_train_pca), so it has one
# importance value per principal component, not per original column.
# Labelling the bars with X.columns would both mismatch in length and
# attribute the importances to the wrong things, so label them PC1..PCn.
feature_importances = best_rf_model.feature_importances_
features = [f'PC{i + 1}' for i in range(len(feature_importances))]

plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=features)
plt.title('Which principal component had most impact in quality')
plt.xlabel('Importance')
plt.ylabel('Principal component')
plt.show()

In [None]:
# Heatmap of the confusion matrix (rows = true quality, columns = predicted).
# confusion_matrix() orders its rows/columns by the sorted labels that occur
# in y_test or y_pred, so the tick labels are derived the same way instead of
# being hard-coded to 3-8 (a different split could contain other grades).
quality_labels = sorted(set(map(int, y_test)) | set(map(int, y_pred)))

plt.figure(figsize=(8,6))
ax = sns.heatmap(conf_matrix, annot=False, cmap='Blues', xticklabels=quality_labels, yticklabels=quality_labels)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Quality')
ax.set_ylabel('True Quality')

# Write each count exactly once. Previously annot=True and this loop both
# drew the numbers, stacking two texts in every cell. The manual loop is kept
# as the single source (NOTE(review): presumably it was added because some
# seaborn/matplotlib version pairs annotate only the first row - confirm).
# Text is white on dark cells and black on light ones so it stays readable.
text_threshold = conf_matrix.max() / 2
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        cell_color = 'white' if conf_matrix[i, j] > text_threshold else 'black'
        ax.text(j + 0.5, i + 0.5, conf_matrix[i, j], ha='center', va='center', color=cell_color)
plt.tight_layout()
plt.show()

In [None]:
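# Disabled XGBoost experiment. As written it relies on the 0-based target from
# the commented-out `y_adjusted = y - 3` train/test split, which is why the
# predictions and y_test are shifted back by `+ 3` below. Re-enable that split
# before uncommenting this cell.
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')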

# # Train the model
# xgb_model.fit(X_train, y_train)

# # prediction
# y_pred = xgb_model.predict(X_test)
# y_pred_original = y_pred + 3

# # evaluate accuracy, confusion Matrix, and classification report
# accuracy = accuracy_score(y_test + 3, y_pred_original)
# conf_matrix = confusion_matrix(y_test + 3, y_pred_original)
# classification_rep = classification_report(y_test + 3, y_pred_original)

# accuracy, conf_matrix, classification_rep

In [None]:
# Heatmap of the current `conf_matrix` (rows = true quality, columns =
# predicted). While the XGBoost cell is commented out this is still the
# random-forest matrix.
# Tick labels are derived from the labels present in y_test or y_pred - the
# same sorted order confusion_matrix() uses - instead of being hard-coded.
quality_labels = sorted(set(map(int, y_test)) | set(map(int, y_pred)))

plt.figure(figsize=(8,6))
ax = sns.heatmap(conf_matrix, annot=False, cmap='Blues', xticklabels=quality_labels, yticklabels=quality_labels)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Quality')
ax.set_ylabel('True Quality')

# Annotate every cell exactly once (annot=True plus this loop used to draw
# each number twice). White text on dark cells, black on light cells.
text_threshold = conf_matrix.max() / 2
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        cell_color = 'white' if conf_matrix[i, j] > text_threshold else 'black'
        ax.text(j + 0.5, i + 0.5, conf_matrix[i, j], ha='center', va='center', color=cell_color)
plt.tight_layout()
plt.show()