In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the white and red wine-quality datasets.
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# it defaults to the directory this notebook was originally written against.
DATA_DIR = 'C:/Users/jasmi/Projects/Wine-Wizardry'

white_wine_data = pd.read_csv(f'{DATA_DIR}/wine-quality White.csv')
red_wine_data = pd.read_csv(f'{DATA_DIR}/wine-quality Red.csv')

# Preview the first rows of each frame. display() renders every DataFrame as its
# own table, whereas a bare tuple falls back to one plain-text repr.
display(white_wine_data.head(), red_wine_data.head())

In [None]:
# Build the feature matrix and the target from the red-wine data only.
# No 'wine_type' column is added because the white-wine frame is not used here.
y = red_wine_data['quality']
X = red_wine_data.drop('quality', axis=1)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardise the features: the scaler is fitted on the training rows only,
# and that same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Project the scaled features onto 9 principal components (fitted on train only).
pca = PCA(n_components=9)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Component loadings as a frame whose columns carry the original feature names.
pca_components_df = pd.DataFrame(data=pca.components_, columns=X.columns)

# Report the explained-variance ratios, the projected shape, and the loadings.
for summary in (pca.explained_variance_ratio_, X_train_pca.shape, pca_components_df):
    print(summary)

In [None]:
# Heatmap of the PCA loadings, one row per principal component.
# seaborn's own annotation is switched off; every cell is labelled by hand below.
plt.figure(figsize=(15, 10))
component_labels = [f'PC{k}' for k in range(1, len(pca.explained_variance_ratio_) + 1)]
ax = sns.heatmap(
    pca_components_df,
    annot=False,
    fmt=".2f",
    cmap='coolwarm',
    xticklabels=X.columns,
    yticklabels=component_labels,
)

# Write each loading at the centre of its cell (cell centres sit at index + 0.5).
for row_idx, loadings in enumerate(pca_components_df.values):
    for col_idx, loading in enumerate(loadings):
        ax.text(col_idx + 0.5, row_idx + 0.5, f'{loading:.2f}', ha='center', va='center', color='black')

plt.title('Contribution of Original Features to Principal Components')
plt.show()

In [None]:
# Oversample the PCA-projected training set with SMOTE to counter class imbalance.
# k_neighbors is set to a low value (2) -- presumably so that quality classes with
# very few training rows can still be resampled; confirm.
smote = SMOTE(k_neighbors=2, random_state=42)
resampled = smote.fit_resample(X_train_pca, y_train)
X_train_resampled, y_train_resampled = resampled

In [None]:
# Random-forest classifier with fixed hyperparameters and a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=20,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    class_weight='balanced_subsample',
    random_state=42,
)

# Disabled hyperparameter search, kept for reference only. While it stays commented
# out, neither grid_search nor best_rf_model is defined anywhere in the notebook.
# Option names below use the spellings scikit-learn accepts ('sqrt', 'balanced').
# param_grid = {
#     'n_estimators': [200, 300, 400, 500],
#     'max_depth': [None, 10, 20, 30, 40],
#     'min_samples_split': [2, 5, 10, 20],
#     'min_samples_leaf': [1, 2, 4, 10],
#     'max_features': ['sqrt', 'log2'],
#     'bootstrap': [True, False],
#     'class_weight': ['balanced', 'balanced_subsample'],
# }
# grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
# grid_search.fit(X_train_pca, y_train)
# best_rf_model = grid_search.best_estimator_

In [None]:
# Show the hyperparameters of the configured forest.
# The grid search is disabled, so there is no grid_search.best_params_ to report;
# referencing it here raised NameError on a fresh kernel. Accuracy and the
# classification report are printed further down, once y_pred has been computed.
print(f"Model Parameters: {rf_model.get_params()}")

In [None]:


# Train the random forest on the PCA-projected training data.
# NOTE(review): the SMOTE output (X_train_resampled / y_train_resampled) is not used
# for fitting; imbalance is handled only through the model's class_weight setting --
# confirm this is intended.
rf_model.fit(X_train_pca, y_train)

# Predict with the model fitted above. best_rf_model only exists when the grid
# search is enabled, so using it here raised NameError on a fresh kernel.
y_pred = rf_model.predict(X_test_pca)

In [None]:
# Score the test-set predictions: confusion matrix, per-class report, overall accuracy.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(classification_report(y_true=y_test, y_pred=y_pred))
accuracy  # last expression of the cell, so the notebook displays it

In [None]:
# Confusion-matrix heatmap for the test-set predictions.
# Tick labels are derived from the data instead of a hard-coded [3..8] list:
# confusion_matrix orders rows/columns by the sorted union of the labels found in
# y_test and y_pred, so a fixed list would be wrong whenever a class is absent.
quality_labels = sorted(set(y_test) | set(y_pred))

plt.figure(figsize=(8, 6))
# annot=False because the loop below writes each count exactly once; with
# annot=True as well, every count was drawn twice on top of itself.
ax = sns.heatmap(
    conf_matrix,
    annot=False,
    cmap='Blues',
    xticklabels=quality_labels,
    yticklabels=quality_labels,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Quality')
ax.set_ylabel('True Quality')

# Place each count at the centre of its cell (cell centres sit at index + 0.5).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(j + 0.5, i + 0.5, conf_matrix[i, j], ha='center', va='center', color='black')

plt.tight_layout()
plt.show()