# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from statistics import stdev

import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence UserWarning, FutureWarning and ConvergenceWarning messages so they
# do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
# Read the red-wine quality CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific location; adjust it
# when running elsewhere.
df = pd.read_csv("/mnt/hdd/Datasets/winequality-red.csv")
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove the duplicate rows in place (the index is not renumbered here).
df.drop_duplicates(inplace=True)

In [None]:
# Frequency of each distinct value in the "quality" column.
df.quality.value_counts()

# EDA

In [None]:
# Collapse the integer quality scores into three labelled bands:
# 3-4 -> "Low", 5-6 -> "Medium", 7-8 -> "High".
# Scores outside 3-8 have no entry in the mapping and therefore become NaN.
df["quality"] = df["quality"].map({
    **dict.fromkeys((3, 4), "Low"),
    **dict.fromkeys((5, 6), "Medium"),
    **dict.fromkeys((7, 8), "High"),
})

In [None]:
# Shuffle the rows and renumber the index. The shuffle is seeded (same seed as
# the train/test split and SMOTE below) so that re-running the notebook yields
# the same row order and therefore the same downstream results.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Preview the first rows after relabelling and shuffling.
df.head()

In [None]:
def plot_count(df, col, title):
    """Show the value distribution of ``df[col]`` as a donut and a bar chart.

    The left panel is a two-layer donut (outer ring labelled with the category
    names, inner ring labelled with percentages); the right panel is a
    horizontal bar chart of the raw counts, annotated with the count values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to summarise.
    col : str
        Name of the column whose value counts are plotted.
    title : str
        Figure-level title.
    """
    f, ax = plt.subplots(1, 2, figsize=(12, 6))
    plt.subplots_adjust(wspace=0.2)

    values = df[col].value_counts()
    N = len(values)

    # Outer ring: category labels. The last wedge is pulled out further.
    ax[0].pie(
        values,
        labels=values.index.tolist(),
        startangle=90,
        frame=True,
        radius=1.3,
        explode=([0.05] * (N - 1) + [.3]),
        wedgeprops={'linewidth': 1, 'edgecolor': 'white'},
        textprops={'fontsize': 12, 'weight': 'bold'}
    )

    # Inner ring: percentages. matplotlib normalises the wedge sizes, so the
    # raw counts give the same proportions as a pre-scaled copy would.
    ax[0].pie(
        values,
        radius=1,
        startangle=90,
        autopct='%1.f%%',
        explode=([0.1] * (N - 1) + [.3]),
        pctdistance=0.8,
        textprops={"size": 13, "weight": "bold", "color": "white"}
    )

    # White disc in the middle turns the stacked pies into a donut.
    center_circle = plt.Circle((0, 0), .70, color='black', fc='white', linewidth=0)
    ax[0].add_artist(center_circle)

    # Draw on the right-hand axes explicitly instead of relying on pyplot's
    # "current axes" state.
    sns.barplot(
        x=values.to_numpy(),
        y=values.index.tolist(),
        orient="h",
        ax=ax[1],
    )

    # Annotate each bar with its count, just past the end of the bar.
    for i, v in enumerate(values):
        ax[1].text(v, i + 0.1, str(v), color="black", fontweight="bold", fontsize=13)

    plt.setp(ax[1].get_yticklabels(), fontweight="bold")
    plt.setp(ax[1].get_xticklabels(), fontweight="bold")
    # Bars are horizontal: counts run along x, categories along y.
    ax[1].set_xlabel('count', fontweight="bold", color='black')
    ax[1].set_ylabel(col, fontweight="bold", color='black')

    f.suptitle(f'{title}', fontsize=18, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# Visualise how the rows are distributed across the "quality" labels.
plot_count(df, "quality", "Target Variable Distribution")

In [None]:
# Preview the frame again before splitting columns by dtype.
df.head()

In [None]:
# Partition the columns by dtype. A numeric-dtype test is used rather than
# comparing against "object", so that string or category typed columns (such as
# the relabelled target) are never mistaken for numerical features.
numerical_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
categorical_columns = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]

In [None]:
# Histogram (with KDE) of every numerical feature, split by quality label.
# The plots are laid out two per row, so only ceil(n / 2) grid rows are needed;
# allocating one row per feature would leave the bottom half of the figure blank.
hist_grid_cols = 2
hist_grid_rows = max(1, -(-len(numerical_columns) // hist_grid_cols))

plt.figure(figsize=(16, hist_grid_rows * 2.5))

for idx, column in enumerate(numerical_columns):
    plt.subplot(hist_grid_rows, hist_grid_cols, idx + 1)
    sns.histplot(x=column, hue="quality", data=df, bins=30, kde=True)
    plt.title(f"{column} Distribution for quality")
    # The y-limit is left to matplotlib: the frequency of the most common exact
    # value says nothing about the height of a 30-bin histogram bar, so using
    # it as an upper limit can clip the bars.

plt.tight_layout()
plt.show()

In [None]:
def boxplots_custom(df, columns, rows, cols, title, figsize=(13, 5)):
    """Draw one horizontal boxplot per column on a ``rows`` x ``cols`` grid.

    Each subplot title shows the column name and its skewness rounded to two
    decimals. Grid cells that receive no column are hidden.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the columns to plot.
    rows, cols : int
        Shape of the subplot grid.
    title : str
        Figure-level title.
    figsize : tuple of float, optional
        Figure size passed to ``plt.subplots``; defaults to ``(13, 5)``.

    Raises
    ------
    IndexError
        If there are more columns than grid cells.
    """
    columns = list(columns)
    # Fail before any figure is created rather than half-way through drawing.
    if len(columns) > rows * cols:
        raise IndexError(
            f"grid of {rows}x{cols} has {rows * cols} cells "
            f"but {len(columns)} columns were given"
        )

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid.
    fig, ax = plt.subplots(rows, cols, sharey=True, figsize=figsize, squeeze=False)
    fig.suptitle(title, y=1, size=25)
    ax = ax.flatten()

    for axis, column in zip(ax, columns):
        sns.boxplot(data=df[column], orient="h", ax=axis)
        skewness = round(df[column].skew(axis=0, skipna=True), 2)
        axis.set_title(column + ", skewness is: " + str(skewness))

    # Hide the grid cells that were not used instead of showing empty frames.
    for axis in ax[len(columns):]:
        axis.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Boxplots of all numerical features on a 3 x 4 grid.
boxplots_custom(df=df, columns=numerical_columns, rows=3, cols=4, title="Boxplots for each variable")

In [None]:
def IQR_method(df, n, columns):
    """Return index labels of rows that are IQR outliers in more than ``n`` columns.

    For each column in ``columns`` a value is an outlier when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. A row is reported when it
    is an outlier in strictly more than ``n`` of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    n : int
        Threshold; rows need more than ``n`` outlying columns to be returned.
    columns : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection.

    Notes
    -----
    Prints the number of flagged rows, i.e. the length of the returned list.
    """
    outlier_hits = Counter()

    for column in columns:
        # nanpercentile ignores missing values so a single NaN does not turn
        # both quartiles (and hence every comparison) into NaN.
        q1, q3 = np.nanpercentile(df[column], [25, 75])
        outlier_step = 1.5 * (q3 - q1)
        is_outlier = (df[column] < q1 - outlier_step) | (df[column] > q3 + outlier_step)
        # One hit per (row, column) pair in which the row is an outlier.
        outlier_hits.update(df.index[is_outlier.to_numpy()])

    multiple_outliers = [label for label, hits in outlier_hits.items() if hits > n]

    # Report the rows actually flagged across all columns, not just the
    # outliers of whichever column happened to be processed last.
    print('Total number of outliers is:', len(multiple_outliers))
    return multiple_outliers

In [None]:
# Index labels of rows that are IQR outliers in more than one numerical column.
outliers_IQR = IQR_method(df, 1, numerical_columns)

In [None]:
# Drop the flagged rows and renumber the index from zero.
df = df.drop(outliers_IQR, axis=0).reset_index(drop=True)

In [None]:
# Correlation heatmap. The quality labels are encoded as ordered integers
# (Low=0, Medium=1, High=2) on a copy, so the target can take part in corr().
df_corr = df.assign(quality=df["quality"].map({"Low": 0, "Medium": 1, "High": 2}))
corr = df_corr.corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr))

plt.figure(figsize=(11, 11))
sns.heatmap(
    corr,
    mask=mask,
    robust=True,
    center=0,
    square=True,
    cmap="viridis",
    linewidths=.6,
)
plt.title("Correlation Table")
plt.show()

In [None]:
# Hierarchically clustered view of the correlation matrix.
# clustermap builds its own figure, so the size is passed through `figsize`;
# a preceding plt.figure() call would only leave an extra, empty figure behind.
clustermap = sns.clustermap(corr, vmin=-1, vmax=1, annot=True, figsize=(11, 11))

In [None]:
# Bar chart of the absolute correlation of every feature with quality, sorted
# ascending. The last entry of the correlation column is dropped positionally.
# NOTE(review): that entry is presumably quality's correlation with itself -
# this relies on "quality" being the last column; confirm.
plt.figure(figsize=(14, 8))
d = (
    df_corr.corr()["quality"]
    .iloc[:-1]
    .abs()
    .sort_values()
    .plot(kind="bar", title="Highly Correlated Features with quality")
)

In [None]:
# Display the list of numerical column names.
numerical_columns

In [None]:
# Variance inflation factor for the hand-picked feature subset.
# NOTE(review): the design matrix is passed without an intercept column, so
# these are presumably uncentred VIFs - confirm this is what is wanted.
vif_cols = [
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
]
vif_data = pd.DataFrame({
    "Feature": vif_cols,
    "VIF": [
        variance_inflation_factor(df[vif_cols].values, position)
        for position, _ in enumerate(vif_cols)
    ],
})
print(vif_data)

In [None]:
# Preview the selected features together with the target.
df[vif_cols + ["quality"]].head()

In [None]:
# Feature matrix (the columns listed in vif_cols) and target vector.
X = df[vif_cols]
y = df["quality"]

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# so that the minority quality classes appear in both partitions in the same
# proportions as in the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Class counts in the training target before resampling.
Counter(y_train)

In [None]:
# Resample the training split with SMOTE (seeded); X_test / y_test are not
# touched by this step.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Class counts in the training target after resampling.
Counter(y_train)

In [None]:
# Standardise the features. The scaler is fitted on the training data only:
# fitting on the full X would leak test-set statistics into preprocessing and
# would not match the (resampled) data the model is actually trained on.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# 5-fold stratified splitter shared by every cross-validation below.
# Rows are shuffled (with a fixed seed) before folding so that fold membership
# does not depend on the row order left behind by the resampling step.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Baseline random forest with default hyper-parameters. The seed makes the
# fitted model, and every score derived from it, reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Cross-validated accuracy of the baseline forest on the training data,
# summarised by the mean and the sample standard deviation over the folds.
score = cross_val_score(
    estimator=rf,
    X=X_train,
    y=y_train,
    cv=kf,
    scoring="accuracy",
)
rf_cv_stdev = stdev(score)
rf_cv_score = score.mean()

print("Cross Validation Accuracy Scores:", score)
print("Average Accuracy Score:", rf_cv_score)
print("Standard Deviation Score:", rf_cv_stdev)

In [None]:
# Predict the held-out test rows with the baseline forest.
y_pred = rf.predict(X_test)

In [None]:
# Accuracy of the baseline predictions on the test split.
accuracy_score(y_test, y_pred)

In [None]:
# Exhaustive search over 5 * 4 * 4 * 3 = 240 hyper-parameter combinations,
# each scored by accuracy with the shared 5-fold splitter.
params = {
    "n_estimators": [130, 150, 170, 190, 200],
    "max_depth": [8, 10, 12, 14],
    "min_samples_split": [3, 4, 5, 6],
    "min_samples_leaf": [1, 2, 3]
}

# n_jobs=-1 spreads the 1200 independent fits over all available cores; it
# changes only the run time, not which candidates are evaluated.
grid_rf = GridSearchCV(
    rf,
    param_grid=params,
    cv=kf,
    scoring="accuracy",
    n_jobs=-1,
).fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameter combination and its mean CV accuracy.
print("Best parameters:", grid_rf.best_params_)
print("Best score:", grid_rf.best_score_)

In [None]:
# Build the tuned forest from the grid-search result itself, so the model can
# never drift out of sync with what the search actually selected. The seed
# keeps the fitted model reproducible.
rf_tuned = RandomForestClassifier(**grid_rf.best_params_, random_state=42)
rf_tuned.fit(X_train, y_train)

In [None]:
# Cross-validated accuracy of the tuned forest on the training data,
# summarised by the mean and the sample standard deviation over the folds.
score_tuned = cross_val_score(
    estimator=rf_tuned,
    X=X_train,
    y=y_train,
    cv=kf,
    scoring="accuracy",
)
rf_tuned_cv_stdev = stdev(score_tuned)
rf_tuned_cv_score = score_tuned.mean()

print("Cross Validation Accuracy Scores:", score_tuned)
print("Average Accuracy Score:", rf_tuned_cv_score)
print("Standard Deviation Score:", rf_tuned_cv_stdev)

In [None]:
# Predict the held-out test rows with the tuned forest.
y_pred_tuned = rf_tuned.predict(X_test)

In [None]:
# Accuracy of the tuned predictions on the test split.
accuracy_score(y_test, y_pred_tuned)