# An improved analysis of the wine dataset

In [None]:
## Agenda (largely interchangeable based on nature of dataset)

 - Reading data into workspace
 - Explore the data structure
 - Exploratory Data Analysis
 - Preprocessing
     - Check for and resolve nulls?
     - Encode (if necessary)
     - Standardization of data?
     - Feature Engineering
     - Oversampling
 - Modelling, Prediction and Refinement


1. Reading data into workspace

In [None]:
#Import required dependencies
import pandas as pd
import numpy as np

In [None]:
# Load the wine-quality CSV from the Kaggle input directory
WINE_CSV_PATH = "/kaggle/input/wine-quality-dataset/WineQT.csv"
data = pd.read_csv(WINE_CSV_PATH)

# Work on a copy so the raw frame held in `data` stays untouched
df = data.copy()

2. Explore the data structure

In [None]:
# Preview the first rows of the working copy
df.head()

In [None]:
# List the column labels of the working copy
df.columns

In [None]:
# Show the dtype of every column
df.dtypes

In [None]:
# Check the shape of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Summarise the frame: per-column dtype and non-null count
df.info()

According to this output, all columns are numeric, so no label encoding is necessary.

3. Checking for and resolving nulls

In [None]:
# Per column: True if it contains at least one missing value
df.isna().any()

 According to this output, no nulls are present; the data is relatively clean.


4. Exploratory Data Analysis
- Univariate Analysis
- Multi-variate analysis

In [None]:
#import required dependencies
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from scipy import stats


In [None]:
# Use missingno's bar chart to illustrate how complete each column of df is
msno.bar(df, figsize=(16, 5), color="green")
plt.show()

No missing values as previously indicated.

 - Univariate analysis
 Under univariate analysis, we have the frequency distribution plots, the boxplots, and features like skewness and kurtosis.

In [None]:
# Collect every column name of df into a plain Python list
col_list = df.columns.tolist()
col_list

In [None]:
# Keep only the continuous columns: "Id" and "quality" are excluded because
# they are not continuous. The list is rebuilt from df.columns instead of
# calling list.remove(), so re-running this cell does not raise ValueError.
col_list = [col for col in df.columns if col not in ("Id", "quality")]

col_list

In [None]:
#Carry out uni-variate analysis on columns in the list
import warnings
# Global filter: silences library warnings from this point on in the session
warnings.filterwarnings('ignore')

#Create figure: one row per column -> distribution | boxplot | probability plot
n_rows = len(col_list)
fig, ax = plt.subplots(n_rows, 3, figsize=(30, 8 * n_rows), squeeze=False)
for index, i in enumerate(col_list):
    # histplot replaces the deprecated distplot; stat="density" with kde=True
    # keeps the normalised histogram plus a KDE overlay
    sns.histplot(df[i], kde=True, stat="density", ax=ax[index, 0], color='green')
    # Pass the series as x= explicitly so the box is always drawn horizontally
    sns.boxplot(x=df[i], ax=ax[index, 1], color='yellow')
    stats.probplot(df[i], plot=ax[index, 2])

# Set the title before tight_layout and reserve the top strip for it so the
# title does not overlap the first row of axes
fig.suptitle("Uni-Variate Analysis of continuous variables")
fig.tight_layout(rect=(0, 0, 1, 0.98))
plt.show()

In [None]:
#For the discrete column: bar chart of how often each quality value occurs
quality_counts = df['quality'].value_counts()
quality_counts.plot(kind='bar', title='Quality')

 - Multi-variate analysis 
 
 This includes the pairplot and correlation

In [None]:
#The pairplot: pairwise relationships for every column except the last one
pairplot_columns = df.columns[:-1]
pairplot_frame = df[pairplot_columns]
sns.pairplot(pairplot_frame)
plt.show()

In [None]:
#Correlation matrix over all columns of df
correlation_matrix = df.corr()

# Rank every column by its correlation with the target, strongest positive first
quality_correlations = correlation_matrix['quality'].sort_values(ascending=False)
print(quality_correlations)

In [None]:
#Split features and target
# "Id" is a row identifier, not a property of the wine, so it is dropped
# together with the target to keep it out of the model's inputs.
X = df.drop(columns=['quality', 'Id'])
y = df['quality']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#Split the dataset into training and testing datasets
# test_size=0.2 holds out 20% of the rows; random_state pins the shuffle so
# the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

5. Preprocessing
 - Oversampling
 - Standardisation?
 - Hyperparameter tuning?
 - Other transformative actions

 - Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
#Standardize features
scaler = StandardScaler()
# Learn the scaling statistics from the training split only ...
X_train_scaled = scaler.fit_transform(X_train)
# ... and reuse them on the test split. Refitting on X_test would scale the
# two splits differently and leak test-set statistics into evaluation.
X_test_scaled = scaler.transform(X_test)
    

 - Oversampling

In [None]:
#Implement oversampling 
from imblearn.over_sampling import SMOTE

In [None]:
# Apply SMOTE to balance classes
# Only the scaled training split is resampled here; the test split is not
# passed to SMOTE. random_state pins the resampling for reproducibility.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

- Hyperparameter tuning with optuna

In [None]:
#Import choice models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import optuna
from sklearn.metrics import accuracy_score

In [None]:
#Define functions for objective and hyperparameter tuning w/ optuna
def _cv_accuracy(model, cv=3):
    """Return the mean cross-validated accuracy of `model` on the training data.

    Scoring uses folds of the resampled training set, so the held-out test
    set is never consulted while hyperparameters are being chosen.

    NOTE(review): the folds are cut after SMOTE has run, so synthetic points
    may share a fold boundary with the points they were generated from; wrap
    SMOTE and the model in an imblearn Pipeline if a stricter estimate is needed.
    """
    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(
        model,
        X_train_resampled,
        y_train_resampled,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
    )
    return scores.mean()


def objective_rf(trial):
    """Optuna objective for the random forest.

    Samples n_estimators (50-500) and max_depth (3-20) and returns the
    negated cross-validated accuracy, so the study must minimise it.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

    return -_cv_accuracy(model)  # Minimize negative accuracy


def objective_svm(trial):
    """Optuna objective for the SVM.

    Samples C (1e-3 to 1e3) and gamma (1e-4 to 1e1) on a log scale and
    returns the negated cross-validated accuracy.
    """
    # suggest_float(log=True) replaces the deprecated suggest_loguniform
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)
    gamma = trial.suggest_float('gamma', 1e-4, 1e1, log=True)

    model = SVC(C=C, gamma=gamma, random_state=42)

    return -_cv_accuracy(model)


def objective_lr(trial):
    """Optuna objective for logistic regression.

    Samples C (1e-3 to 1e3) on a log scale and returns the negated
    cross-validated accuracy.
    """
    # suggest_float(log=True) replaces the deprecated suggest_loguniform
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)

    model = LogisticRegression(C=C, random_state=42)

    return -_cv_accuracy(model)


In [None]:
#Hyperparameter tuning w/ optuna
# Each objective returns the negated accuracy, hence direction='minimize'.
# Every study gets its own seeded sampler so the sequence of sampled
# hyperparameters is reproducible, matching the fixed seed used elsewhere.
study_rf = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_rf.optimize(objective_rf, n_trials=100)

study_svm = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_svm.optimize(objective_svm, n_trials=100)

study_lr = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_lr.optimize(objective_lr, n_trials=100)


In [None]:
#Print best hyperparameters and accuracy
# The objectives return the negated accuracy, so best_value is negated back.
print("Random Forest - Best trial:")
print(study_rf.best_trial.params)
print("Accuracy:", -study_rf.best_value)

print("SVM - Best trial:")
print(study_svm.best_trial.params)
print("Accuracy:", -study_svm.best_value)

# study_lr tunes LogisticRegression, so the label is "Logistic", not "Linear"
print("Logistic Regression - Best trial:")
print(study_lr.best_trial.params)
print("Accuracy:", -study_lr.best_value)


In [None]:
# Sanity-check array shapes. X_train_resampled is printed next to
# y_train_resampled because those two are the pair passed to fit();
# X_train_scaled is the training matrix before SMOTE was applied.
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_train_resampled shape:", X_train_resampled.shape)
print("y_train_resampled shape:", y_train_resampled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

- Final modelling and predictions 

In [None]:
# Pull the winning hyperparameter dict out of each finished study
best_rf_params, best_svm_params, best_lr_params = (
    finished_study.best_params
    for finished_study in (study_rf, study_svm, study_lr)
)

In [None]:
# Rebuild each classifier from its tuned hyperparameters, all with the same seed
best_rf_model = RandomForestClassifier(random_state=42, **best_rf_params)
best_svm_model = SVC(random_state=42, **best_svm_params)
best_lr_model = LogisticRegression(random_state=42, **best_lr_params)

In [None]:
# Train the tuned models on the resampled training data
for tuned_model in (best_rf_model, best_svm_model):
    tuned_model.fit(X_train_resampled, y_train_resampled)
# Kept as the cell's final expression so the notebook still displays it
best_lr_model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predict the scaled test split with each tuned model, in rf / svm / lr order
final_rf_preds, final_svm_preds, final_lr_preds = (
    tuned.predict(X_test_scaled)
    for tuned in (best_rf_model, best_svm_model, best_lr_model)
)

In [None]:
from sklearn.metrics import classification_report

# Generate classification reports (per-class precision / recall / F1) by
# comparing each model's test-set predictions with the true labels
rf_classification_report = classification_report(y_test, final_rf_preds)
svm_classification_report = classification_report(y_test, final_svm_preds)
lr_classification_report = classification_report(y_test, final_lr_preds)

# Print classification reports
print("Random Forest Classification Report:\n", rf_classification_report)
print("SVM Classification Report:\n", svm_classification_report)
# The predictions come from LogisticRegression, so the label says "Logistic"
print("Logistic Regression Classification Report:\n", lr_classification_report)