In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

In [None]:
# Loading data
# The UCI wine-quality files are semicolon-delimited but use "." as the decimal
# mark. read_csv2() assumes "," as the decimal mark and "." as the grouping
# mark, which mis-parses the measurements, so the delimiter is set explicitly
# with read_delim() and the default "." decimal mark is kept.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
red_wine <- read_delim(file = url, delim = ";", col_names = TRUE)
red_wine

url2 <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
white_wine <- read_delim(file = url2, delim = ";", col_names = TRUE)
white_wine

Selecting our parameters

We could start by plotting each variable against the quality/classification metric we've created; the variables with the strongest relationship to that metric are likely to be the most important parameters for our model.

A more precise approach would be to use feature selection, which ranks the variables by how relevant they are to predicting quality.

In [None]:
## Data Cleaning ##
# Make the column names syntactically valid R names
# (e.g. "volatile acidity" becomes "volatile.acidity").
colnames(red_wine) <- make.names(colnames(red_wine))
colnames(white_wine) <- make.names(colnames(white_wine))

# Measurement columns that are coerced to numeric. The same list is used for
# both wine types so the two tables end up with matching column types.
# (The earlier version wrote the red-wine density to a misspelled column,
# `densiy`, leaving `density` unconverted, and skipped `residual.sugar` for
# red wine only.)
numeric_cols <- c(
  "chlorides",
  "volatile.acidity",
  "citric.acid",
  "residual.sugar",
  "density",
  "sulphates"
)

# as.character() first so that a factor column would be converted via its
# labels rather than its integer codes.
to_numeric <- function(x) as.numeric(as.character(x))

red_wine <- red_wine %>%
  mutate(across(all_of(numeric_cols), to_numeric))

white_wine <- white_wine %>%
  mutate(across(all_of(numeric_cols), to_numeric))

# Extracting the columns we want
# Keeps the chosen predictor columns plus `quality` for each wine type.
# NOTE(review): the `...` arguments below look like placeholders for predictor
# columns that have not been chosen yet -- replace them with real column names
# before running this cell.
redwine <- red_wine %>%
select(..., ..., ..., quality)

whitewine <- white_wine %>%
select(..., ..., ..., quality)

# Derive a binary quality label for each wine type:
# 0 when quality is below 6, 1 when quality is 6 or higher.
white_wine_binary <- mutate(
  whitewine,
  binary_quality = if_else(quality >= 6, 1, 0)
)

red_wine_binary <- mutate(
  redwine,
  binary_quality = if_else(quality >= 6, 1, 0)
)

# Display both labelled tables.
white_wine_binary
red_wine_binary

In [None]:
# Split each wine table into a training set (74% of rows) and a testing set,
# stratified on the original `quality` score.
# NOTE(review): no random seed is set in this cell, so the split may differ
# between runs unless a seed is set elsewhere.
redwine_split <- red_wine_binary %>%
  initial_split(prop = 0.74, strata = quality)
redwine_train <- redwine_split %>% training()
redwine_test <- redwine_split %>% testing()

whitewine_split <- white_wine_binary %>%
  initial_split(prop = 0.74, strata = quality)
whitewine_train <- whitewine_split %>% training()
whitewine_test <- whitewine_split %>% testing()

In [None]:
## Visualizing binary quality distributions
#white_wine_plot <- ggplot(white_wine, aes(x=quality)) + geom_histogram(binwidth=0.5)
#white_wine_plot

#white_wine_plot_binary <- ggplot(white_wine_binary, aes(x=binary_quality)) + geom_histogram(binwidth=0.5)
#white_wine_plot_binary

#red_wine_plot <- ggplot(red_wine, aes(x=quality)) + geom_histogram(binwidth=0.5)
#red_wine_plot

#red_wine_plot_binary <- ggplot(red_wine_binary, aes(x=binary_quality)) + geom_histogram(binwidth=0.5)
#red_wine_plot_binary

In [None]:
# Visualizing distributions with three categorical variables
#white_wine_filtered <- white_wine %>%
#    filter(quality >= 5, quality <= 7)

#red_wine_filtered <- red_wine %>%
#    filter(quality >= 5, quality <= 7)

#white_wine_filtered_plot <- ggplot(white_wine_filtered, aes(x=quality)) + geom_histogram(binwidth=0.5)
#white_wine_filtered_plot

#red_wine_filtered_plot <- ggplot(red_wine_filtered, aes(x=quality)) + geom_histogram(binwidth=0.5)
#red_wine_filtered_plot

In [None]:
## Creating summary tables and visualizations
# For each training set: show the per-column summary, then store the
# per-column summaries bound side by side (one column per variable).
summary(redwine_train)
redwine_train_summary <- redwine_train %>%
  map(summary) %>%
  do.call(what = cbind)

summary(whitewine_train)
whitewine_train_summary <- whitewine_train %>%
  map(summary) %>%
  do.call(what = cbind)


# Bar chart of a white-wine training-set variable against the binary quality
# label; stat = "identity" makes the bars use the y values as given instead of
# row counts.
# NOTE(review): the `...` in the plot name, the y aesthetic and the y-axis
# label look like unfinished placeholders -- fill them in before running.
...plot <- ggplot(whitewine_train, aes(x = binary_quality, y = ...)) + 
                         geom_bar(stat = "identity") +
                             xlab("Quality") +
                             ylab("...")